twitter_cldr_js 2.3.2 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -5
- data/History.txt +8 -0
- data/README.md +69 -1
- data/Rakefile +0 -9
- data/lib/assets/javascripts/twitter_cldr/af.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/ar.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/be.js +2044 -144
- data/lib/assets/javascripts/twitter_cldr/bg.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/bn.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/ca.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/cs.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/cy.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/da.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/de-CH.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/de.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/el.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-150.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-AU.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-CA.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-GB.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-IE.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-SG.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en-ZA.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/en.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/es-419.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/es-CO.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/es-MX.js +2044 -144
- data/lib/assets/javascripts/twitter_cldr/es-US.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/es.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/eu.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/fa.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/fi.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/fil.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/fr-BE.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/fr-CA.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/fr-CH.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/fr.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/ga.js +2044 -144
- data/lib/assets/javascripts/twitter_cldr/gl.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/he.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/hi.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/hr.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/hu.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/id.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/is.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/it-CH.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/it.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/ja.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/ko.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/lv.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/msa.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/nl.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/no.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/pl.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/pt.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/ro.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/ru.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/sk.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/sq.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/sr.js +2043 -143
- data/lib/assets/javascripts/twitter_cldr/sv.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/ta.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/th.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/tr.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/uk.js +2044 -144
- data/lib/assets/javascripts/twitter_cldr/ur.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/vi.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/zh-cn.js +2042 -142
- data/lib/assets/javascripts/twitter_cldr/zh-tw.js +2042 -142
- data/lib/twitter_cldr/js/compiler.rb +26 -5
- data/lib/twitter_cldr/js/mustache/calendars/datetime.coffee +1 -4
- data/lib/twitter_cldr/js/mustache/numbers/numbers.coffee +10 -4
- data/lib/twitter_cldr/js/mustache/parsers/parser.coffee +32 -0
- data/lib/twitter_cldr/js/mustache/parsers/segmentation_parser.coffee +89 -0
- data/lib/twitter_cldr/js/mustache/parsers/symbol_table.coffee +14 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex/character_class.coffee +51 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex/character_range.coffee +19 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex/character_set.coffee +36 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex/component.coffee +48 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex/literal.coffee +44 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex/unicode_string.coffee +23 -0
- data/lib/twitter_cldr/js/mustache/parsers/unicode_regex_parser.coffee +189 -0
- data/lib/twitter_cldr/js/mustache/plurals/rules.coffee +7 -5
- data/lib/twitter_cldr/js/mustache/shared/break_iterator.coffee +148 -0
- data/lib/twitter_cldr/js/mustache/shared/code_point.coffee +121 -0
- data/lib/twitter_cldr/js/mustache/shared/unicode_regex.coffee +41 -0
- data/lib/twitter_cldr/js/mustache/tokenizers/composite_token.coffee +11 -0
- data/lib/twitter_cldr/js/mustache/tokenizers/segmentation_tokenizer.coffee +24 -0
- data/lib/twitter_cldr/js/mustache/tokenizers/token.coffee +14 -0
- data/lib/twitter_cldr/js/mustache/tokenizers/tokenizer.coffee +83 -0
- data/lib/twitter_cldr/js/mustache/tokenizers/unicode_regex/unicode_regex_tokenizer.coffee +39 -0
- data/lib/twitter_cldr/js/mustache/utilities.coffee +45 -0
- data/lib/twitter_cldr/js/mustache/utils/code_points.coffee +23 -0
- data/lib/twitter_cldr/js/mustache/utils/range.coffee +16 -0
- data/lib/twitter_cldr/js/mustache/utils/range_set.coffee +195 -0
- data/lib/twitter_cldr/js/renderers.rb +39 -10
- data/lib/twitter_cldr/js/renderers/calendars/timespan_renderer.rb +1 -1
- data/lib/twitter_cldr/js/renderers/numbers/numbers_renderer.rb +16 -9
- data/lib/twitter_cldr/js/renderers/parsers/parser.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/segmentation_parser.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/symbol_table.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex/character_class.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex/character_range.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex/character_set.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex/component.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex/literal.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex/unicode_string.rb +18 -0
- data/lib/twitter_cldr/js/renderers/parsers/unicode_regex_parser.rb +18 -0
- data/lib/twitter_cldr/js/renderers/plurals/rules/plural_rules_renderer.rb +27 -28
- data/lib/twitter_cldr/js/renderers/shared/break_iterator_renderer.rb +50 -0
- data/lib/twitter_cldr/js/renderers/shared/code_point_renderer.rb +103 -0
- data/lib/twitter_cldr/js/renderers/shared/unicode_regex_renderer.rb +18 -0
- data/lib/twitter_cldr/js/renderers/tokenizers/composite_token.rb +18 -0
- data/lib/twitter_cldr/js/renderers/tokenizers/segmentation_tokenizer.rb +18 -0
- data/lib/twitter_cldr/js/renderers/tokenizers/token.rb +18 -0
- data/lib/twitter_cldr/js/renderers/tokenizers/tokenizer.rb +18 -0
- data/lib/twitter_cldr/js/renderers/tokenizers/unicode_regex/unicode_regex_tokenizer.rb +18 -0
- data/lib/twitter_cldr/js/renderers/utils/code_points.rb +18 -0
- data/lib/twitter_cldr/js/renderers/utils/range.rb +18 -0
- data/lib/twitter_cldr/js/renderers/utils/range_set.rb +18 -0
- data/lib/twitter_cldr/js/tasks/tasks.rb +1 -1
- data/lib/twitter_cldr/js/version.rb +1 -1
- data/spec/js/calendars/datetime.ru.spec.js +17 -0
- data/spec/js/calendars/timespan.ru.spec.js +20 -0
- data/spec/js/numbers/abbreviated/abbreviated_number.spec.js +5 -5
- data/spec/js/numbers/abbreviated/long_decimal.ru.spec.js +24 -0
- data/spec/js/numbers/currency.spec.js +1 -1
- data/spec/js/parsers/parser.spec.js +74 -0
- data/spec/js/parsers/segmentation_parser.spec.js +67 -0
- data/spec/js/parsers/symbol_table.spec.js +20 -0
- data/spec/js/parsers/unicode_regex/character_class.spec.js +121 -0
- data/spec/js/parsers/unicode_regex/character_range.spec.js +17 -0
- data/spec/js/parsers/unicode_regex/character_set.spec.js +17 -0
- data/spec/js/parsers/unicode_regex/literal.spec.js +30 -0
- data/spec/js/parsers/unicode_regex/unicode_string.spec.js +17 -0
- data/spec/js/parsers/unicode_regex_parser.spec.js +76 -0
- data/spec/js/plurals/plural_rules.spec.js +21 -0
- data/spec/js/shared/break_iterator.spec.js +68 -0
- data/spec/js/shared/code_point.spec.js +89 -0
- data/spec/js/shared/unicode_regex.spec.js +201 -0
- data/spec/js/tokenizers/composite_token.spec.js +28 -0
- data/spec/js/tokenizers/segmentation_tokenizer.spec.js +22 -0
- data/spec/js/tokenizers/token.spec.js +25 -0
- data/spec/js/tokenizers/unicode_regex/unicode_regex_tokenizer.spec.js +163 -0
- data/spec/js/utilities.spec.js +47 -0
- data/spec/js/utils/code_points.spec.js +49 -0
- data/spec/js/utils/range_set.spec.js +248 -0
- data/twitter_cldr_js.gemspec +8 -6
- metadata +128 -34
- data/lib/twitter_cldr/js/renderers/plurals/rules/plural_rules_compiler.rb +0 -93
- data/spec/ruby/renderers/plurals/plural_rules_compiler_spec.rb +0 -56
- data/spec/ruby/spec_helper.rb +0 -11
@@ -0,0 +1,23 @@
|
|
1
|
+
# Copyright 2012 Twitter, Inc
|
2
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
3
|
+
|
4
|
+
class TwitterCldr.UnicodeString extends TwitterCldr.Component
  # Regex component holding the code points of a literal string.
  #
  # codepoints - stored as-is on the instance; the methods below treat it as
  #              an array of code points (to_regexp_str also tolerates a
  #              single non-array value).
  constructor : (@codepoints) ->
    super

  # Returns a RangeSet containing exactly one Range for this string.
  to_set : ->
    # If the number of codepoints is greater than 1, treat them as a
    # group (eg. multichar string). This is definitely a hack in that
    # it means there has to be special logic in RangeSet that deals
    # with data types that aren't true integer ranges. I can't think
    # of any other way to support multichar strings :(
    endpoint = if @codepoints.length > 1 then @codepoints else @codepoints[0]
    range = new TwitterCldr.Range(endpoint, endpoint)
    new TwitterCldr.RangeSet([range])

  # Returns the regex source for this string by handing the code points
  # (wrapped in an array when they are not one already) to array_to_regex,
  # which is not defined in this class (presumably inherited from Component).
  to_regexp_str : ->
    return @array_to_regex(@codepoints) if @codepoints instanceof Array
    @array_to_regex([@codepoints])
|
23
|
+
|
@@ -0,0 +1,189 @@
|
|
1
|
+
# Copyright 2012 Twitter, Inc
|
2
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
3
|
+
|
4
|
+
class TwitterCldr.UnicodeRegexParser extends TwitterCldr.Parser

  # Sets up the token-type tables consulted by the predicates below.
  constructor : ->
    # Types that are allowed to be used in character ranges.
    @character_class_token_types = [
      "variable"
      "character_set"
      "negated_character_set"
      "unicode_char"
      "multichar_string"
      "string"
      "escaped_character"
      "character_range"
    ]

    @negated_token_types = ["negated_character_set"]
    @binary_operators = ["pipe", "ampersand", "dash", "union"]
    @unary_operators = ["negate"]

  # Expands variables (when options.symbol_table is given), preprocesses the
  # token stream and then delegates to the parent class' parse.
  parse : (tokens, options = {}) ->
    expanded = @substitute_variables(tokens, options.symbol_table)
    super(@preprocess(expanded), options)

  # Builds a Token with the given type and (possibly undefined) value.
  make_token : (type, value) ->
    new TwitterCldr.Token(type: type, value: value)

  # Identifies regex ranges and makes implicit operators explicit
  preprocess : (tokens) ->
    output = []
    index = 0

    while index < tokens.length
      current = tokens[index]
      previous = output[output.length - 1]

      # Character class entities side-by-side are treated as unions. So
      # are side-by-side character classes. Add a special placeholder token
      # to help out the expression parser.
      follows_operand = @is_valid_character_class_token(previous) and current.type != "close_bracket"
      adjacent_classes = previous? and previous.type == "close_bracket" and current.type == "open_bracket"
      output.push(@make_token("union")) if follows_operand or adjacent_classes

      # operand, dash, operand => collapse the three tokens into one range
      upper = tokens[index + 2]
      spans_range = @is_valid_character_class_token(current) and
        @is_valid_character_class_token(upper) and
        tokens[index + 1].type == "dash"

      if spans_range
        initial = @[current.type](current)
        final = @[upper.type](upper)
        output.push(@make_character_range(initial, final))
        index += 3
        continue

      if @is_negated_token(current)
        # wrap negated sets as "[ negate <token> ]" so they parse as a class
        output.push(
          @make_token("open_bracket"),
          @make_token("negate"),
          current,
          @make_token("close_bracket")
        )
      else
        output.push(current)

      index += 1

    output

  # Replaces "variable" tokens with the token lists stored for them in
  # symbol_table; tokens are returned untouched when no table is supplied.
  substitute_variables : (tokens, symbol_table) ->
    return tokens unless symbol_table?

    expanded = []
    for token in tokens
      replacement = if token.type == "variable" then symbol_table.fetch(token.value) else null
      if replacement?
        # variables can themselves contain references to other variables
        # note: this could be cached somehow
        expanded = expanded.concat(@substitute_variables(replacement, symbol_table))
      else
        expanded.push(token)

    expanded

  make_character_range : (initial, final) ->
    new TwitterCldr.CharacterRange(initial, final)

  is_negated_token : (token) ->
    return false unless token?
    token.type in @negated_token_types

  is_valid_character_class_token : (token) ->
    return false unless token?
    token.type in @character_class_token_types

  is_unary_operator : (token) ->
    return false unless token?
    token.type in @unary_operators

  is_binary_operator : (token) ->
    return false unless token?
    token.type in @binary_operators

  # Walks the token stream and returns the list of parsed elements.
  # "union" tokens outside of a character class are consumed and dropped.
  do_parse : (options) ->
    parsed = []

    while (token = @current_token())
      if token.type is "open_bracket"
        parsed.push(@character_class())
      else if token.type is "union"
        @next_token("union")
      else
        # dispatch to the handler method named after the token type
        parsed.push(@[token.type](token))
        @next_token(token.type)

    parsed

  character_set : (token) ->
    name = token.value.replace(/^\\p/g, "")
    name = name.replace(/[\{\}\[\]:]/g, "")
    new TwitterCldr.CharacterSet(name)

  negated_character_set : (token) ->
    name = token.value.replace(/^\\[pP]/g, "")
    name = name.replace(/[\{\}\[\]:^]/g, "")
    new TwitterCldr.CharacterSet(name)

  unicode_char : (token) ->
    hex = token.value.replace(/^\\u/g, "").replace(/[\{\}]/g, "")
    new TwitterCldr.UnicodeString([parseInt(hex, 16)])

  string : (token) ->
    codepoints = TwitterCldr.Utilities.unpack_string(token.value)
    new TwitterCldr.UnicodeString(codepoints)

  multichar_string : (token) ->
    text = token.value.replace(/[\{\}]/g, "")
    new TwitterCldr.UnicodeString(TwitterCldr.Utilities.unpack_string(text))

  escaped_character : (token) ->
    new TwitterCldr.Literal(token.value)

  special_char : (token) ->
    new TwitterCldr.Literal(token.value)

  negate : (token) ->
    @special_char(token)

  pipe : (token) ->
    @special_char(token)

  ampersand : (token) ->
    @special_char(token)

  # current_token is already a CharacterRange object
  character_range : (token) ->
    token

  # Parses one bracketed character class starting at the current token.
  # Operators and brackets go on one stack and operands on another; each
  # closing bracket folds the operators above its matching opener into nodes.
  character_class : ->
    operators = []
    operands = []
    depth = 0

    loop
      token = @current_token()

      if token.type in TwitterCldr.CharacterClass.closing_types()
        depth -= 1
        top = @peek(operators)

        until top.type is TwitterCldr.CharacterClass.opening_type_for(token.type)
          operator = operators.pop()

          if @is_unary_operator(operator)
            node = @unary_operator_node(operator.type, operands.pop())
          else
            # first pop is the right operand, second pop is the left one
            right = operands.pop()
            left = operands.pop()
            node = @binary_operator_node(operator.type, right, left)

          operands.push(node)
          top = @peek(operators)

        # discard the matching opening bracket
        operators.pop()

      else if token.type in TwitterCldr.CharacterClass.opening_types()
        depth += 1
        operators.push(token)

      else if @is_unary_operator(token) or @is_binary_operator(token)
        operators.push(token)

      else
        operands.push(@[token.type](token))

      @next_token(token.type)

      break if operators.length is 0 and depth is 0

    new TwitterCldr.CharacterClass(operands.pop())

  # Returns the last element of array without removing it.
  peek : (array) ->
    array[array.length - 1]

  # Note the argument order: right operand first, then left.
  binary_operator_node : (operator, right, left) ->
    new TwitterCldr.CharacterClass.BinaryOperator(operator, left, right)

  unary_operator_node : (operator, child) ->
    new TwitterCldr.CharacterClass.UnaryOperator(operator, child)
|
@@ -3,12 +3,14 @@
|
|
3
3
|
|
4
4
|
class TwitterCldr.PluralRules
|
5
5
|
@rules = `{{{rules}}}`
|
6
|
+
@runtime = `{{{runtime}}}`
|
7
|
+
@names = {{{names}}}
|
6
8
|
|
7
|
-
@all: ->
|
8
|
-
return @
|
9
|
+
@all: (type = 'cardinal') ->
|
10
|
+
return @names[type]
|
9
11
|
|
10
|
-
@rule_for: (number) ->
|
12
|
+
@rule_for: (number, type = 'cardinal') ->
|
11
13
|
try
|
12
|
-
return @rules.
|
14
|
+
return @rules[type](number.toString(), @runtime)
|
13
15
|
catch error
|
14
|
-
return "other"
|
16
|
+
return "other"
|
@@ -0,0 +1,148 @@
|
|
1
|
+
# Copyright 2012 Twitter, Inc
|
2
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
3
|
+
|
4
|
+
class TwitterCldr.BreakIterator
  # Splits text into segments using rule data injected at build time through
  # the mustache placeholders below.
  #
  # locale  - key used to look up tailorings and exceptions
  #           (defaults to TwitterCldr.locale).
  # options - "use_uli_exceptions": exceptions are applied unless this is
  #           explicitly set to something other than true.
  constructor : (locale = TwitterCldr.locale, options = {}) ->
    @locale = locale
    @use_uli_exceptions = (if options["use_uli_exceptions"]? then options["use_uli_exceptions"] else true)
    @exceptions_cache = {}
    @segmentation_tokenizer = new TwitterCldr.SegmentationTokenizer()
    @segmentation_parser = new TwitterCldr.SegmentationParser()
    @tailoring_resource_data = `{{{tailoring_resource_data}}}`
    @exceptions_resource_data = `{{{exceptions_resource_data}}}`
    @root_resource = `{{{root_resource_data}}}`

  # Returns the sentences found in str; block (optional) is called with each one.
  each_sentence : (str, block) ->
    @each_boundary(str, "sentence", block)

  each_word : (str, block) ->
    throw "Word segmentation is not currently supported."

  each_line : (str, block) ->
    throw "Line segmentation is not currently supported."

  # Converts a boundary type such as "sentence" into the resource key
  # "SentenceBreak": the first letter and every letter following an
  # underscore are upper-cased, the underscores are dropped, and "Break"
  # is appended.
  boundary_name_for: (str) ->
    # The `g` flag and the captured letter are both required: without them
    # only the first match is touched and underscores are left in place.
    str.replace(/(?:^|\_)([A-Za-z])/g, (match, letter) ->
      letter.toUpperCase()
    ) + "Break"

  # Splits str at the boundaries of the given type.
  #
  # str           - text to segment.
  # boundary_type - e.g. "sentence".
  # block         - optional callback invoked with every segment.
  #
  # Returns the array of segments. Throws when none of the compiled rules
  # matches the remaining text.
  each_boundary : (str, boundary_type, block) ->
    rules = @compile_rules_for(@locale, boundary_type)
    match = null
    last_offset = 0
    current_position = 0
    search_str = str
    result = []

    while search_str.length isnt 0
      # the first rule that matches the remaining text wins
      rule = null
      for r in rules
        match = r.match(search_str)
        if match?
          rule = r
          break

      # Fail with a readable message instead of a TypeError on `rule`.
      unless rule?
        throw "No segmentation rule matched at offset " + current_position

      if rule.boundary_symbol is "break"
        break_offset = current_position + match.boundary_offset
        result.push(str.slice(last_offset, break_offset))
        block(result[result.length - 1]) if block?
        last_offset = break_offset

      search_str = search_str.slice(match.boundary_offset)
      current_position += match.boundary_offset

    # Emit whatever follows the last break. Comparing against str.length
    # (not str.length - 1) keeps a trailing one-character segment.
    if last_offset < str.length
      remainder = str.slice(last_offset)
      result.push(remainder)
      block(remainder) if block?

    result

  # Builds (and caches per locale/boundary type) a rule whose left-hand side
  # is the alternation of all exceptions for the locale, followed by the
  # U+00D7 symbol. Only "sentence" boundaries are handled; for any other
  # type nothing is returned.
  compile_exception_rule_for : (locale, boundary_type, boundary_name) ->
    return unless boundary_type is "sentence"

    cache_key = TwitterCldr.Utilities.compute_cache_key([locale, boundary_type])

    # Skip rebuilding the (potentially long) alternation on a cache hit.
    cached = @exceptions_cache[cache_key]
    return cached if cached

    exceptions = @exceptions_for(locale, boundary_name)
    regex_contents = (TwitterCldr.Utilities.regex_escape(exception) for exception in exceptions).join("|")
    @exceptions_cache[cache_key] = @segmentation_parser.parse(
      @segmentation_tokenizer.tokenize("(?:"+regex_contents+") \u00D7")
    )

  # Grabs rules from segment_root, applies custom tailorings (our own, NOT from CLDR),
  # and optionally integrates ULI exceptions.
  compile_rules_for : (locale, boundary_type) ->
    boundary_name = @boundary_name_for(boundary_type)
    boundary_data = @resource_for(boundary_name)
    symbol_table = @symbol_table_for(boundary_data)
    root_rules = @rules_for(boundary_data, symbol_table)
    tailoring_boundary_data = @tailoring_resource_for(locale, boundary_name)
    tailoring_rules = @rules_for(tailoring_boundary_data, symbol_table)
    rules = @merge_rules(root_rules, tailoring_rules)

    if @use_uli_exceptions is true
      exception_rule = @compile_exception_rule_for(locale, boundary_type, boundary_name)
      # No exception rule exists for non-sentence boundaries; never put
      # `undefined` at the head of the rule list.
      rules.unshift(exception_rule) if exception_rule?

    rules

  # replaces ruleset1's rules with rules with the same id from ruleset2
  merge_rules : (ruleset1, ruleset2) ->
    result = []
    TwitterCldr.Utilities.arraycopy ruleset1, 0, result, 0, ruleset1.length

    for i in [0...ruleset2.length] by 1
      for j in [0...result.length] by 1
        if ruleset2[i].id == result[j].id
          result[j] = ruleset2[i]

    result

  # Builds a SymbolTable from boundary_data.variables, tokenizing each value
  # and expanding references to variables that were defined earlier.
  symbol_table_for : (boundary_data) ->
    table = new TwitterCldr.SymbolTable()

    for i in [0...boundary_data.variables.length] by 1
      variable = boundary_data.variables[i]
      id = variable.id.toString()
      tokens = @segmentation_tokenizer.tokenize(variable.value)
      # note: variables can be redefined (add replaces if key already exists)
      table.add(id, @resolve_symbols(tokens, table))

    table

  # Returns tokens with every "variable" token replaced by whatever
  # symbol_table.fetch returns for it.
  resolve_symbols : (tokens, symbol_table) ->
    result = []

    for i in [0...tokens.length]
      token = tokens[i]
      if token.type == "variable"
        result = result.concat(symbol_table.fetch(token.value))
      else
        result.push(token)

    result

  # Parses every rule in boundary_data.rules and tags the parsed rule with
  # the original rule text (`string`) and its `id`.
  rules_for : (boundary_data, symbol_table) ->
    results = []
    for rule in boundary_data.rules
      r = @segmentation_parser.parse(
        @segmentation_tokenizer.tokenize(rule.value), {"symbol_table" : symbol_table}
      )
      r.string = rule.value
      r.id = rule.id
      results.push(r)

    results

  resource_for : (boundary_name) ->
    @root_resource["segments"][boundary_name]

  tailoring_resource_for : (locale, boundary_name) ->
    @tailoring_resource_data[locale][locale]["segments"][boundary_name]

  # Returns the exception strings for locale, or [] when none are listed.
  # boundary_name is accepted but not used.
  exceptions_for : (locale, boundary_name) ->
    result = @exceptions_resource_data[locale][locale]["exceptions"]
    if result? then result else []
|
@@ -0,0 +1,121 @@
|
|
1
|
+
# Copyright 2012 Twitter, Inc
|
2
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
3
|
+
|
4
|
+
class TwitterCldr.CodePoint
  # Names of the values in the `fields` array given to the constructor,
  # in positional order.
  @code_point_fields = [
    "code_point"
    "name"
    "category"
    "combining_class"
    "bidi_class"
    "decomposition"
    "digit_value"
    "non_decimal_digit_value"
    "numeric_value"
    "bidi_mirrored"
    "unicode1_name"
    "iso_comment"
    "simple_uppercase_map"
    "simple_lowercase_map"
    "simple_titlecase_map"
  ]

  # Position of "decomposition" in @code_point_fields.
  decomposition_data_index = 5
  # Optional "<tag>" prefix (group 1) followed by the optional mapping (group 2).
  decomposition_regex = /^(?:<(.+)>\s+)?(.+)?$/
  @indices = ["category", "bidi_class", "bidi_mirrored"]
  @properties = ["sentence_break", "line_break", "word_break"]

  # Copies every positional field onto the instance under its name.
  # "decomposition" is skipped so the raw value does not shadow the
  # decomposition() method; it stays reachable through @fields.
  constructor : (@fields) ->
    for i in [0...TwitterCldr.CodePoint.code_point_fields.length] by 1
      field = TwitterCldr.CodePoint.code_point_fields[i]
      unless field is "decomposition"
        @[field] = @fields[i]

  # Returns the decomposition mapping as an array of integers parsed from
  # whitespace-separated hex values, or null when there is no mapping.
  # Throws when the raw field does not match the expected format.
  decomposition : ->
    decomp = @fields[decomposition_data_index]
    match = decomp.match(decomposition_regex)
    if match?
      if match[2]?
        return (parseInt(s, 16) for s in match[2].match(/\S+/g))
      else
        return null
    else
      throw "decomposition " + decomp + " has invalid format"

  # Returns the text between "<" and ">" at the start of the raw
  # decomposition field, or null when no such tag is present.
  compatibility_decomposition_tag : ->
    decomp = @fields[decomposition_data_index]
    if (match = decomp.match(decomposition_regex))
      if match[1]? then return match[1] else return null
    else
      throw "decomposition " + decomp + " has invalid format"

  is_compatibility_decomposition : ->
    return @compatibility_decomposition_tag()?

  # Returns the ranges stored for `value` under the given property.
  # Throws when property_name is not present in the property data.
  @code_points_for_property : (property_name, value) ->
    # get_property_data always returns an object (an empty one for unknown
    # names), so the existence check must look at the raw data instead.
    unless @property_data[property_name]?
      throw "Couldn't find property " + property_name

    @get_property_data(property_name)[value]

  # Search for code points wherein at least one property value contains prop_value.
  # For example, if prop_value is set to "Zs", this method will return all code
  # points that are considered spaces. If prop value is simply "Z", this method
  # will return all code points who have a property value that contains "Z", i.e.
  # spaces as well as line separators ("Zl") and paragraph separators ("Zp").
  @code_points_for_property_value : (prop_value) ->
    if @index_key_cache[prop_value]?
      return @index_key_cache[prop_value]

    result = []
    for index_key, index_names of @index_keys
      if index_key.indexOf(prop_value) > -1
        for index_name in index_names
          result = result.concat(@get_index(index_name)[index_key])

    @index_key_cache[prop_value] = result

  @index_key_cache = {}

  @index_keys = `{{{index_keys}}}`

  @index_data = `{{{index_data}}}`

  # Returns the named index with every [first, last] pair converted into a
  # TwitterCldr.Range; the converted index is cached per name.
  @get_index : (index_name) ->
    return @index_cache[index_name] if @index_cache[index_name]?
    index_data_formatted = {}
    for key, ranges of @index_data[index_name]
      index_data_formatted[key] = []
      for range in ranges
        index_data_formatted[key].push(new TwitterCldr.Range(range[0], range[1]))

    @index_cache[index_name] = index_data_formatted

  @property_data = `{{{property_data}}}`

  # Same conversion as get_index, applied to the property data. Returns an
  # empty object for a property name that has no data.
  @get_property_data : (property_name) ->
    return @property_data_cache[property_name] if @property_data_cache[property_name]?
    property_data_formatted = {}
    for key, ranges of @property_data[property_name]
      property_data_formatted[key] = []
      for range in ranges
        property_data_formatted[key].push(new TwitterCldr.Range(range[0], range[1]))

    @property_data_cache[property_name] = property_data_formatted

  @index_cache = {}

  @property_data_cache = {}

  # Returns the key in @blocks whose range includes code_point, or null.
  # NOTE(review): neither @blocks nor @block_cache is assigned anywhere in
  # this class; the cache is therefore created on first use, and the lookup
  # returns null until @blocks is populated by other code — confirm where
  # block data is supposed to come from.
  @get_block_name : (code_point) ->
    @block_cache ?= {}
    if @block_cache[code_point]?
      return @block_cache[code_point]
    for k, range of @blocks
      range = new TwitterCldr.Range(range[0], range[1])
      if range.includes(code_point)
        return @block_cache[code_point] = k
    return null
|