twitter_cldr 3.0.1 → 3.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -2
- data/History.txt +4 -0
- data/README.md +17 -6
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +12 -1
- data/lib/twitter_cldr/resources/regexp_ast_generator.rb +41 -0
- data/lib/twitter_cldr/resources.rb +1 -0
- data/lib/twitter_cldr/shared/postal_code_generator.rb +50 -0
- data/lib/twitter_cldr/shared/postal_codes.rb +48 -9
- data/lib/twitter_cldr/shared.rb +15 -14
- data/lib/twitter_cldr/utils/regexp_ast.rb +115 -0
- data/lib/twitter_cldr/utils/regexp_sampler.rb +149 -0
- data/lib/twitter_cldr/utils.rb +5 -3
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/postal_codes.yml +1442 -159
- data/spec/bidi/bidi_spec.rb +1 -1
- data/spec/collation/collation_spec.rb +1 -1
- data/spec/collation/collator_spec.rb +31 -31
- data/spec/collation/implicit_collation_elements_spec.rb +6 -6
- data/spec/collation/sort_key_builder_spec.rb +28 -26
- data/spec/collation/tailoring_spec.rb +1 -1
- data/spec/collation/trie_builder_spec.rb +16 -16
- data/spec/collation/trie_dumps_spec.rb +2 -2
- data/spec/collation/trie_loader_spec.rb +8 -8
- data/spec/collation/trie_spec.rb +61 -61
- data/spec/collation/trie_with_fallback_spec.rb +5 -5
- data/spec/core_ext_spec.rb +1 -1
- data/spec/data_readers/additional_date_format_selector_spec.rb +38 -38
- data/spec/data_readers/date_time_data_reader_spec.rb +2 -2
- data/spec/data_readers/number_data_reader_spec.rb +1 -1
- data/spec/formatters/calendars/datetime_formatter_spec.rb +218 -218
- data/spec/formatters/list_formatter_spec.rb +8 -8
- data/spec/formatters/numbers/abbreviated/abbreviated_number_formatter_spec.rb +14 -14
- data/spec/formatters/numbers/abbreviated/long_decimal_formatter_spec.rb +4 -4
- data/spec/formatters/numbers/abbreviated/short_decimal_formatter_spec.rb +4 -4
- data/spec/formatters/numbers/currency_formatter_spec.rb +11 -11
- data/spec/formatters/numbers/decimal_formatter_spec.rb +3 -3
- data/spec/formatters/numbers/helpers/fraction_spec.rb +3 -3
- data/spec/formatters/numbers/helpers/integer_spec.rb +16 -16
- data/spec/formatters/numbers/number_formatter_spec.rb +21 -21
- data/spec/formatters/numbers/percent_formatter_spec.rb +3 -3
- data/spec/formatters/numbers/rbnf/rbnf_spec.rb +2 -2
- data/spec/formatters/plurals/plural_formatter_spec.rb +41 -41
- data/spec/formatters/plurals/rules_spec.rb +13 -13
- data/spec/localized/localized_array_spec.rb +12 -12
- data/spec/localized/localized_date_spec.rb +33 -33
- data/spec/localized/localized_datetime_spec.rb +11 -11
- data/spec/localized/localized_hash_spec.rb +4 -4
- data/spec/localized/localized_number_spec.rb +36 -36
- data/spec/localized/localized_object_spec.rb +8 -8
- data/spec/localized/localized_string_spec.rb +53 -53
- data/spec/localized/localized_symbol_spec.rb +9 -9
- data/spec/localized/localized_time_spec.rb +10 -10
- data/spec/localized/localized_timespan_spec.rb +8 -8
- data/spec/normalization_spec.rb +6 -6
- data/spec/parsers/number_parser_spec.rb +36 -36
- data/spec/parsers/parser_spec.rb +5 -5
- data/spec/parsers/segmentation_parser_spec.rb +19 -19
- data/spec/parsers/symbol_table_spec.rb +4 -4
- data/spec/parsers/unicode_regex/character_class_spec.rb +19 -19
- data/spec/parsers/unicode_regex/character_range_spec.rb +1 -1
- data/spec/parsers/unicode_regex/character_set_spec.rb +8 -8
- data/spec/parsers/unicode_regex/literal_spec.rb +5 -5
- data/spec/parsers/unicode_regex/unicode_string_spec.rb +2 -2
- data/spec/parsers/unicode_regex_parser_spec.rb +28 -28
- data/spec/resources/loader_spec.rb +32 -32
- data/spec/shared/break_iterator_spec.rb +13 -13
- data/spec/shared/calendar_spec.rb +59 -59
- data/spec/shared/casefolder_spec.rb +5 -5
- data/spec/shared/code_point_spec.rb +46 -46
- data/spec/shared/currencies_spec.rb +7 -7
- data/spec/shared/language_codes_spec.rb +34 -34
- data/spec/shared/languages_spec.rb +30 -30
- data/spec/shared/numbering_system_spec.rb +7 -7
- data/spec/shared/numbers_spec.rb +4 -4
- data/spec/shared/phone_codes_spec.rb +7 -7
- data/spec/shared/postal_code_generator_spec.rb +76 -0
- data/spec/shared/postal_codes_spec.rb +35 -29
- data/spec/shared/territories_spec.rb +40 -40
- data/spec/shared/unicode_regex_spec.rb +71 -71
- data/spec/spec_helper.rb +2 -2
- data/spec/tokenizers/calendars/date_tokenizer_spec.rb +1 -1
- data/spec/tokenizers/calendars/timespan_tokenizer_spec.rb +6 -6
- data/spec/tokenizers/composite_token_spec.rb +3 -3
- data/spec/tokenizers/token_spec.rb +3 -3
- data/spec/twitter_cldr_spec.rb +72 -72
- data/spec/utils/code_points_spec.rb +10 -10
- data/spec/utils/interpolation_spec.rb +32 -32
- data/spec/utils/range_set_spec.rb +36 -36
- data/spec/utils/regexp_ast_spec.rb +44 -0
- data/spec/utils/regexp_sampler_spec.rb +182 -0
- data/spec/utils/yaml/yaml_spec.rb +23 -23
- data/spec/utils_spec.rb +19 -19
- metadata +263 -258
@@ -10,8 +10,8 @@ include TwitterCldr::Localized
|
|
10
10
|
describe LocalizedTimespan do
|
11
11
|
it "should format a numer of seconds in different units" do
|
12
12
|
timespan = LocalizedTimespan.new(-172800, :locale => :de)
|
13
|
-
timespan.to_s(:unit => :hour).
|
14
|
-
timespan.to_s(:unit => :day).
|
13
|
+
expect(timespan.to_s(:unit => :hour)).to match_normalized("Vor 48 Stunden")
|
14
|
+
expect(timespan.to_s(:unit => :day)).to match_normalized("Vor 2 Tagen")
|
15
15
|
end
|
16
16
|
|
17
17
|
it "approximates timespans accurately if explicity asked" do
|
@@ -37,7 +37,7 @@ describe LocalizedTimespan do
|
|
37
37
|
|
38
38
|
expected.each_pair do |seconds, text|
|
39
39
|
timespan = LocalizedTimespan.new(seconds, :locale => :de)
|
40
|
-
timespan.to_s(options).
|
40
|
+
expect(timespan.to_s(options)).to match_normalized(text)
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
@@ -61,7 +61,7 @@ describe LocalizedTimespan do
|
|
61
61
|
|
62
62
|
expected.each_pair do |seconds, text|
|
63
63
|
timespan = LocalizedTimespan.new(seconds, :locale => :de)
|
64
|
-
timespan.to_s(options).
|
64
|
+
expect(timespan.to_s(options)).to match_normalized(text)
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
@@ -80,7 +80,7 @@ describe LocalizedTimespan do
|
|
80
80
|
}
|
81
81
|
|
82
82
|
expected.each_pair do |unit, text|
|
83
|
-
timespan.to_s(options.merge(:unit => unit)).
|
83
|
+
expect(timespan.to_s(options.merge(:unit => unit))).to match_normalized(text)
|
84
84
|
end
|
85
85
|
end
|
86
86
|
end
|
@@ -99,7 +99,7 @@ describe LocalizedTimespan do
|
|
99
99
|
}
|
100
100
|
|
101
101
|
expected.each_pair do |unit, text|
|
102
|
-
timespan.to_s(:unit => unit).
|
102
|
+
expect(timespan.to_s(:unit => unit)).to match_normalized(text)
|
103
103
|
end
|
104
104
|
end
|
105
105
|
end
|
@@ -118,7 +118,7 @@ describe LocalizedTimespan do
|
|
118
118
|
}
|
119
119
|
|
120
120
|
expected.each_pair do |unit, text|
|
121
|
-
timespan.to_s(:unit => unit).
|
121
|
+
expect(timespan.to_s(:unit => unit)).to match_normalized(text)
|
122
122
|
end
|
123
123
|
end
|
124
124
|
end
|
@@ -137,7 +137,7 @@ describe LocalizedTimespan do
|
|
137
137
|
}
|
138
138
|
|
139
139
|
expected.each_pair do |unit, text|
|
140
|
-
timespan.to_s(options.merge(:unit => unit)).
|
140
|
+
expect(timespan.to_s(options.merge(:unit => unit))).to match_normalized(text)
|
141
141
|
end
|
142
142
|
end
|
143
143
|
end
|
data/spec/normalization_spec.rb
CHANGED
@@ -13,28 +13,28 @@ describe TwitterCldr::Normalization do
|
|
13
13
|
|
14
14
|
it 'it uses NFD by default' do
|
15
15
|
mock(Eprun).normalize(string, :nfd) { normalized_string }
|
16
|
-
TwitterCldr::Normalization.normalize(string).
|
16
|
+
expect(TwitterCldr::Normalization.normalize(string)).to eq(normalized_string)
|
17
17
|
end
|
18
18
|
|
19
19
|
it "uses specified algorithm if there is any" do
|
20
20
|
mock(Eprun).normalize(string, :nfkd) { normalized_string }
|
21
|
-
TwitterCldr::Normalization.normalize(string, :using => :nfkd).
|
21
|
+
expect(TwitterCldr::Normalization.normalize(string, :using => :nfkd)).to eq(normalized_string)
|
22
22
|
end
|
23
23
|
|
24
24
|
it "raises an ArgumentError if passed an unsupported normalizer name" do
|
25
|
-
|
25
|
+
expect do
|
26
26
|
TwitterCldr::Normalization.normalize(string, :using => :blarg)
|
27
|
-
end.
|
27
|
+
end.to raise_error(ArgumentError)
|
28
28
|
end
|
29
29
|
|
30
30
|
it 'accepts normalizer name in upper case' do
|
31
31
|
mock(Eprun).normalize(string, :nfkd) { normalized_string }
|
32
|
-
TwitterCldr::Normalization.normalize(string, :using => :NFKD).
|
32
|
+
expect(TwitterCldr::Normalization.normalize(string, :using => :NFKD)).to eq(normalized_string)
|
33
33
|
end
|
34
34
|
|
35
35
|
it 'accepts a string' do
|
36
36
|
mock(Eprun).normalize(string, :nfkd) { normalized_string }
|
37
|
-
TwitterCldr::Normalization.normalize(string, :using => 'nfkd').
|
37
|
+
expect(TwitterCldr::Normalization.normalize(string, :using => 'nfkd')).to eq(normalized_string)
|
38
38
|
end
|
39
39
|
|
40
40
|
end
|
@@ -16,112 +16,112 @@ describe NumberParser do
|
|
16
16
|
|
17
17
|
describe "#group_separator" do
|
18
18
|
it "returns the correct group separator" do
|
19
|
-
@parser.send(:group_separator).
|
19
|
+
expect(@parser.send(:group_separator)).to match_normalized(" ")
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
23
|
describe "#decimal_separator" do
|
24
24
|
it "returns the correct decimal separator" do
|
25
|
-
@parser.send(:decimal_separator).
|
25
|
+
expect(@parser.send(:decimal_separator)).to eq(",")
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
29
|
describe "#identify" do
|
30
30
|
it "properly identifies a numeric value" do
|
31
|
-
@parser.send(:identify, "7841", *separators).
|
31
|
+
expect(@parser.send(:identify, "7841", *separators)).to eq({ :value => "7841", :type => :numeric })
|
32
32
|
end
|
33
33
|
|
34
34
|
it "properly identifies a decimal separator" do
|
35
|
-
@parser.send(:identify, ",", *separators).
|
35
|
+
expect(@parser.send(:identify, ",", *separators)).to eq({ :value => ",", :type => :decimal })
|
36
36
|
end
|
37
37
|
|
38
38
|
it "properly identifies a group separator" do
|
39
|
-
@parser.send(:identify, ".", *separators).
|
39
|
+
expect(@parser.send(:identify, ".", *separators)).to eq({ :value => ".", :type => :group })
|
40
40
|
end
|
41
41
|
|
42
42
|
it "returns nil if the text doesn't match a number or either separators" do
|
43
|
-
@parser.send(:identify, "abc", *separators).
|
43
|
+
expect(@parser.send(:identify, "abc", *separators)).to eq({ :value => "abc", :type => nil })
|
44
44
|
end
|
45
45
|
end
|
46
46
|
|
47
47
|
describe "#tokenize" do
|
48
48
|
it "splits text by numericality and group/decimal separators" do
|
49
|
-
@parser.send(:tokenize, "1,33.00", *separators).
|
49
|
+
expect(@parser.send(:tokenize, "1,33.00", *separators)).to eq([
|
50
50
|
{ :value => "1", :type => :numeric },
|
51
51
|
{ :value => ",", :type => :decimal },
|
52
52
|
{ :value => "33", :type => :numeric },
|
53
53
|
{ :value => ".", :type => :group },
|
54
54
|
{ :value => "00", :type => :numeric }
|
55
|
-
]
|
55
|
+
])
|
56
56
|
end
|
57
57
|
|
58
58
|
it "returns an empty array for a non-numeric string" do
|
59
|
-
@parser.send(:tokenize, "abc", *separators).
|
59
|
+
expect(@parser.send(:tokenize, "abc", *separators)).to be_empty
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
63
|
describe "#separators" do
|
64
64
|
it "returns all separators when strict mode is off" do
|
65
65
|
group, decimal = @parser.send(:separators, false)
|
66
|
-
group.
|
67
|
-
decimal.
|
66
|
+
expect(group).to eq('\.,\s')
|
67
|
+
expect(decimal).to eq('\.,\s')
|
68
68
|
end
|
69
69
|
|
70
70
|
it "returns only locale-specific separators when strict mode is on" do
|
71
71
|
group, decimal = @parser.send(:separators, true)
|
72
|
-
group.
|
73
|
-
decimal.
|
72
|
+
expect(group).to match_normalized(" ")
|
73
|
+
expect(decimal).to eq(',')
|
74
74
|
end
|
75
75
|
end
|
76
76
|
|
77
77
|
describe "#punct_valid" do
|
78
78
|
it "correctly validates a number with no decimal" do
|
79
79
|
tokens = @parser.send(:tokenize, "1.337", *separators).reject { |t| t[:type] == :numeric }
|
80
|
-
@parser.send(:punct_valid?, tokens).
|
80
|
+
expect(@parser.send(:punct_valid?, tokens)).to be_true
|
81
81
|
end
|
82
82
|
|
83
83
|
it "correctly validates a number with a decimal" do
|
84
84
|
tokens = @parser.send(:tokenize, "1.337,00", *separators).reject { |t| t[:type] == :numeric }
|
85
|
-
@parser.send(:punct_valid?, tokens).
|
85
|
+
expect(@parser.send(:punct_valid?, tokens)).to be_true
|
86
86
|
end
|
87
87
|
|
88
88
|
it "reports on an invalid number when it has more than one decimal" do
|
89
89
|
tokens = @parser.send(:tokenize, "1,337,00", *separators).reject { |t| t[:type] == :numeric }
|
90
|
-
@parser.send(:punct_valid?, tokens).
|
90
|
+
expect(@parser.send(:punct_valid?, tokens)).to be_false
|
91
91
|
end
|
92
92
|
end
|
93
93
|
|
94
94
|
describe "#is_numeric?" do
|
95
95
|
it "returns true if the text is numeric" do
|
96
|
-
NumberParser.is_numeric?("4839", "").
|
97
|
-
NumberParser.is_numeric?("1", "").
|
96
|
+
expect(NumberParser.is_numeric?("4839", "")).to be_true
|
97
|
+
expect(NumberParser.is_numeric?("1", "")).to be_true
|
98
98
|
end
|
99
99
|
|
100
100
|
it "returns false if the text is not purely numeric" do
|
101
|
-
NumberParser.is_numeric?("abc", "").
|
102
|
-
NumberParser.is_numeric?("123abc", "").
|
101
|
+
expect(NumberParser.is_numeric?("abc", "")).to be_false
|
102
|
+
expect(NumberParser.is_numeric?("123abc", "")).to be_false
|
103
103
|
end
|
104
104
|
|
105
105
|
it "returns false if the text is blank" do
|
106
|
-
NumberParser.is_numeric?("", "").
|
106
|
+
expect(NumberParser.is_numeric?("", "")).to be_false
|
107
107
|
end
|
108
108
|
|
109
109
|
it "accepts the given characters as valid numerics" do
|
110
|
-
NumberParser.is_numeric?("a123a", "a").
|
111
|
-
NumberParser.is_numeric?("1.234,56").
|
110
|
+
expect(NumberParser.is_numeric?("a123a", "a")).to be_true
|
111
|
+
expect(NumberParser.is_numeric?("1.234,56")).to be_true # default separator chars used here
|
112
112
|
end
|
113
113
|
end
|
114
114
|
|
115
115
|
describe "#valid?" do
|
116
116
|
it "correctly identifies a series of valid cases" do
|
117
117
|
["5", "5,0", "1.337", "1.337,0", "0,05", ",5", "1.337.000,00"].each do |num|
|
118
|
-
@parser.valid?(num).
|
118
|
+
expect(@parser.valid?(num)).to be_true
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
122
122
|
it "correctly identifies a series of invalid cases" do
|
123
123
|
["12,0,0", "5,", "5#{[160].pack("U*")}"].each do |num|
|
124
|
-
@parser.valid?(num).
|
124
|
+
expect(@parser.valid?(num)).to be_false
|
125
125
|
end
|
126
126
|
end
|
127
127
|
end
|
@@ -139,49 +139,49 @@ describe NumberParser do
|
|
139
139
|
}
|
140
140
|
|
141
141
|
cases.each do |text, expected|
|
142
|
-
@parser.parse(text).
|
142
|
+
expect(@parser.parse(text)).to eq(expected)
|
143
143
|
end
|
144
144
|
end
|
145
145
|
|
146
146
|
it "correctly raises an error when asked to parse invalid numbers" do
|
147
147
|
cases = ["12,0,0", "5,", "5#{[160].pack("U*")}"]
|
148
148
|
cases.each do |text|
|
149
|
-
|
149
|
+
expect { @parser.parse(text) }.to raise_error(InvalidNumberError)
|
150
150
|
end
|
151
151
|
end
|
152
152
|
|
153
153
|
context "non-strict" do
|
154
154
|
it "succeeds in parsing even if inexact punctuation is used" do
|
155
|
-
@parser.parse("5 100", :strict => false).
|
155
|
+
expect(@parser.parse("5 100", :strict => false)).to eq(5100)
|
156
156
|
end
|
157
157
|
end
|
158
158
|
end
|
159
159
|
|
160
160
|
describe "#try_parse" do
|
161
161
|
it "parses correctly with a valid number" do
|
162
|
-
@parser.try_parse("1.234").
|
162
|
+
expect(@parser.try_parse("1.234")).to eq(1234)
|
163
163
|
end
|
164
164
|
|
165
165
|
it "parses correctly with a valid number and yields to the given block" do
|
166
166
|
pre_result = nil
|
167
|
-
@parser.try_parse("1.234") do |result|
|
167
|
+
expect(@parser.try_parse("1.234") do |result|
|
168
168
|
pre_result = result
|
169
169
|
9
|
170
|
-
end.
|
171
|
-
pre_result.
|
170
|
+
end).to eq(9)
|
171
|
+
expect(pre_result).to eq(1234)
|
172
172
|
end
|
173
173
|
|
174
174
|
it "falls back on the default value if the number is invalid" do
|
175
|
-
@parser.try_parse("5,").
|
176
|
-
@parser.try_parse("5,", 0).
|
175
|
+
expect(@parser.try_parse("5,")).to be_nil
|
176
|
+
expect(@parser.try_parse("5,", 0)).to eq(0)
|
177
177
|
end
|
178
178
|
|
179
179
|
it "falls back on the block if the number is invalid" do
|
180
|
-
@parser.try_parse("5,") { |result| 9 }.
|
180
|
+
expect(@parser.try_parse("5,") { |result| 9 }).to eq(9)
|
181
181
|
end
|
182
182
|
|
183
183
|
it "doesn't catch anything but an InvalidNumberError" do
|
184
|
-
|
184
|
+
expect { @parser.try_parse(Object.new) }.to raise_error(NoMethodError)
|
185
185
|
end
|
186
186
|
end
|
187
187
|
end
|
data/spec/parsers/parser_spec.rb
CHANGED
@@ -32,9 +32,9 @@ describe Parser do
|
|
32
32
|
it "should reset the token index" do
|
33
33
|
parser.parse(tokens)
|
34
34
|
parser.send(:next_token, :a)
|
35
|
-
parser.send(:current_token).type.
|
35
|
+
expect(parser.send(:current_token).type).to eq(:b)
|
36
36
|
parser.reset
|
37
|
-
parser.send(:current_token).type.
|
37
|
+
expect(parser.send(:current_token).type).to eq(:a)
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
@@ -42,19 +42,19 @@ describe Parser do
|
|
42
42
|
it "should advance to the next token" do
|
43
43
|
parser.parse(tokens)
|
44
44
|
parser.send(:next_token, :a)
|
45
|
-
parser.send(:current_token).type.
|
45
|
+
expect(parser.send(:current_token).type).to eq(:b)
|
46
46
|
end
|
47
47
|
|
48
48
|
it "should raise an error after encountering an unexpected token" do
|
49
49
|
parser.parse(tokens)
|
50
|
-
|
50
|
+
expect { parser.send(:next_token, :z) }.to raise_error(UnexpectedTokenError)
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
54
|
describe "#current_token" do
|
55
55
|
it "returns the current token" do
|
56
56
|
parser.parse(tokens)
|
57
|
-
parser.send(:current_token).type.
|
57
|
+
expect(parser.send(:current_token).type).to eq(:a)
|
58
58
|
end
|
59
59
|
end
|
60
60
|
end
|
@@ -30,21 +30,21 @@ describe "Segmentation" do
|
|
30
30
|
describe "#parse" do
|
31
31
|
it "should parse a rule with a break" do
|
32
32
|
rule = parse(tokenize("[a-z] ÷ [0-9]"))
|
33
|
-
rule.left.to_regexp_str.
|
34
|
-
rule.right.to_regexp_str.
|
35
|
-
rule.boundary_symbol.
|
33
|
+
expect(rule.left.to_regexp_str).to eq("\\A(?:[\\141-\\172])")
|
34
|
+
expect(rule.right.to_regexp_str).to eq("\\A(?:[\\60-\\71])")
|
35
|
+
expect(rule.boundary_symbol).to eq(:break)
|
36
36
|
end
|
37
37
|
|
38
38
|
it "should parse a rule with a non-break" do
|
39
39
|
rule = parse(tokenize("[a-z] × [0-9]"))
|
40
|
-
rule.regex.to_regexp_str.
|
41
|
-
rule.boundary_symbol.
|
40
|
+
expect(rule.regex.to_regexp_str).to eq("\\A(?:[\\141-\\172])(?:[\\60-\\71])")
|
41
|
+
expect(rule.boundary_symbol).to eq(:no_break)
|
42
42
|
end
|
43
43
|
|
44
44
|
it "should parse a rule containing a variable" do
|
45
45
|
rule = parse(tokenize("$FOO × bar"), :symbol_table => symbol_table)
|
46
|
-
rule.regex.to_regexp_str.
|
47
|
-
rule.boundary_symbol.
|
46
|
+
expect(rule.regex.to_regexp_str).to eq("\\A(?:[\\141-\\143])(?:\\142)(?:\\141)(?:\\162)")
|
47
|
+
expect(rule.boundary_symbol).to eq(:no_break)
|
48
48
|
end
|
49
49
|
end
|
50
50
|
end
|
@@ -54,19 +54,19 @@ describe "Segmentation" do
|
|
54
54
|
let(:rule) { parse(tokenize("[a-z] ÷ [0-9]")) }
|
55
55
|
|
56
56
|
it "rule should be the right type" do
|
57
|
-
rule.
|
57
|
+
expect(rule).to be_a(SegmentationParser::BreakRule)
|
58
58
|
end
|
59
59
|
|
60
60
|
it "should match and return the right offset and text" do
|
61
61
|
match = rule.match("c7")
|
62
|
-
match.boundary_offset.
|
63
|
-
match.text.
|
62
|
+
expect(match.boundary_offset).to eq(1)
|
63
|
+
expect(match.text).to eq("c7")
|
64
64
|
end
|
65
65
|
|
66
66
|
it "should not match if the input string doesn't contain a matching right- and/or left-hand side" do
|
67
|
-
rule.match("C7").
|
68
|
-
rule.match("cc").
|
69
|
-
rule.match("CC").
|
67
|
+
expect(rule.match("C7")).to be_nil
|
68
|
+
expect(rule.match("cc")).to be_nil
|
69
|
+
expect(rule.match("CC")).to be_nil
|
70
70
|
end
|
71
71
|
end
|
72
72
|
end
|
@@ -76,20 +76,20 @@ describe "Segmentation" do
|
|
76
76
|
let(:rule) { parse(tokenize("[a-z] × [0-9]")) }
|
77
77
|
|
78
78
|
it "rule should be the right type" do
|
79
|
-
rule.
|
79
|
+
expect(rule).to be_a(SegmentationParser::NoBreakRule)
|
80
80
|
end
|
81
81
|
|
82
82
|
it "should match and return the right offset and text" do
|
83
83
|
match = rule.match("c7")
|
84
84
|
# non-break rules send you to the end of the match (maybe that's wrong?)
|
85
|
-
match.boundary_offset.
|
86
|
-
match.text.
|
85
|
+
expect(match.boundary_offset).to eq(2)
|
86
|
+
expect(match.text).to eq("c7")
|
87
87
|
end
|
88
88
|
|
89
89
|
it "should not match if the input string doesn't contain matching text" do
|
90
|
-
rule.match("C7").
|
91
|
-
rule.match("cc").
|
92
|
-
rule.match("CC").
|
90
|
+
expect(rule.match("C7")).to be_nil
|
91
|
+
expect(rule.match("cc")).to be_nil
|
92
|
+
expect(rule.match("CC")).to be_nil
|
93
93
|
end
|
94
94
|
end
|
95
95
|
end
|
@@ -12,13 +12,13 @@ describe SymbolTable do
|
|
12
12
|
|
13
13
|
describe "#fetch" do
|
14
14
|
it "should be able to retrieve values for symbols" do
|
15
|
-
table.fetch(:a).
|
15
|
+
expect(table.fetch(:a)).to eq("b")
|
16
16
|
fetch = lambda { table.fetch(:z) }
|
17
17
|
|
18
18
|
if RUBY_VERSION > "1.8.7"
|
19
|
-
fetch.
|
19
|
+
expect(fetch).to raise_error(KeyError)
|
20
20
|
else
|
21
|
-
fetch.
|
21
|
+
expect(fetch).to raise_error(IndexError)
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
@@ -26,7 +26,7 @@ describe SymbolTable do
|
|
26
26
|
describe "#add" do
|
27
27
|
it "should be able to add then fetch new values for symbols" do
|
28
28
|
table.add(:e, "f")
|
29
|
-
table.fetch(:e).
|
29
|
+
expect(table.fetch(:e)).to eq("f")
|
30
30
|
end
|
31
31
|
end
|
32
32
|
end
|
@@ -26,22 +26,22 @@ describe UnicodeRegexParser::CharacterClass do
|
|
26
26
|
describe "#to_set" do
|
27
27
|
it "unions together char classes with no explicit operator" do
|
28
28
|
char_class = char_class_from(parse(tokenize("[[a][b]]")))
|
29
|
-
char_class.to_set.to_a.
|
29
|
+
expect(char_class.to_set.to_a).to eq([97..98])
|
30
30
|
end
|
31
31
|
|
32
32
|
it "unions together other entities within char classes when operator is not explicit" do
|
33
33
|
char_class = char_class_from(parse(tokenize("[a-z0-9\\u0123]")))
|
34
|
-
char_class.to_set.to_a(true).
|
34
|
+
expect(char_class.to_set.to_a(true)).to eq([48..57, 97..122, 291])
|
35
35
|
end
|
36
36
|
|
37
37
|
it "intersects correctly" do
|
38
38
|
char_class = char_class_from(parse(tokenize("[[a-m]&[g-z]]")))
|
39
|
-
char_class.to_set.to_a.
|
39
|
+
expect(char_class.to_set.to_a).to eq([103..109])
|
40
40
|
end
|
41
41
|
|
42
42
|
it "finds symmetric differences correctly" do
|
43
43
|
char_class = char_class_from(parse(tokenize("[[a-m]-[g-z]]")))
|
44
|
-
char_class.to_set.to_a.
|
44
|
+
expect(char_class.to_set.to_a).to eq([97..102, 110..122])
|
45
45
|
end
|
46
46
|
|
47
47
|
it "computes sets for nested expressions" do
|
@@ -51,67 +51,67 @@ describe UnicodeRegexParser::CharacterClass do
|
|
51
51
|
# = (104..122) subtr ()
|
52
52
|
# = (104..122)
|
53
53
|
char_class = char_class_from(parse(tokenize("[[[a-m]&[h-j]]-[k-z]]")))
|
54
|
-
char_class.to_set.to_a.
|
54
|
+
expect(char_class.to_set.to_a).to eq([104..122])
|
55
55
|
end
|
56
56
|
|
57
57
|
it "pulls in ranges for unicode character sets" do
|
58
58
|
char_class = char_class_from(parse(tokenize("[\\p{Zs}]")))
|
59
|
-
char_class.to_set.to_a(true).
|
59
|
+
expect(char_class.to_set.to_a(true)).to eq([
|
60
60
|
32, 160, 5760, 6158, 8192..8202, 8239, 8287, 12288
|
61
|
-
]
|
61
|
+
])
|
62
62
|
end
|
63
63
|
|
64
64
|
it "computes unions between unicode character sets" do
|
65
65
|
char_class = char_class_from(parse(tokenize("[[\\p{Zs}][\\p{Cc}]]")))
|
66
|
-
char_class.to_set.to_a(true).
|
66
|
+
expect(char_class.to_set.to_a(true)).to eq([
|
67
67
|
0..1, 8..32, 127..160, 5760, 6158, 8192..8202, 8239, 8287, 12288
|
68
|
-
]
|
68
|
+
])
|
69
69
|
end
|
70
70
|
|
71
71
|
it "computes intersections between unicode character sets" do
|
72
72
|
char_class = char_class_from(parse(tokenize("[[\\p{Zs}]&[\\u2000-\\u202B]]")))
|
73
|
-
char_class.to_set.to_a(true).
|
73
|
+
expect(char_class.to_set.to_a(true)).to eq([8192..8202])
|
74
74
|
end
|
75
75
|
|
76
76
|
it "supports negating character sets" do
|
77
77
|
char_class = char_class_from(parse(tokenize("[^\\u2000-\\u202B]")))
|
78
|
-
char_class.to_set.to_a(true).
|
78
|
+
expect(char_class.to_set.to_a(true)).to eq([
|
79
79
|
0..1, 8..8191, 8236..55295, 57344..1114111
|
80
|
-
]
|
80
|
+
])
|
81
81
|
end
|
82
82
|
|
83
83
|
it "supports literal and escaped characters" do
|
84
84
|
char_class = char_class_from(parse(tokenize("[abc\\edf\\g]")))
|
85
|
-
char_class.to_set.to_a(true).
|
85
|
+
expect(char_class.to_set.to_a(true)).to eq([97..103])
|
86
86
|
end
|
87
87
|
|
88
88
|
it "supports special switch characters" do
|
89
89
|
char_class = char_class_from(parse(tokenize("[\\w]"))) # a-z, A-Z, 0-9, _
|
90
|
-
char_class.to_set.to_a(true).
|
90
|
+
expect(char_class.to_set.to_a(true)).to eq([48..57, 65..90, 95, 97..122])
|
91
91
|
end
|
92
92
|
|
93
93
|
it "supports negated switch characters" do
|
94
94
|
char_class = char_class_from(parse(tokenize("[\\D]"))) # i.e. NOT \d (everything except the digits 0-9)
|
95
|
-
char_class.to_set.to_a(true).
|
95
|
+
expect(char_class.to_set.to_a(true)).to eq([
|
96
96
|
0..1, 8..47, 58..55295, 57344..1114111
|
97
|
-
]
|
97
|
+
])
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
101
101
|
describe "#to_regexp_str" do
|
102
102
|
it "wraps ranges in square brackets" do
|
103
103
|
char_class = char_class_from(parse(tokenize("[a-z]")))
|
104
|
-
char_class.to_regexp_str.
|
104
|
+
expect(char_class.to_regexp_str).to eq("(?:[\\141-\\172])")
|
105
105
|
end
|
106
106
|
|
107
107
|
it "octal-encodes and wraps sequential characters to isolate bytes" do
|
108
108
|
char_class = char_class_from(parse(tokenize("[{foo}]")))
|
109
|
-
char_class.to_regexp_str.
|
109
|
+
expect(char_class.to_regexp_str).to eq("(?:(?:\\146)(?:\\157)(?:\\157))")
|
110
110
|
end
|
111
111
|
|
112
112
|
it "combines multiple components with 'or' pipe characters" do
|
113
113
|
char_class = char_class_from(parse(tokenize("[{foo}abc]")))
|
114
|
-
char_class.to_regexp_str.
|
114
|
+
expect(char_class.to_regexp_str).to eq("(?:(?:\\146)(?:\\157)(?:\\157)|[\\141-\\143])")
|
115
115
|
end
|
116
116
|
end
|
117
117
|
end
|
@@ -11,26 +11,26 @@ describe UnicodeRegexParser::CharacterSet do
|
|
11
11
|
describe "#to_set" do
|
12
12
|
it "should return a set containing codepoints for the given general property" do
|
13
13
|
char_set = UnicodeRegexParser::CharacterSet.new("Zs")
|
14
|
-
char_set.to_set.to_a(true).
|
14
|
+
expect(char_set.to_set.to_a(true)).to eq([
|
15
15
|
32, 160, 5760, 6158, 8192..8202, 8239, 8287, 12288
|
16
|
-
]
|
16
|
+
])
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should return a set containing codepoints for the given named property" do
|
20
20
|
char_set = UnicodeRegexParser::CharacterSet.new("Sentence_Break=Sp")
|
21
|
-
char_set.to_set.to_a(true).
|
21
|
+
expect(char_set.to_set.to_a(true)).to eq([
|
22
22
|
9, 11..12, 32, 160, 5760, 8192..8202, 8239, 8287, 12288
|
23
|
-
]
|
23
|
+
])
|
24
24
|
end
|
25
25
|
|
26
26
|
it "should raise an exception when given an invalid property name or value" do
|
27
|
-
|
27
|
+
expect do
|
28
28
|
UnicodeRegexParser::CharacterSet.new("Foobar=Sp").to_set
|
29
|
-
end.
|
29
|
+
end.to raise_error(UnicodeRegexParserError)
|
30
30
|
|
31
|
-
|
31
|
+
expect do
|
32
32
|
UnicodeRegexParser::CharacterSet.new("Sentence_Break=Foo").to_set
|
33
|
-
end.
|
33
|
+
end.to raise_error(UnicodeRegexParserError)
|
34
34
|
end
|
35
35
|
end
|
36
36
|
end
|
@@ -11,24 +11,24 @@ describe UnicodeRegexParser::Literal do
|
|
11
11
|
describe "#to_set" do
|
12
12
|
it "should return literal characters as codepoints" do
|
13
13
|
literal = UnicodeRegexParser::Literal.new("a")
|
14
|
-
literal.to_set.to_a(true).
|
14
|
+
expect(literal.to_set.to_a(true)).to eq([97])
|
15
15
|
end
|
16
16
|
|
17
17
|
it "should return escaped characters with no special meaning as codepoints" do
|
18
18
|
literal = UnicodeRegexParser::Literal.new("\\a")
|
19
|
-
literal.to_set.to_a(true).
|
19
|
+
expect(literal.to_set.to_a(true)).to eq([97])
|
20
20
|
end
|
21
21
|
|
22
22
|
it "should convert special regex switches to their range equivalents" do
|
23
23
|
literal = UnicodeRegexParser::Literal.new("\\d") # digit
|
24
|
-
literal.to_set.to_a(true).
|
24
|
+
expect(literal.to_set.to_a(true)).to eq([48..57])
|
25
25
|
end
|
26
26
|
|
27
27
|
it "should convert negated special regex switches to their range equivalents" do
|
28
28
|
literal = UnicodeRegexParser::Literal.new("\\D") # NOT digit
|
29
|
-
literal.to_set.to_a(true).
|
29
|
+
expect(literal.to_set.to_a(true)).to eq([
|
30
30
|
0..1, 8..47, 58..55295, 57344..1114111
|
31
|
-
]
|
31
|
+
])
|
32
32
|
end
|
33
33
|
end
|
34
34
|
end
|
@@ -11,12 +11,12 @@ describe UnicodeRegexParser::UnicodeString do
|
|
11
11
|
describe "#to_set" do
|
12
12
|
it "should return a zero-length range when representing a single codepoint" do
|
13
13
|
str = UnicodeRegexParser::UnicodeString.new([97])
|
14
|
-
str.to_set.to_a.
|
14
|
+
expect(str.to_set.to_a).to eq([97..97])
|
15
15
|
end
|
16
16
|
|
17
17
|
it "should return a range containing the codepoint array as both the first and last elements" do
|
18
18
|
str = UnicodeRegexParser::UnicodeString.new([97, 98, 99])
|
19
|
-
str.to_set.to_a.
|
19
|
+
expect(str.to_set.to_a).to eq([[97, 98, 99]..[97, 98, 99]])
|
20
20
|
end
|
21
21
|
end
|
22
22
|
end
|