twitter_cldr 5.1.0 → 5.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +5 -5
- data/lib/twitter_cldr.rb +1 -0
- data/lib/twitter_cldr/resources.rb +2 -8
- data/lib/twitter_cldr/resources/loader.rb +6 -4
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
- data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
- data/lib/twitter_cldr/segmentation.rb +10 -8
- data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
- data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
- data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
- data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
- data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
- data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
- data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
- data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
- data/lib/twitter_cldr/shared/caser.rb +1 -1
- data/lib/twitter_cldr/shared/locale.rb +6 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/segments/rules/el/sentence.yml +723 -0
- data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
- data/resources/shared/segments/rules/ja/line.yml +964 -0
- data/resources/shared/segments/rules/ja/word.yml +527 -0
- data/resources/shared/segments/rules/root/grapheme.yml +463 -0
- data/resources/shared/segments/rules/root/line.yml +964 -0
- data/resources/shared/segments/rules/root/sentence.yml +723 -0
- data/resources/shared/segments/rules/root/word.yml +527 -0
- data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
- data/resources/shared/segments/rules/zh/line.yml +964 -0
- data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
- data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
- data/resources/shared/segments/tests/line_break_test.yml +7348 -0
- data/resources/uli/segments/de.yml +5 -230
- data/resources/uli/segments/en.yml +3 -154
- data/resources/uli/segments/es.yml +5 -145
- data/resources/uli/segments/fr.yml +5 -68
- data/resources/uli/segments/it.yml +3 -48
- data/resources/uli/segments/pt.yml +5 -173
- data/resources/uli/segments/ru.yml +3 -10
- data/spec/segmentation/rule_set_spec.rb +54 -27
- metadata +29 -9
- data/lib/twitter_cldr/resources/uli.rb +0 -12
- data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
- data/lib/twitter_cldr/segmentation/parser.rb +0 -71
- data/lib/twitter_cldr/segmentation/rule.rb +0 -79
- data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
- data/resources/shared/segments/segments_root.yml +0 -869
- data/spec/segmentation/parser_spec.rb +0 -104
@@ -1,104 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'spec_helper'
|
7
|
-
|
8
|
-
describe 'Segmentation' do
|
9
|
-
let(:parser) { TwitterCldr::Segmentation::Parser.new }
|
10
|
-
|
11
|
-
def parse(tokens, options = {})
|
12
|
-
parser.parse(tokens, options)
|
13
|
-
end
|
14
|
-
|
15
|
-
describe TwitterCldr::Segmentation::Parser do
|
16
|
-
let(:symbol_table) do
|
17
|
-
TwitterCldr::Parsers::SymbolTable.new({
|
18
|
-
"$FOO" => parser.tokenize_regex("[abc]")
|
19
|
-
})
|
20
|
-
end
|
21
|
-
|
22
|
-
describe "#parse" do
|
23
|
-
it "should parse a rule with a break" do
|
24
|
-
rule = parse("[a-z] ÷ [0-9]")
|
25
|
-
expect(rule.left.to_regexp_str).to eq("(?:[\\u{0061}-\\u{007a}])")
|
26
|
-
expect(rule.right.to_regexp_str).to eq("(?:[\\u{0030}-\\u{0039}])")
|
27
|
-
expect(rule.boundary_symbol).to eq(:break)
|
28
|
-
end
|
29
|
-
|
30
|
-
it "should parse a rule with a non-break" do
|
31
|
-
rule = parse("[a-z] × [0-9]")
|
32
|
-
|
33
|
-
expect(rule.left.to_regexp_str).to eq(
|
34
|
-
"(?:[\\u{0061}-\\u{007a}])"
|
35
|
-
)
|
36
|
-
|
37
|
-
expect(rule.right.to_regexp_str).to eq(
|
38
|
-
"(?:[\\u{0030}-\\u{0039}])"
|
39
|
-
)
|
40
|
-
|
41
|
-
expect(rule.boundary_symbol).to eq(:no_break)
|
42
|
-
end
|
43
|
-
|
44
|
-
it "should parse a rule containing a variable" do
|
45
|
-
rule = parse("$FOO × bar", symbol_table: symbol_table)
|
46
|
-
|
47
|
-
expect(rule.left.to_regexp_str).to eq(
|
48
|
-
"(?:[\\u{0061}-\\u{0063}])"
|
49
|
-
)
|
50
|
-
|
51
|
-
expect(rule.right.to_regexp_str).to eq(
|
52
|
-
"(?:\\u{0062})(?:\\u{0061})(?:\\u{0072})"
|
53
|
-
)
|
54
|
-
|
55
|
-
expect(rule.boundary_symbol).to eq(:no_break)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
describe TwitterCldr::Segmentation::BreakRule do
|
61
|
-
describe "#match" do
|
62
|
-
let(:rule) { parse("[a-z] ÷ [0-9]") }
|
63
|
-
|
64
|
-
it "rule should be the right type" do
|
65
|
-
expect(rule).to be_a(TwitterCldr::Segmentation::BreakRule)
|
66
|
-
end
|
67
|
-
|
68
|
-
it "should match and return the right offset and text" do
|
69
|
-
cursor = TwitterCldr::Segmentation::Cursor.new("c7")
|
70
|
-
match = rule.match(cursor)
|
71
|
-
expect(match.boundary_offset).to eq([0, 2])
|
72
|
-
expect(match.boundary_position).to eq(1)
|
73
|
-
end
|
74
|
-
|
75
|
-
it "should not match if the input string doesn't contain a matching right- and/or left-hand side" do
|
76
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("C7"))).to be_nil
|
77
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("cc"))).to be_nil
|
78
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("CC"))).to be_nil
|
79
|
-
end
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
describe TwitterCldr::Segmentation::NoBreakRule do
|
84
|
-
describe "#match" do
|
85
|
-
let(:rule) { parse("[a-z] × [0-9]") }
|
86
|
-
|
87
|
-
it "rule should be the right type" do
|
88
|
-
expect(rule).to be_a(TwitterCldr::Segmentation::NoBreakRule)
|
89
|
-
end
|
90
|
-
|
91
|
-
it "should match and return the right offset and text" do
|
92
|
-
match = rule.match(TwitterCldr::Segmentation::Cursor.new("c7"))
|
93
|
-
expect(match.boundary_offset).to eq([0, 2])
|
94
|
-
expect(match.boundary_position).to eq(1)
|
95
|
-
end
|
96
|
-
|
97
|
-
it "should not match if the input string doesn't contain matching text" do
|
98
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("C7"))).to be_nil
|
99
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("cc"))).to be_nil
|
100
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("CC"))).to be_nil
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|