twitter_cldr 5.1.0 → 5.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +5 -5
- data/lib/twitter_cldr.rb +1 -0
- data/lib/twitter_cldr/resources.rb +2 -8
- data/lib/twitter_cldr/resources/loader.rb +6 -4
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
- data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
- data/lib/twitter_cldr/segmentation.rb +10 -8
- data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
- data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
- data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
- data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
- data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
- data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
- data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
- data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
- data/lib/twitter_cldr/shared/caser.rb +1 -1
- data/lib/twitter_cldr/shared/locale.rb +6 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/segments/rules/el/sentence.yml +723 -0
- data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
- data/resources/shared/segments/rules/ja/line.yml +964 -0
- data/resources/shared/segments/rules/ja/word.yml +527 -0
- data/resources/shared/segments/rules/root/grapheme.yml +463 -0
- data/resources/shared/segments/rules/root/line.yml +964 -0
- data/resources/shared/segments/rules/root/sentence.yml +723 -0
- data/resources/shared/segments/rules/root/word.yml +527 -0
- data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
- data/resources/shared/segments/rules/zh/line.yml +964 -0
- data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
- data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
- data/resources/shared/segments/tests/line_break_test.yml +7348 -0
- data/resources/uli/segments/de.yml +5 -230
- data/resources/uli/segments/en.yml +3 -154
- data/resources/uli/segments/es.yml +5 -145
- data/resources/uli/segments/fr.yml +5 -68
- data/resources/uli/segments/it.yml +3 -48
- data/resources/uli/segments/pt.yml +5 -173
- data/resources/uli/segments/ru.yml +3 -10
- data/spec/segmentation/rule_set_spec.rb +54 -27
- metadata +29 -9
- data/lib/twitter_cldr/resources/uli.rb +0 -12
- data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
- data/lib/twitter_cldr/segmentation/parser.rb +0 -71
- data/lib/twitter_cldr/segmentation/rule.rb +0 -79
- data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
- data/resources/shared/segments/segments_root.yml +0 -869
- data/spec/segmentation/parser_spec.rb +0 -104
@@ -1,104 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'spec_helper'
|
7
|
-
|
8
|
-
describe 'Segmentation' do
|
9
|
-
let(:parser) { TwitterCldr::Segmentation::Parser.new }
|
10
|
-
|
11
|
-
def parse(tokens, options = {})
|
12
|
-
parser.parse(tokens, options)
|
13
|
-
end
|
14
|
-
|
15
|
-
describe TwitterCldr::Segmentation::Parser do
|
16
|
-
let(:symbol_table) do
|
17
|
-
TwitterCldr::Parsers::SymbolTable.new({
|
18
|
-
"$FOO" => parser.tokenize_regex("[abc]")
|
19
|
-
})
|
20
|
-
end
|
21
|
-
|
22
|
-
describe "#parse" do
|
23
|
-
it "should parse a rule with a break" do
|
24
|
-
rule = parse("[a-z] ÷ [0-9]")
|
25
|
-
expect(rule.left.to_regexp_str).to eq("(?:[\\u{0061}-\\u{007a}])")
|
26
|
-
expect(rule.right.to_regexp_str).to eq("(?:[\\u{0030}-\\u{0039}])")
|
27
|
-
expect(rule.boundary_symbol).to eq(:break)
|
28
|
-
end
|
29
|
-
|
30
|
-
it "should parse a rule with a non-break" do
|
31
|
-
rule = parse("[a-z] × [0-9]")
|
32
|
-
|
33
|
-
expect(rule.left.to_regexp_str).to eq(
|
34
|
-
"(?:[\\u{0061}-\\u{007a}])"
|
35
|
-
)
|
36
|
-
|
37
|
-
expect(rule.right.to_regexp_str).to eq(
|
38
|
-
"(?:[\\u{0030}-\\u{0039}])"
|
39
|
-
)
|
40
|
-
|
41
|
-
expect(rule.boundary_symbol).to eq(:no_break)
|
42
|
-
end
|
43
|
-
|
44
|
-
it "should parse a rule containing a variable" do
|
45
|
-
rule = parse("$FOO × bar", symbol_table: symbol_table)
|
46
|
-
|
47
|
-
expect(rule.left.to_regexp_str).to eq(
|
48
|
-
"(?:[\\u{0061}-\\u{0063}])"
|
49
|
-
)
|
50
|
-
|
51
|
-
expect(rule.right.to_regexp_str).to eq(
|
52
|
-
"(?:\\u{0062})(?:\\u{0061})(?:\\u{0072})"
|
53
|
-
)
|
54
|
-
|
55
|
-
expect(rule.boundary_symbol).to eq(:no_break)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
describe TwitterCldr::Segmentation::BreakRule do
|
61
|
-
describe "#match" do
|
62
|
-
let(:rule) { parse("[a-z] ÷ [0-9]") }
|
63
|
-
|
64
|
-
it "rule should be the right type" do
|
65
|
-
expect(rule).to be_a(TwitterCldr::Segmentation::BreakRule)
|
66
|
-
end
|
67
|
-
|
68
|
-
it "should match and return the right offset and text" do
|
69
|
-
cursor = TwitterCldr::Segmentation::Cursor.new("c7")
|
70
|
-
match = rule.match(cursor)
|
71
|
-
expect(match.boundary_offset).to eq([0, 2])
|
72
|
-
expect(match.boundary_position).to eq(1)
|
73
|
-
end
|
74
|
-
|
75
|
-
it "should not match if the input string doesn't contain a matching right- and/or left-hand side" do
|
76
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("C7"))).to be_nil
|
77
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("cc"))).to be_nil
|
78
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("CC"))).to be_nil
|
79
|
-
end
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
describe TwitterCldr::Segmentation::NoBreakRule do
|
84
|
-
describe "#match" do
|
85
|
-
let(:rule) { parse("[a-z] × [0-9]") }
|
86
|
-
|
87
|
-
it "rule should be the right type" do
|
88
|
-
expect(rule).to be_a(TwitterCldr::Segmentation::NoBreakRule)
|
89
|
-
end
|
90
|
-
|
91
|
-
it "should match and return the right offset and text" do
|
92
|
-
match = rule.match(TwitterCldr::Segmentation::Cursor.new("c7"))
|
93
|
-
expect(match.boundary_offset).to eq([0, 2])
|
94
|
-
expect(match.boundary_position).to eq(1)
|
95
|
-
end
|
96
|
-
|
97
|
-
it "should not match if the input string doesn't contain matching text" do
|
98
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("C7"))).to be_nil
|
99
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("cc"))).to be_nil
|
100
|
-
expect(rule.match(TwitterCldr::Segmentation::Cursor.new("CC"))).to be_nil
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|