twitter_cldr 5.1.0 → 5.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +5 -5
- data/lib/twitter_cldr.rb +1 -0
- data/lib/twitter_cldr/resources.rb +2 -8
- data/lib/twitter_cldr/resources/loader.rb +6 -4
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
- data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
- data/lib/twitter_cldr/segmentation.rb +10 -8
- data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
- data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
- data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
- data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
- data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
- data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
- data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
- data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
- data/lib/twitter_cldr/shared/caser.rb +1 -1
- data/lib/twitter_cldr/shared/locale.rb +6 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/segments/rules/el/sentence.yml +723 -0
- data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
- data/resources/shared/segments/rules/ja/line.yml +964 -0
- data/resources/shared/segments/rules/ja/word.yml +527 -0
- data/resources/shared/segments/rules/root/grapheme.yml +463 -0
- data/resources/shared/segments/rules/root/line.yml +964 -0
- data/resources/shared/segments/rules/root/sentence.yml +723 -0
- data/resources/shared/segments/rules/root/word.yml +527 -0
- data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
- data/resources/shared/segments/rules/zh/line.yml +964 -0
- data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
- data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
- data/resources/shared/segments/tests/line_break_test.yml +7348 -0
- data/resources/uli/segments/de.yml +5 -230
- data/resources/uli/segments/en.yml +3 -154
- data/resources/uli/segments/es.yml +5 -145
- data/resources/uli/segments/fr.yml +5 -68
- data/resources/uli/segments/it.yml +3 -48
- data/resources/uli/segments/pt.yml +5 -173
- data/resources/uli/segments/ru.yml +3 -10
- data/spec/segmentation/rule_set_spec.rb +54 -27
- metadata +29 -9
- data/lib/twitter_cldr/resources/uli.rb +0 -12
- data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
- data/lib/twitter_cldr/segmentation/parser.rb +0 -71
- data/lib/twitter_cldr/segmentation/rule.rb +0 -79
- data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
- data/resources/shared/segments/segments_root.yml +0 -869
- data/spec/segmentation/parser_spec.rb +0 -104
@@ -6,10 +6,12 @@
|
|
6
6
|
module TwitterCldr
|
7
7
|
module Segmentation
|
8
8
|
class Cursor
|
9
|
-
attr_reader :text, :
|
9
|
+
attr_reader :text, :codepoints
|
10
|
+
attr_accessor :position
|
10
11
|
|
11
12
|
def initialize(text)
|
12
13
|
@text = text
|
14
|
+
@codepoints = text.codepoints
|
13
15
|
reset
|
14
16
|
end
|
15
17
|
|
@@ -19,15 +21,18 @@ module TwitterCldr
|
|
19
21
|
|
20
22
|
def reset
|
21
23
|
@position = 0
|
22
|
-
@match_cache = {}
|
23
24
|
end
|
24
25
|
|
25
|
-
def
|
26
|
+
def eos?
|
26
27
|
position >= text.size
|
27
28
|
end
|
28
29
|
|
29
|
-
def
|
30
|
-
|
30
|
+
def codepoint(pos = position)
|
31
|
+
codepoints[pos]
|
32
|
+
end
|
33
|
+
|
34
|
+
def length
|
35
|
+
text.length
|
31
36
|
end
|
32
37
|
end
|
33
38
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Segmentation
|
8
|
+
class Metadata
|
9
|
+
attr_reader :values
|
10
|
+
|
11
|
+
def initialize(values)
|
12
|
+
@values = values
|
13
|
+
end
|
14
|
+
|
15
|
+
def category_count
|
16
|
+
@category_count ||= values[:category_count]
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'singleton'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class NullSuppressions
|
11
|
+
include Singleton
|
12
|
+
|
13
|
+
def should_break?(_cursor)
|
14
|
+
true
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -8,109 +8,53 @@ module TwitterCldr
|
|
8
8
|
class RuleSet
|
9
9
|
|
10
10
|
class << self
|
11
|
-
def
|
12
|
-
|
11
|
+
def create(locale, boundary_type, options = {})
|
12
|
+
new(locale, StateMachine.instance(boundary_type, locale), options)
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
attr_reader :locale, :
|
16
|
+
attr_reader :locale, :state_machine
|
17
17
|
attr_accessor :use_uli_exceptions
|
18
18
|
|
19
19
|
alias_method :use_uli_exceptions?, :use_uli_exceptions
|
20
20
|
|
21
|
-
def initialize(locale,
|
21
|
+
def initialize(locale, state_machine, options)
|
22
22
|
@locale = locale
|
23
|
-
@
|
24
|
-
@boundary_type = boundary_type
|
23
|
+
@state_machine = state_machine
|
25
24
|
@use_uli_exceptions = options.fetch(
|
26
25
|
:use_uli_exceptions, false
|
27
26
|
)
|
28
27
|
end
|
29
28
|
|
30
29
|
def each_boundary(str)
|
31
|
-
|
32
|
-
cursor = Cursor.new(str)
|
33
|
-
last_boundary = 0
|
30
|
+
return to_enum(__method__, str) unless block_given?
|
34
31
|
|
35
|
-
|
36
|
-
yield 0
|
32
|
+
cursor = Cursor.new(str)
|
37
33
|
|
38
|
-
|
39
|
-
|
40
|
-
|
34
|
+
# Let the state machine find the first boundary for the line
|
35
|
+
# boundary type. This helps pass nearly all the Unicode
|
36
|
+
# segmentation tests, so it must be the right thing to do.
|
37
|
+
# Normally the first boundary is the implicit start of text
|
38
|
+
# boundary, but potentially not for the line rules?
|
39
|
+
yield 0 unless state_machine.boundary_type == 'line'
|
41
40
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
if match.boundary_position == cursor.position
|
48
|
-
cursor.advance
|
49
|
-
else
|
50
|
-
cursor.advance(
|
51
|
-
match.boundary_position - cursor.position
|
52
|
-
)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
# implicit end of text boundary
|
57
|
-
yield str.size unless last_boundary == str.size
|
58
|
-
else
|
59
|
-
to_enum(__method__, str)
|
41
|
+
until cursor.eos?
|
42
|
+
state_machine.handle_next(cursor)
|
43
|
+
yield cursor.position if suppressions.should_break?(cursor)
|
60
44
|
end
|
61
45
|
end
|
62
46
|
|
63
|
-
|
64
|
-
|
65
|
-
def each_rule(&block)
|
66
|
-
if block_given?
|
67
|
-
if use_uli_exceptions? && supports_exceptions?
|
68
|
-
yield exception_rule
|
69
|
-
end
|
70
|
-
|
71
|
-
rules.each(&block)
|
72
|
-
else
|
73
|
-
to_enum(__method__)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
def exception_rule
|
78
|
-
@exception_rule ||= RuleSetBuilder.exception_rule_for(
|
79
|
-
locale, boundary_type
|
80
|
-
)
|
81
|
-
end
|
82
|
-
|
83
|
-
def supports_exceptions?
|
84
|
-
boundary_type == 'sentence'
|
47
|
+
def boundary_type
|
48
|
+
state_machine.boundary_type
|
85
49
|
end
|
86
50
|
|
87
|
-
|
88
|
-
match = find_cached_match(cursor)
|
51
|
+
private
|
89
52
|
|
90
|
-
|
91
|
-
|
53
|
+
def suppressions
|
54
|
+
@suppressions ||= if use_uli_exceptions?
|
55
|
+
Suppressions.instance(boundary_type, locale)
|
92
56
|
else
|
93
|
-
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
def find_cached_match(cursor)
|
98
|
-
cursor.match_cache.fetch(cursor.position) do
|
99
|
-
matches = match_all(cursor)
|
100
|
-
|
101
|
-
matches.each do |m|
|
102
|
-
cursor.match_cache[m.boundary_position - 1] ||= m
|
103
|
-
end
|
104
|
-
|
105
|
-
matches.first
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
def match_all(cursor)
|
110
|
-
each_rule.each_with_object([]) do |rule, ret|
|
111
|
-
if match = rule.match(cursor)
|
112
|
-
ret << match
|
113
|
-
end
|
57
|
+
NullSuppressions.instance
|
114
58
|
end
|
115
59
|
end
|
116
60
|
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
require 'singleton'
|
8
|
+
|
9
|
+
module TwitterCldr
|
10
|
+
module Segmentation
|
11
|
+
class StateMachine
|
12
|
+
include Singleton
|
13
|
+
|
14
|
+
START_STATE = 1
|
15
|
+
STOP_STATE = 0
|
16
|
+
NEXT_STATES = 4
|
17
|
+
ACCEPTING = 0
|
18
|
+
|
19
|
+
class << self
|
20
|
+
def instance(boundary_type, locale)
|
21
|
+
resource_path = find_resource(boundary_type, locale)
|
22
|
+
|
23
|
+
cache[resource_path] ||= begin
|
24
|
+
rsrc = TwitterCldr.get_resource(resource_path)
|
25
|
+
|
26
|
+
new(
|
27
|
+
boundary_type,
|
28
|
+
locale,
|
29
|
+
Metadata.new(rsrc[:metadata]),
|
30
|
+
StateTable.load16(rsrc[:forward_table]),
|
31
|
+
StateTable.load16(rsrc[:backward_table]),
|
32
|
+
StatusTable.load(rsrc[:status_table]),
|
33
|
+
CategoryTable.load16(rsrc[:category_table])
|
34
|
+
)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def find_resource(boundary_type, locale)
|
41
|
+
path = TwitterCldr.resource_file_path(
|
42
|
+
['shared', 'segments', 'rules', locale, boundary_type]
|
43
|
+
)
|
44
|
+
|
45
|
+
return path if TwitterCldr.resource_exists?(path)
|
46
|
+
|
47
|
+
TwitterCldr.resource_file_path(
|
48
|
+
['shared', 'segments', 'rules', 'root', boundary_type]
|
49
|
+
)
|
50
|
+
end
|
51
|
+
|
52
|
+
def cache
|
53
|
+
@cache ||= {}
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
attr_reader :boundary_type, :locale
|
58
|
+
attr_reader :metadata, :ftable, :rtable, :status_table, :category_table
|
59
|
+
|
60
|
+
def initialize(boundary_type, locale, metadata, ftable, rtable, status_table, category_table)
|
61
|
+
@boundary_type = boundary_type
|
62
|
+
@locale = locale
|
63
|
+
@metadata = metadata
|
64
|
+
@ftable = ftable
|
65
|
+
@rtable = rtable
|
66
|
+
@status_table = status_table
|
67
|
+
@category_table = category_table
|
68
|
+
end
|
69
|
+
|
70
|
+
def handle_next(cursor)
|
71
|
+
result = initial_position = cursor.position
|
72
|
+
state = START_STATE
|
73
|
+
row = row_index_for(state)
|
74
|
+
category = 3
|
75
|
+
mode = :run
|
76
|
+
|
77
|
+
if ftable.bof_required?
|
78
|
+
category = 2
|
79
|
+
mode = :start
|
80
|
+
end
|
81
|
+
|
82
|
+
until state == STOP_STATE
|
83
|
+
if cursor.eos?
|
84
|
+
break if mode == :stop
|
85
|
+
mode = :stop
|
86
|
+
category = 1
|
87
|
+
elsif mode == :run
|
88
|
+
category = category_table.get(cursor.codepoint)
|
89
|
+
|
90
|
+
if (category & 0x4000) != 0
|
91
|
+
category &= ~0x4000
|
92
|
+
end
|
93
|
+
|
94
|
+
cursor.advance
|
95
|
+
else
|
96
|
+
mode = :run
|
97
|
+
end
|
98
|
+
|
99
|
+
state = ftable[row + NEXT_STATES + category]
|
100
|
+
row = row_index_for(state)
|
101
|
+
|
102
|
+
if ftable[row + ACCEPTING] == -1
|
103
|
+
# match found
|
104
|
+
result = cursor.position
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
cursor.position = result
|
109
|
+
|
110
|
+
# don't let cursor get stuck
|
111
|
+
if cursor.position == initial_position
|
112
|
+
cursor.advance
|
113
|
+
end
|
114
|
+
|
115
|
+
result
|
116
|
+
end
|
117
|
+
|
118
|
+
private
|
119
|
+
|
120
|
+
def row_index_for(state)
|
121
|
+
state * (metadata.category_count + 4)
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class StateTable
|
11
|
+
PACK_FMT_16 = 's!*'.freeze
|
12
|
+
BOF_REQUIRED_FLAG = 2
|
13
|
+
|
14
|
+
class << self
|
15
|
+
def load16(hash)
|
16
|
+
new(
|
17
|
+
Base64.decode64(hash[:table]).unpack(PACK_FMT_16),
|
18
|
+
hash[:flags]
|
19
|
+
)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
attr_reader :values, :flags
|
24
|
+
|
25
|
+
def initialize(values, flags)
|
26
|
+
@values = values
|
27
|
+
@flags = flags
|
28
|
+
end
|
29
|
+
|
30
|
+
def [](idx)
|
31
|
+
values[idx]
|
32
|
+
end
|
33
|
+
|
34
|
+
def bof_required?
|
35
|
+
flags & BOF_REQUIRED_FLAG != 0
|
36
|
+
end
|
37
|
+
|
38
|
+
def dump16
|
39
|
+
{
|
40
|
+
table: Base64.encode64(values.pack(PACK_FMT_16)).strip,
|
41
|
+
flags: flags
|
42
|
+
}
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class StatusTable
|
11
|
+
PACK_FMT = 'I!*'.freeze
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def load(hash)
|
15
|
+
new(Base64.decode64(hash[:table]).unpack(PACK_FMT))
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
attr_reader :values
|
20
|
+
|
21
|
+
def initialize(values)
|
22
|
+
@values = values
|
23
|
+
end
|
24
|
+
|
25
|
+
def dump
|
26
|
+
{ table: Base64.encode64(values.pack(PACK_FMT)).strip }
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'singleton'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class Suppressions
|
11
|
+
include Singleton
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def instance(boundary_type, locale)
|
15
|
+
resource_path = find_resource(boundary_type, locale)
|
16
|
+
return NullSuppressions.instance unless resource_path
|
17
|
+
|
18
|
+
cache[resource_path] ||= begin
|
19
|
+
rsrc = TwitterCldr.get_resource(resource_path)
|
20
|
+
|
21
|
+
new(
|
22
|
+
Marshal.load(rsrc[:forwards_trie]),
|
23
|
+
Marshal.load(rsrc[:backwards_trie])
|
24
|
+
)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def find_resource(boundary_type, locale)
|
31
|
+
path = TwitterCldr.resource_file_path(
|
32
|
+
['shared', 'segments', 'suppressions', locale, boundary_type]
|
33
|
+
)
|
34
|
+
|
35
|
+
path if TwitterCldr.resource_exists?(path)
|
36
|
+
end
|
37
|
+
|
38
|
+
def cache
|
39
|
+
@cache ||= {}
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
attr_reader :forward_trie, :backward_trie
|
44
|
+
|
45
|
+
def initialize(forward_trie, backward_trie)
|
46
|
+
@forward_trie = forward_trie
|
47
|
+
@backward_trie = backward_trie
|
48
|
+
end
|
49
|
+
|
50
|
+
def should_break?(cursor)
|
51
|
+
idx = cursor.position
|
52
|
+
|
53
|
+
# consider case when a space follows the '.' (so we handle e.g. "Mr. Brown")
|
54
|
+
idx -= 2 if cursor.codepoint(idx - 1) == 32
|
55
|
+
node = backward_trie.root
|
56
|
+
|
57
|
+
found = loop do
|
58
|
+
break false if idx < 0 || idx >= cursor.length
|
59
|
+
node = node.child(cursor.codepoint(idx))
|
60
|
+
break false unless node
|
61
|
+
break true if node.value
|
62
|
+
idx -= 1
|
63
|
+
end
|
64
|
+
|
65
|
+
return true unless found
|
66
|
+
|
67
|
+
node = forward_trie.root
|
68
|
+
|
69
|
+
loop do
|
70
|
+
return true if idx >= cursor.length
|
71
|
+
node = node.child(cursor.codepoint(idx))
|
72
|
+
return true unless node
|
73
|
+
return false if node.value
|
74
|
+
idx += 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|