twitter_cldr 5.1.0 → 5.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +5 -5
- data/lib/twitter_cldr.rb +1 -0
- data/lib/twitter_cldr/resources.rb +2 -8
- data/lib/twitter_cldr/resources/loader.rb +6 -4
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
- data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
- data/lib/twitter_cldr/segmentation.rb +10 -8
- data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
- data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
- data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
- data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
- data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
- data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
- data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
- data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
- data/lib/twitter_cldr/shared/caser.rb +1 -1
- data/lib/twitter_cldr/shared/locale.rb +6 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/segments/rules/el/sentence.yml +723 -0
- data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
- data/resources/shared/segments/rules/ja/line.yml +964 -0
- data/resources/shared/segments/rules/ja/word.yml +527 -0
- data/resources/shared/segments/rules/root/grapheme.yml +463 -0
- data/resources/shared/segments/rules/root/line.yml +964 -0
- data/resources/shared/segments/rules/root/sentence.yml +723 -0
- data/resources/shared/segments/rules/root/word.yml +527 -0
- data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
- data/resources/shared/segments/rules/zh/line.yml +964 -0
- data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
- data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
- data/resources/shared/segments/tests/line_break_test.yml +7348 -0
- data/resources/uli/segments/de.yml +5 -230
- data/resources/uli/segments/en.yml +3 -154
- data/resources/uli/segments/es.yml +5 -145
- data/resources/uli/segments/fr.yml +5 -68
- data/resources/uli/segments/it.yml +3 -48
- data/resources/uli/segments/pt.yml +5 -173
- data/resources/uli/segments/ru.yml +3 -10
- data/spec/segmentation/rule_set_spec.rb +54 -27
- metadata +29 -9
- data/lib/twitter_cldr/resources/uli.rb +0 -12
- data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
- data/lib/twitter_cldr/segmentation/parser.rb +0 -71
- data/lib/twitter_cldr/segmentation/rule.rb +0 -79
- data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
- data/resources/shared/segments/segments_root.yml +0 -869
- data/spec/segmentation/parser_spec.rb +0 -104
@@ -6,10 +6,12 @@
|
|
6
6
|
module TwitterCldr
|
7
7
|
module Segmentation
|
8
8
|
class Cursor
|
9
|
-
attr_reader :text, :
|
9
|
+
attr_reader :text, :codepoints
|
10
|
+
attr_accessor :position
|
10
11
|
|
11
12
|
def initialize(text)
|
12
13
|
@text = text
|
14
|
+
@codepoints = text.codepoints
|
13
15
|
reset
|
14
16
|
end
|
15
17
|
|
@@ -19,15 +21,18 @@ module TwitterCldr
|
|
19
21
|
|
20
22
|
def reset
|
21
23
|
@position = 0
|
22
|
-
@match_cache = {}
|
23
24
|
end
|
24
25
|
|
25
|
-
def
|
26
|
+
def eos?
|
26
27
|
position >= text.size
|
27
28
|
end
|
28
29
|
|
29
|
-
def
|
30
|
-
|
30
|
+
def codepoint(pos = position)
|
31
|
+
codepoints[pos]
|
32
|
+
end
|
33
|
+
|
34
|
+
def length
|
35
|
+
text.length
|
31
36
|
end
|
32
37
|
end
|
33
38
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Segmentation
|
8
|
+
class Metadata
|
9
|
+
attr_reader :values
|
10
|
+
|
11
|
+
def initialize(values)
|
12
|
+
@values = values
|
13
|
+
end
|
14
|
+
|
15
|
+
def category_count
|
16
|
+
@category_count ||= values[:category_count]
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'singleton'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class NullSuppressions
|
11
|
+
include Singleton
|
12
|
+
|
13
|
+
def should_break?(_cursor)
|
14
|
+
true
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -8,109 +8,53 @@ module TwitterCldr
|
|
8
8
|
class RuleSet
|
9
9
|
|
10
10
|
class << self
|
11
|
-
def
|
12
|
-
|
11
|
+
def create(locale, boundary_type, options = {})
|
12
|
+
new(locale, StateMachine.instance(boundary_type, locale), options)
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
attr_reader :locale, :
|
16
|
+
attr_reader :locale, :state_machine
|
17
17
|
attr_accessor :use_uli_exceptions
|
18
18
|
|
19
19
|
alias_method :use_uli_exceptions?, :use_uli_exceptions
|
20
20
|
|
21
|
-
def initialize(locale,
|
21
|
+
def initialize(locale, state_machine, options)
|
22
22
|
@locale = locale
|
23
|
-
@
|
24
|
-
@boundary_type = boundary_type
|
23
|
+
@state_machine = state_machine
|
25
24
|
@use_uli_exceptions = options.fetch(
|
26
25
|
:use_uli_exceptions, false
|
27
26
|
)
|
28
27
|
end
|
29
28
|
|
30
29
|
def each_boundary(str)
|
31
|
-
|
32
|
-
cursor = Cursor.new(str)
|
33
|
-
last_boundary = 0
|
30
|
+
return to_enum(__method__, str) unless block_given?
|
34
31
|
|
35
|
-
|
36
|
-
yield 0
|
32
|
+
cursor = Cursor.new(str)
|
37
33
|
|
38
|
-
|
39
|
-
|
40
|
-
|
34
|
+
# Let the state machine find the first boundary for the line
|
35
|
+
# boundary type. This helps pass nearly all the Unicode
|
36
|
+
# segmentation tests, so it must be the right thing to do.
|
37
|
+
# Normally the first boundary is the implicit start of text
|
38
|
+
# boundary, but potentially not for the line rules?
|
39
|
+
yield 0 unless state_machine.boundary_type == 'line'
|
41
40
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
if match.boundary_position == cursor.position
|
48
|
-
cursor.advance
|
49
|
-
else
|
50
|
-
cursor.advance(
|
51
|
-
match.boundary_position - cursor.position
|
52
|
-
)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
# implicit end of text boundary
|
57
|
-
yield str.size unless last_boundary == str.size
|
58
|
-
else
|
59
|
-
to_enum(__method__, str)
|
41
|
+
until cursor.eos?
|
42
|
+
state_machine.handle_next(cursor)
|
43
|
+
yield cursor.position if suppressions.should_break?(cursor)
|
60
44
|
end
|
61
45
|
end
|
62
46
|
|
63
|
-
|
64
|
-
|
65
|
-
def each_rule(&block)
|
66
|
-
if block_given?
|
67
|
-
if use_uli_exceptions? && supports_exceptions?
|
68
|
-
yield exception_rule
|
69
|
-
end
|
70
|
-
|
71
|
-
rules.each(&block)
|
72
|
-
else
|
73
|
-
to_enum(__method__)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
def exception_rule
|
78
|
-
@exception_rule ||= RuleSetBuilder.exception_rule_for(
|
79
|
-
locale, boundary_type
|
80
|
-
)
|
81
|
-
end
|
82
|
-
|
83
|
-
def supports_exceptions?
|
84
|
-
boundary_type == 'sentence'
|
47
|
+
def boundary_type
|
48
|
+
state_machine.boundary_type
|
85
49
|
end
|
86
50
|
|
87
|
-
|
88
|
-
match = find_cached_match(cursor)
|
51
|
+
private
|
89
52
|
|
90
|
-
|
91
|
-
|
53
|
+
def suppressions
|
54
|
+
@suppressions ||= if use_uli_exceptions?
|
55
|
+
Suppressions.instance(boundary_type, locale)
|
92
56
|
else
|
93
|
-
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
def find_cached_match(cursor)
|
98
|
-
cursor.match_cache.fetch(cursor.position) do
|
99
|
-
matches = match_all(cursor)
|
100
|
-
|
101
|
-
matches.each do |m|
|
102
|
-
cursor.match_cache[m.boundary_position - 1] ||= m
|
103
|
-
end
|
104
|
-
|
105
|
-
matches.first
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
def match_all(cursor)
|
110
|
-
each_rule.each_with_object([]) do |rule, ret|
|
111
|
-
if match = rule.match(cursor)
|
112
|
-
ret << match
|
113
|
-
end
|
57
|
+
NullSuppressions.instance
|
114
58
|
end
|
115
59
|
end
|
116
60
|
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
require 'singleton'
|
8
|
+
|
9
|
+
module TwitterCldr
|
10
|
+
module Segmentation
|
11
|
+
class StateMachine
|
12
|
+
include Singleton
|
13
|
+
|
14
|
+
START_STATE = 1
|
15
|
+
STOP_STATE = 0
|
16
|
+
NEXT_STATES = 4
|
17
|
+
ACCEPTING = 0
|
18
|
+
|
19
|
+
class << self
|
20
|
+
def instance(boundary_type, locale)
|
21
|
+
resource_path = find_resource(boundary_type, locale)
|
22
|
+
|
23
|
+
cache[resource_path] ||= begin
|
24
|
+
rsrc = TwitterCldr.get_resource(resource_path)
|
25
|
+
|
26
|
+
new(
|
27
|
+
boundary_type,
|
28
|
+
locale,
|
29
|
+
Metadata.new(rsrc[:metadata]),
|
30
|
+
StateTable.load16(rsrc[:forward_table]),
|
31
|
+
StateTable.load16(rsrc[:backward_table]),
|
32
|
+
StatusTable.load(rsrc[:status_table]),
|
33
|
+
CategoryTable.load16(rsrc[:category_table])
|
34
|
+
)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def find_resource(boundary_type, locale)
|
41
|
+
path = TwitterCldr.resource_file_path(
|
42
|
+
['shared', 'segments', 'rules', locale, boundary_type]
|
43
|
+
)
|
44
|
+
|
45
|
+
return path if TwitterCldr.resource_exists?(path)
|
46
|
+
|
47
|
+
TwitterCldr.resource_file_path(
|
48
|
+
['shared', 'segments', 'rules', 'root', boundary_type]
|
49
|
+
)
|
50
|
+
end
|
51
|
+
|
52
|
+
def cache
|
53
|
+
@cache ||= {}
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
attr_reader :boundary_type, :locale
|
58
|
+
attr_reader :metadata, :ftable, :rtable, :status_table, :category_table
|
59
|
+
|
60
|
+
def initialize(boundary_type, locale, metadata, ftable, rtable, status_table, category_table)
|
61
|
+
@boundary_type = boundary_type
|
62
|
+
@locale = locale
|
63
|
+
@metadata = metadata
|
64
|
+
@ftable = ftable
|
65
|
+
@rtable = rtable
|
66
|
+
@status_table = status_table
|
67
|
+
@category_table = category_table
|
68
|
+
end
|
69
|
+
|
70
|
+
def handle_next(cursor)
|
71
|
+
result = initial_position = cursor.position
|
72
|
+
state = START_STATE
|
73
|
+
row = row_index_for(state)
|
74
|
+
category = 3
|
75
|
+
mode = :run
|
76
|
+
|
77
|
+
if ftable.bof_required?
|
78
|
+
category = 2
|
79
|
+
mode = :start
|
80
|
+
end
|
81
|
+
|
82
|
+
until state == STOP_STATE
|
83
|
+
if cursor.eos?
|
84
|
+
break if mode == :stop
|
85
|
+
mode = :stop
|
86
|
+
category = 1
|
87
|
+
elsif mode == :run
|
88
|
+
category = category_table.get(cursor.codepoint)
|
89
|
+
|
90
|
+
if (category & 0x4000) != 0
|
91
|
+
category &= ~0x4000
|
92
|
+
end
|
93
|
+
|
94
|
+
cursor.advance
|
95
|
+
else
|
96
|
+
mode = :run
|
97
|
+
end
|
98
|
+
|
99
|
+
state = ftable[row + NEXT_STATES + category]
|
100
|
+
row = row_index_for(state)
|
101
|
+
|
102
|
+
if ftable[row + ACCEPTING] == -1
|
103
|
+
# match found
|
104
|
+
result = cursor.position
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
cursor.position = result
|
109
|
+
|
110
|
+
# don't let cursor get stuck
|
111
|
+
if cursor.position == initial_position
|
112
|
+
cursor.advance
|
113
|
+
end
|
114
|
+
|
115
|
+
result
|
116
|
+
end
|
117
|
+
|
118
|
+
private
|
119
|
+
|
120
|
+
def row_index_for(state)
|
121
|
+
state * (metadata.category_count + 4)
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class StateTable
|
11
|
+
PACK_FMT_16 = 's!*'.freeze
|
12
|
+
BOF_REQUIRED_FLAG = 2
|
13
|
+
|
14
|
+
class << self
|
15
|
+
def load16(hash)
|
16
|
+
new(
|
17
|
+
Base64.decode64(hash[:table]).unpack(PACK_FMT_16),
|
18
|
+
hash[:flags]
|
19
|
+
)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
attr_reader :values, :flags
|
24
|
+
|
25
|
+
def initialize(values, flags)
|
26
|
+
@values = values
|
27
|
+
@flags = flags
|
28
|
+
end
|
29
|
+
|
30
|
+
def [](idx)
|
31
|
+
values[idx]
|
32
|
+
end
|
33
|
+
|
34
|
+
def bof_required?
|
35
|
+
flags & BOF_REQUIRED_FLAG != 0
|
36
|
+
end
|
37
|
+
|
38
|
+
def dump16
|
39
|
+
{
|
40
|
+
table: Base64.encode64(values.pack(PACK_FMT_16)).strip,
|
41
|
+
flags: flags
|
42
|
+
}
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class StatusTable
|
11
|
+
PACK_FMT = 'I!*'.freeze
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def load(hash)
|
15
|
+
new(Base64.decode64(hash[:table]).unpack(PACK_FMT))
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
attr_reader :values
|
20
|
+
|
21
|
+
def initialize(values)
|
22
|
+
@values = values
|
23
|
+
end
|
24
|
+
|
25
|
+
def dump
|
26
|
+
{ table: Base64.encode64(values.pack(PACK_FMT)).strip }
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'singleton'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class Suppressions
|
11
|
+
include Singleton
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def instance(boundary_type, locale)
|
15
|
+
resource_path = find_resource(boundary_type, locale)
|
16
|
+
return NullSuppressions.instance unless resource_path
|
17
|
+
|
18
|
+
cache[resource_path] ||= begin
|
19
|
+
rsrc = TwitterCldr.get_resource(resource_path)
|
20
|
+
|
21
|
+
new(
|
22
|
+
Marshal.load(rsrc[:forwards_trie]),
|
23
|
+
Marshal.load(rsrc[:backwards_trie])
|
24
|
+
)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def find_resource(boundary_type, locale)
|
31
|
+
path = TwitterCldr.resource_file_path(
|
32
|
+
['shared', 'segments', 'suppressions', locale, boundary_type]
|
33
|
+
)
|
34
|
+
|
35
|
+
path if TwitterCldr.resource_exists?(path)
|
36
|
+
end
|
37
|
+
|
38
|
+
def cache
|
39
|
+
@cache ||= {}
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
attr_reader :forward_trie, :backward_trie
|
44
|
+
|
45
|
+
def initialize(forward_trie, backward_trie)
|
46
|
+
@forward_trie = forward_trie
|
47
|
+
@backward_trie = backward_trie
|
48
|
+
end
|
49
|
+
|
50
|
+
def should_break?(cursor)
|
51
|
+
idx = cursor.position
|
52
|
+
|
53
|
+
# consider case when a space follows the '.' (so we handle i.e. "Mr. Brown")
|
54
|
+
idx -= 2 if cursor.codepoint(idx - 1) == 32
|
55
|
+
node = backward_trie.root
|
56
|
+
|
57
|
+
found = loop do
|
58
|
+
break false if idx < 0 || idx >= cursor.length
|
59
|
+
node = node.child(cursor.codepoint(idx))
|
60
|
+
break false unless node
|
61
|
+
break true if node.value
|
62
|
+
idx -= 1
|
63
|
+
end
|
64
|
+
|
65
|
+
return true unless found
|
66
|
+
|
67
|
+
node = forward_trie.root
|
68
|
+
|
69
|
+
loop do
|
70
|
+
return true if idx >= cursor.length
|
71
|
+
node = node.child(cursor.codepoint(idx))
|
72
|
+
return true unless node
|
73
|
+
return false if node.value
|
74
|
+
idx += 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|