twitter_cldr 5.2.0 → 5.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -4
- data/Rakefile +19 -8
- data/lib/twitter_cldr/normalization.rb +18 -5
- data/lib/twitter_cldr/resources.rb +3 -1
- data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
- data/lib/twitter_cldr/resources/loader.rb +22 -1
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
- data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
- data/lib/twitter_cldr/segmentation.rb +25 -10
- data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
- data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
- data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
- data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
- data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
- data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
- data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
- data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
- data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
- data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
- data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
- data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
- data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
- data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
- data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
- data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
- data/lib/twitter_cldr/shared.rb +1 -0
- data/lib/twitter_cldr/shared/caser.rb +3 -3
- data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
- data/lib/twitter_cldr/utils/range_set.rb +10 -1
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/km.yml +82 -0
- data/resources/collation/tailoring/lo.yml +4 -0
- data/resources/collation/tailoring/my.yml +940 -0
- data/resources/collation/tries/km.dump +0 -0
- data/resources/collation/tries/lo.dump +0 -0
- data/resources/collation/tries/my.dump +0 -0
- data/resources/locales/km/calendars.yml +373 -0
- data/resources/locales/km/currencies.yml +654 -0
- data/resources/locales/km/day_periods.yml +96 -0
- data/resources/locales/km/fields.yml +495 -0
- data/resources/locales/km/languages.yml +397 -0
- data/resources/locales/km/layout.yml +5 -0
- data/resources/locales/km/lists.yml +37 -0
- data/resources/locales/km/numbers.yml +402 -0
- data/resources/locales/km/plural_rules.yml +6 -0
- data/resources/locales/km/plurals.yml +12 -0
- data/resources/locales/km/rbnf.yml +131 -0
- data/resources/locales/km/territories.yml +267 -0
- data/resources/locales/km/timezones.yml +1471 -0
- data/resources/locales/km/units.yml +721 -0
- data/resources/locales/lo/calendars.yml +368 -0
- data/resources/locales/lo/currencies.yml +918 -0
- data/resources/locales/lo/day_periods.yml +96 -0
- data/resources/locales/lo/fields.yml +437 -0
- data/resources/locales/lo/languages.yml +529 -0
- data/resources/locales/lo/layout.yml +5 -0
- data/resources/locales/lo/lists.yml +42 -0
- data/resources/locales/lo/numbers.yml +476 -0
- data/resources/locales/lo/plural_rules.yml +7 -0
- data/resources/locales/lo/plurals.yml +14 -0
- data/resources/locales/lo/rbnf.yml +119 -0
- data/resources/locales/lo/territories.yml +265 -0
- data/resources/locales/lo/timezones.yml +1513 -0
- data/resources/locales/lo/units.yml +750 -0
- data/resources/locales/my/calendars.yml +374 -0
- data/resources/locales/my/currencies.yml +697 -0
- data/resources/locales/my/day_periods.yml +96 -0
- data/resources/locales/my/fields.yml +459 -0
- data/resources/locales/my/languages.yml +420 -0
- data/resources/locales/my/layout.yml +5 -0
- data/resources/locales/my/lists.yml +43 -0
- data/resources/locales/my/numbers.yml +417 -0
- data/resources/locales/my/plural_rules.yml +6 -0
- data/resources/locales/my/plurals.yml +12 -0
- data/resources/locales/my/rbnf.yml +145 -0
- data/resources/locales/my/territories.yml +265 -0
- data/resources/locales/my/timezones.yml +1479 -0
- data/resources/locales/my/units.yml +759 -0
- data/resources/locales/th/plurals.yml +1 -1
- data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
- data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
- data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
- data/resources/shared/segments/dictionaries/laodict.dump +0 -0
- data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
- data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
- data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
- data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
- data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
- data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
- data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
- data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
- data/resources/shared/segments/tests/line_break_test.yml +68 -68
- data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
- data/resources/supported_locales.yml +3 -0
- data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
- data/spec/segmentation/dictionary_break_spec.rb +42 -0
- data/spec/segmentation/rule_set_spec.rb +3 -1
- data/spec/timezones/tests/km.yml +12475 -0
- data/spec/timezones/tests/lo.yml +12475 -0
- data/spec/timezones/tests/my.yml +12475 -0
- metadata +87 -3
@@ -26,19 +26,10 @@ module TwitterCldr
|
|
26
26
|
)
|
27
27
|
end
|
28
28
|
|
29
|
-
def each_boundary(
|
30
|
-
return to_enum(__method__,
|
29
|
+
def each_boundary(cursor, stop = cursor.length)
|
30
|
+
return to_enum(__method__, cursor, stop) unless block_given?
|
31
31
|
|
32
|
-
cursor
|
33
|
-
|
34
|
-
# Let the state machine find the first boundary for the line
|
35
|
-
# boundary type. This helps pass nearly all the Unicode
|
36
|
-
# segmentation tests, so it must be the right thing to do.
|
37
|
-
# Normally the first boundary is the implicit start of text
|
38
|
-
# boundary, but potentially not for the line rules?
|
39
|
-
yield 0 unless state_machine.boundary_type == 'line'
|
40
|
-
|
41
|
-
until cursor.eos?
|
32
|
+
until cursor.position >= stop || cursor.eos?
|
42
33
|
state_machine.handle_next(cursor)
|
43
34
|
yield cursor.position if suppressions.should_break?(cursor)
|
44
35
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Segmentation
|
8
|
+
class SegmentIterator
|
9
|
+
attr_reader :rule_set
|
10
|
+
|
11
|
+
def initialize(rule_set)
|
12
|
+
@rule_set = rule_set
|
13
|
+
end
|
14
|
+
|
15
|
+
def each_segment(str)
|
16
|
+
return to_enum(__method__, str) unless block_given?
|
17
|
+
|
18
|
+
each_boundary(str).each_cons(2) do |start, stop|
|
19
|
+
yield str[start...stop], start, stop
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def each_boundary(str, &block)
|
24
|
+
return to_enum(__method__, str) unless block_given?
|
25
|
+
|
26
|
+
# implicit start of text boundary
|
27
|
+
yield 0
|
28
|
+
|
29
|
+
cursor = create_cursor(str)
|
30
|
+
rule_set.each_boundary(cursor, &block)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def create_cursor(str)
|
36
|
+
Cursor.new(str)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -70,7 +70,7 @@ module TwitterCldr
|
|
70
70
|
def handle_next(cursor)
|
71
71
|
result = initial_position = cursor.position
|
72
72
|
state = START_STATE
|
73
|
-
row = row_index_for(state)
|
73
|
+
row = state * (metadata.category_count + 4)
|
74
74
|
category = 3
|
75
75
|
mode = :run
|
76
76
|
|
@@ -97,7 +97,7 @@ module TwitterCldr
|
|
97
97
|
end
|
98
98
|
|
99
99
|
state = ftable[row + NEXT_STATES + category]
|
100
|
-
row = row_index_for(state)
|
100
|
+
row = state * (metadata.category_count + 4)
|
101
101
|
|
102
102
|
if ftable[row + ACCEPTING] == -1
|
103
103
|
# match found
|
@@ -114,12 +114,6 @@ module TwitterCldr
|
|
114
114
|
|
115
115
|
result
|
116
116
|
end
|
117
|
-
|
118
|
-
private
|
119
|
-
|
120
|
-
def row_index_for(state)
|
121
|
-
state * (metadata.category_count + 4)
|
122
|
-
end
|
123
117
|
end
|
124
118
|
end
|
125
119
|
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'singleton'
|
7
|
+
require 'forwardable'
|
8
|
+
|
9
|
+
module TwitterCldr
|
10
|
+
module Segmentation
|
11
|
+
|
12
|
+
# See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java
|
13
|
+
class ThaiBreakEngine
|
14
|
+
|
15
|
+
include Singleton
|
16
|
+
extend Forwardable
|
17
|
+
|
18
|
+
def_delegators :engine, :each_boundary
|
19
|
+
|
20
|
+
def self.word_set
|
21
|
+
@word_set ||= begin
|
22
|
+
uset = TwitterCldr::Shared::UnicodeSet.new
|
23
|
+
uset.apply_pattern('[[:Thai:]&[:Line_Break=SA:]]')
|
24
|
+
uset.to_set
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# elision character
|
29
|
+
THAI_PAIYANNOI = 0x0E2F
|
30
|
+
|
31
|
+
# repeat character
|
32
|
+
THAI_MAIYAMOK = 0x0E46
|
33
|
+
|
34
|
+
def each_boundary(*args, &block)
|
35
|
+
engine.each_boundary(*args, &block)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def engine
|
41
|
+
@engine ||= BrahmicBreakEngine.new(
|
42
|
+
# How many words in a row are "good enough"?
|
43
|
+
lookahead: 3,
|
44
|
+
|
45
|
+
# Will not combine a non-word with a preceding dictionary word longer than this
|
46
|
+
root_combine_threshold: 3,
|
47
|
+
|
48
|
+
# Will not combine a non-word that shares at least this much prefix with a
|
49
|
+
# dictionary word with a preceding word
|
50
|
+
prefix_combine_threshold: 3,
|
51
|
+
|
52
|
+
# Minimum word size
|
53
|
+
min_word: 2,
|
54
|
+
|
55
|
+
# Minimum number of characters for two words (min_word * 2)
|
56
|
+
min_word_span: 4,
|
57
|
+
|
58
|
+
word_set: self.class.word_set,
|
59
|
+
mark_set: mark_set,
|
60
|
+
end_word_set: end_word_set,
|
61
|
+
begin_word_set: begin_word_set,
|
62
|
+
dictionary: Dictionary.thai,
|
63
|
+
advance_past_suffix: -> (*args) do
|
64
|
+
advance_past_suffix(*args)
|
65
|
+
end
|
66
|
+
)
|
67
|
+
end
|
68
|
+
|
69
|
+
def advance_past_suffix(cursor, end_pos, state)
|
70
|
+
suffix_length = 0
|
71
|
+
|
72
|
+
if cursor.position < end_pos && state.word_length > 0
|
73
|
+
uc = cursor.codepoint
|
74
|
+
|
75
|
+
candidates = state.words[state.words_found].candidates(
|
76
|
+
cursor, engine.dictionary, end_pos
|
77
|
+
)
|
78
|
+
|
79
|
+
if candidates <= 0 && suffix_set.include?(uc)
|
80
|
+
if uc == THAI_PAIYANNOI
|
81
|
+
unless suffix_set.include?(cursor.previous)
|
82
|
+
# skip over previous end and PAIYANNOI
|
83
|
+
cursor.advance(2)
|
84
|
+
suffix_length += 1
|
85
|
+
uc = cursor.codepoint
|
86
|
+
else
|
87
|
+
# restore prior position
|
88
|
+
cursor.advance
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
if uc == THAI_MAIYAMOK
|
93
|
+
if cursor.previous != THAI_MAIYAMOK
|
94
|
+
# skip over previous end and MAIYAMOK
|
95
|
+
cursor.advance(2)
|
96
|
+
suffix_length += 1
|
97
|
+
else
|
98
|
+
# restore prior position
|
99
|
+
cursor.advance
|
100
|
+
end
|
101
|
+
end
|
102
|
+
else
|
103
|
+
cursor.position = state.current + state.word_length
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
suffix_length
|
108
|
+
end
|
109
|
+
|
110
|
+
def mark_set
|
111
|
+
@mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
|
112
|
+
set.apply_pattern('[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
|
113
|
+
set.add(0x0020)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
def end_word_set
|
118
|
+
@end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
|
119
|
+
set.add_list(self.class.word_set)
|
120
|
+
set.subtract(0x0E31) # MAI HAN-AKAT
|
121
|
+
set.subtract_range(0x0E40..0x0E44) # SARA E through SARA AI MAIMALAI
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
def begin_word_set
|
126
|
+
@begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
|
127
|
+
set.add_range(0x0E01..0x0E2E) # KO KAI through HO NOKHUK
|
128
|
+
set.add_range(0x0E40..0x0E44) # SARA E through SARA AI MAIMALAI
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
def suffix_set
|
133
|
+
@suffix_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
|
134
|
+
set.add(THAI_PAIYANNOI)
|
135
|
+
set.add(THAI_MAIYAMOK)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'singleton'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class UnhandledBreakEngine
|
11
|
+
|
12
|
+
include Singleton
|
13
|
+
|
14
|
+
def each_boundary(cursor, &block)
|
15
|
+
return to_enum(__method__, cursor) unless block_given?
|
16
|
+
cursor.advance
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
module TwitterCldr
|
9
|
+
module Segmentation
|
10
|
+
class DeormalizedStringError < StandardError; end
|
11
|
+
|
12
|
+
class WordIterator < SegmentIterator
|
13
|
+
DICTIONARY_BREAK_ENGINES = [
|
14
|
+
CjBreakEngine,
|
15
|
+
KoreanBreakEngine,
|
16
|
+
BurmeseBreakEngine,
|
17
|
+
KhmerBreakEngine,
|
18
|
+
LaoBreakEngine,
|
19
|
+
ThaiBreakEngine
|
20
|
+
]
|
21
|
+
|
22
|
+
def each_boundary(str, &block)
|
23
|
+
return to_enum(__method__, str) unless block_given?
|
24
|
+
|
25
|
+
# Rather than put a bunch of duplicate logic in
|
26
|
+
# each_boundary_helper to make sure we don't yield the same
|
27
|
+
# boundary twice, we wrap it in this additional de-duping
|
28
|
+
# enumerator and call it a day.
|
29
|
+
last_boundary = nil
|
30
|
+
|
31
|
+
each_boundary_helper(str) do |boundary|
|
32
|
+
yield boundary if boundary != last_boundary
|
33
|
+
last_boundary = boundary
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def each_boundary_helper(str, &block)
|
40
|
+
# Set up two independent cursors so the algorithm can iterate
|
41
|
+
# over those portions of the input string that require a
|
42
|
+
# dictionary-based break iterator independently from those that
|
43
|
+
# only need the normal, rule-based break iterator. Cursors
|
44
|
+
# hold references to the input text and a list of all the
|
45
|
+
# corresponding Unicode codepoints, meaning they are fairly
|
46
|
+
# expensive to create. The duplication below should only
|
47
|
+
# produce a shallow copy however. The text and codepoint list
|
48
|
+
# are not duplicated, but the cursor's integer position can
|
49
|
+
# be varied independently.
|
50
|
+
dict_cursor = create_cursor(str)
|
51
|
+
rule_cursor = dict_cursor.dup
|
52
|
+
|
53
|
+
# implicit start of text boundary
|
54
|
+
yield 0
|
55
|
+
|
56
|
+
until dict_cursor.eos? || rule_cursor.eos?
|
57
|
+
# We use a regex to identify the beginnings of potential runs
|
58
|
+
# of dictionary characters. This regex was benchmarked and
|
59
|
+
# found to be pretty fast, but could become a bottleneck if
|
60
|
+
# other parts of the algorithm are improved in the future.
|
61
|
+
m = dictionary_re.match(dict_cursor.text, dict_cursor.position)
|
62
|
+
break unless m
|
63
|
+
|
64
|
+
dict_cursor.position = m.begin(0)
|
65
|
+
dict_break_engine = dictionary_break_engine_for(dict_cursor.codepoint)
|
66
|
+
|
67
|
+
# It's possible to encounter a dictionary character that can't
|
68
|
+
# be handled by any of the dictionary-based break engines
|
69
|
+
# because it's too short to make up an actual word. The
|
70
|
+
# break engine will simply yield no breaks in such a case, which
|
71
|
+
# we test for below by peeking for the first boundary value and
|
72
|
+
# rescuing a StopIteration error. Since the run of dictionary
|
73
|
+
# characters may be arbitrarily long, peeking should be more
|
74
|
+
# performant than attempting to calculate all the boundary
|
75
|
+
# positions for the run at once.
|
76
|
+
#
|
77
|
+
# It should be noted that, despite our best efforts here in
|
78
|
+
# WordIterator, certain dictionary-based break engines (e.g.
|
79
|
+
# CjBreakEngine) cannot yield word boundaries without first
|
80
|
+
# examining the entire run of dictionary characters. In practice
|
81
|
+
# this shouldn't be too big an issue, since Chinese text often
|
82
|
+
# contains punctuation that should limit the average run length.
|
83
|
+
dict_enum = dict_break_engine.each_boundary(dict_cursor)
|
84
|
+
|
85
|
+
dict_boundary = begin
|
86
|
+
dict_enum.peek
|
87
|
+
rescue StopIteration
|
88
|
+
nil
|
89
|
+
end
|
90
|
+
|
91
|
+
# If a dictionary boundary was found, attempt to use the rule-based
|
92
|
+
# break iterator to find breaks in the text immediately before it.
|
93
|
+
# Otherwise, since none of the dictionary-based break engines could
|
94
|
+
# find any boundaries in the current run, we advance the dictionary
|
95
|
+
# cursor in an attempt to find the next dictionary boundary. Doing
|
96
|
+
# so effectively causes the algorithm to fall back to the rule-based
|
97
|
+
# break engine.
|
98
|
+
if dict_boundary
|
99
|
+
# Only use the rule-based break engine if there are characters to
|
100
|
+
# process.
|
101
|
+
if rule_cursor.position < m.begin(0)
|
102
|
+
rule_set.each_boundary(rule_cursor, m.begin(0), &block)
|
103
|
+
end
|
104
|
+
|
105
|
+
# Yield all the dictionary breaks from the enum. We can't use .each
|
106
|
+
# here because that will restart the iteration. Ruby's loop
|
107
|
+
# construct automatically rescues StopIteration.
|
108
|
+
loop do
|
109
|
+
yield dict_enum.next
|
110
|
+
end
|
111
|
+
|
112
|
+
# We've reached the end of a dictionary character run, so yield
|
113
|
+
# the end of text boundary.
|
114
|
+
yield dict_cursor.position
|
115
|
+
|
116
|
+
# These should be the same after a successful dictionary run, i.e.
|
117
|
+
# they should both be positioned at the end of the current rule-based
|
118
|
+
# and dictionary-based portions of the run, ready for the next one.
|
119
|
+
rule_cursor.position = dict_cursor.position
|
120
|
+
else
|
121
|
+
dict_cursor.advance
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Find boundaries in the straggler, non-dictionary run at the end of
|
126
|
+
# the input text.
|
127
|
+
unless rule_cursor.eos?
|
128
|
+
rule_set.each_boundary(rule_cursor, &block)
|
129
|
+
end
|
130
|
+
|
131
|
+
# implicit end of text boundary
|
132
|
+
yield rule_cursor.length
|
133
|
+
end
|
134
|
+
|
135
|
+
# all dictionary characters, i.e. characters that must be handled
|
136
|
+
# by one of the dictionary-based break engines
|
137
|
+
def dictionary_set
|
138
|
+
@@dictionary_set ||= Set.new.tap do |set|
|
139
|
+
DICTIONARY_BREAK_ENGINES.each do |break_engine|
|
140
|
+
set.merge(break_engine.word_set)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
def dictionary_break_engine_for(codepoint)
|
146
|
+
codepoint_to_engine_cache[codepoint] ||= begin
|
147
|
+
engine = DICTIONARY_BREAK_ENGINES.find do |break_engine|
|
148
|
+
break_engine.word_set.include?(codepoint)
|
149
|
+
end
|
150
|
+
|
151
|
+
(engine || UnhandledBreakEngine).instance
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
def dictionary_re
|
156
|
+
@@dictionary_re ||= begin
|
157
|
+
ranges = TwitterCldr::Utils::RangeSet.from_array(dictionary_set).ranges.map do |r|
|
158
|
+
"\\u{#{r.first.to_s(16)}}-\\u{#{r.last.to_s(16)}}"
|
159
|
+
end
|
160
|
+
|
161
|
+
/[#{ranges.join}]/
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def codepoint_to_engine_cache
|
166
|
+
@@codepoint_to_engine_cache ||= {}
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|
data/lib/twitter_cldr/shared.rb
CHANGED
@@ -32,6 +32,7 @@ module TwitterCldr
|
|
32
32
|
autoload :TerritoriesContainment, 'twitter_cldr/shared/territories_containment'
|
33
33
|
autoload :Territory, 'twitter_cldr/shared/territory'
|
34
34
|
autoload :UnicodeRegex, 'twitter_cldr/shared/unicode_regex'
|
35
|
+
autoload :UnicodeSet, 'twitter_cldr/shared/unicode_set'
|
35
36
|
autoload :Unit, 'twitter_cldr/shared/unit'
|
36
37
|
end
|
37
38
|
end
|
@@ -25,7 +25,7 @@ module TwitterCldr
|
|
25
25
|
# and the following word boundary to Lowercase_Mapping(C).
|
26
26
|
def titlecase(string)
|
27
27
|
string.dup.tap do |result|
|
28
|
-
|
28
|
+
word_iterator.each_word(result) do |_, *boundary_pair|
|
29
29
|
if cased_pos = first_cased(string, *boundary_pair)
|
30
30
|
result[cased_pos] = titlecasing_hash[result[cased_pos]]
|
31
31
|
|
@@ -47,8 +47,8 @@ module TwitterCldr
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
def
|
51
|
-
@
|
50
|
+
def word_iterator
|
51
|
+
@word_iterator ||= Segmentation::BreakIterator.new(:en)
|
52
52
|
end
|
53
53
|
|
54
54
|
def cased?(char)
|