twitter_cldr 5.1.0 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
@@ -1,12 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Resources
8
- module Uli
9
- autoload :SegmentExceptionsImporter, 'twitter_cldr/resources/uli/segment_exceptions_importer'
10
- end
11
- end
12
- end
@@ -1,59 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- require 'fileutils'
7
- require 'open-uri'
8
- require 'json'
9
-
10
- module TwitterCldr
11
- module Resources
12
- module Uli
13
- class SegmentExceptionsImporter < Resources::Importer
14
-
15
- REPO_URL = 'https://github.com/unicode-org/uli.git'.freeze
16
- GIT_SHA = '6acce954b913b121b6ab4bd4f8395e74dce2ae7c'.freeze
17
-
18
- requirement :git, REPO_URL, GIT_SHA
19
- output_path 'uli/segments'
20
- ruby_engine :mri
21
-
22
- def execute
23
- FileUtils.mkdir_p(output_path)
24
- each_file { |file| import_file(file) }
25
- end
26
-
27
- private
28
-
29
- def output_path
30
- params.fetch(:output_path)
31
- end
32
-
33
- def import_file(file)
34
- locale = File.basename(file).chomp('.json')
35
- output_file = File.join(output_path, "#{locale}.yml")
36
- exceptions = JSON.parse(File.read(file))
37
-
38
- File.open(output_file, 'w:utf-8') do |output|
39
- output.write(
40
- TwitterCldr::Utils::YAML.dump(
41
- TwitterCldr::Utils.deep_symbolize_keys(locale => { exceptions: exceptions['data']['abbrs'] }),
42
- use_natural_symbols: true
43
- )
44
- )
45
- end
46
- end
47
-
48
- def each_file(&block)
49
- Dir.glob(File.join(input_path, 'abbrs', 'json', '*.json')).each(&block)
50
- end
51
-
52
- def input_path
53
- requirements[:git].source_path
54
- end
55
-
56
- end
57
- end
58
- end
59
- end
@@ -1,71 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Segmentation
8
- class Parser
9
-
10
- def parse(text, options = {})
11
- left_str, boundary_symbol_str, right_str = text.split(/([÷×])/)
12
- boundary_symbol = boundary_symbol_for(boundary_symbol_str)
13
- left = compile_token_list(tokenize_regex(left_str || ''), options)
14
- right = compile_token_list(tokenize_regex(right_str || ''), options)
15
- klass = class_for(boundary_symbol)
16
- klass.new(left, right)
17
- end
18
-
19
- def tokenize_regex(text)
20
- regex_tokenizer.tokenize(text).reject do |token|
21
- token.value.strip.empty?
22
- end
23
- end
24
-
25
- private
26
-
27
- def boundary_symbol_for(str)
28
- case str
29
- when '÷' then :break
30
- when '×' then :no_break
31
- end
32
- end
33
-
34
- def class_for(boundary_symbol)
35
- case boundary_symbol
36
- when :break
37
- BreakRule
38
- when :no_break
39
- NoBreakRule
40
- end
41
- end
42
-
43
- def compile_token_list(token_list, options)
44
- if token_list.empty?
45
- TwitterCldr::Shared::UnicodeRegex.compile('')
46
- else
47
- parse_regex(token_list, options)
48
- end
49
- end
50
-
51
- def parse_regex(tokens, options)
52
- unless tokens.empty?
53
- TwitterCldr::Shared::UnicodeRegex.new(
54
- regex_parser.parse(tokens, options), 'm'
55
- )
56
- end
57
- end
58
-
59
- def regex_tokenizer
60
- @tokenizer ||=
61
- TwitterCldr::Tokenizers::UnicodeRegexTokenizer.new
62
- end
63
-
64
- def regex_parser
65
- @regex_parser ||=
66
- TwitterCldr::Parsers::UnicodeRegexParser.new
67
- end
68
-
69
- end
70
- end
71
- end
@@ -1,79 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Segmentation
8
- RuleMatchData = Struct.new(
9
- :rule, :boundary_offset, :boundary_position
10
- )
11
-
12
- class Rule
13
-
14
- attr_reader :left, :right
15
- attr_accessor :string, :id
16
-
17
- def initialize(left, right)
18
- @left = left
19
- @right = right
20
- end
21
-
22
- def match(cursor)
23
- left_match = match_side(left, cursor.text, cursor.position)
24
- return nil unless left_match
25
- left_match_offset = offset(left_match, cursor.position)
26
-
27
- right_match = match_side(right, cursor.text, left_match_offset.last)
28
- return nil unless right_match
29
- right_match_offset = offset(right_match, left_match_offset.last)
30
-
31
- offset = [left_match_offset.first, right_match_offset.last]
32
- position = left_match_offset.last
33
-
34
- RuleMatchData.new(self, offset, position)
35
- end
36
-
37
- private
38
-
39
- def offset(match, default)
40
- if match
41
- match.offset(0)
42
- else
43
- [default, default]
44
- end
45
- end
46
-
47
- def match_side(side, text, position)
48
- if side
49
- side_match = side.match(text, position)
50
-
51
- if side_match && side_match.begin(0) == position
52
- side_match
53
- end
54
- end
55
- end
56
- end
57
-
58
- class BreakRule < Rule
59
- def boundary_symbol
60
- :break
61
- end
62
-
63
- def break?
64
- true
65
- end
66
- end
67
-
68
- class NoBreakRule < Rule
69
- def boundary_symbol
70
- :no_break
71
- end
72
-
73
- def break?
74
- false
75
- end
76
- end
77
-
78
- end
79
- end
@@ -1,142 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Segmentation
8
- class RuleSetBuilder
9
-
10
- class << self
11
- def load(locale, boundary_type, options = {})
12
- rules = compile_rules_for(boundary_type)
13
- RuleSet.new(locale, rules, boundary_type, options)
14
- end
15
-
16
- # See the comment above exceptions_for. Basically, we only support exceptions
17
- # for the "sentence" boundary type since the ULI JSON data doesn't distinguish
18
- # between boundary types.
19
- def exception_rule_for(locale, boundary_type)
20
- cache_key = TwitterCldr::Utils.compute_cache_key(locale, boundary_type)
21
- exceptions_cache[cache_key] ||= begin
22
- exceptions = exceptions_for(locale, boundary_type)
23
- regex_contents = exceptions.map { |exc| Regexp.escape(exc) }.join("|")
24
- parse("(?:#{regex_contents}) ×", nil).tap do |rule|
25
- rule.id = 0
26
- end
27
- end
28
- end
29
-
30
- # The implicit final rule is always "Any ÷ Any"
31
- def implicit_final_rule
32
- @implicit_final_rule ||=
33
- parse('. ÷ .', nil).tap do |rule|
34
- rule.id = 9999
35
- end
36
- end
37
-
38
- # The implicit initial rules are always "start-of-text ÷"
39
- # and "÷ end-of-text". We don't need the start-of-text one.
40
- def implicit_end_of_text_rule
41
- @implicit_end_of_text_rule ||=
42
- parse('.\z ÷', nil).tap do |rule|
43
- rule.id = 9998
44
- end
45
- end
46
-
47
- private
48
-
49
- # The boundary_type param is not currently used since the ULI JSON resource that
50
- # exceptions are generated from does not distinguish between boundary types. The
51
- # XML version does, however, so the JSON will hopefully catch up at some point and
52
- # we can make use of this second parameter. For the time being, compile_exception_rule_for
53
- # (which calls this function) assumes a "sentence" boundary type.
54
- def exceptions_for(locale, boundary_type)
55
- exceptions_resource_cache[locale] ||= begin
56
- TwitterCldr.get_resource('uli', 'segments', locale)[locale][:exceptions]
57
- rescue Resources::ResourceLoadError
58
- []
59
- end
60
- end
61
-
62
- def boundary_name_for(str)
63
- str.gsub(/(?:^|\_)([A-Za-z])/) { |s| $1.upcase } + 'Break'
64
- end
65
-
66
- # tokenizes and parses rules from segment_root
67
- def compile_rules_for(boundary_type)
68
- rule_cache[boundary_type] ||= begin
69
- boundary_name = boundary_name_for(boundary_type)
70
- boundary_data = resource_for(boundary_name)
71
- symbol_table = symbol_table_for(boundary_data)
72
- rules_for(boundary_data, symbol_table)
73
- end
74
- end
75
-
76
- def symbol_table_for(boundary_data)
77
- table = TwitterCldr::Parsers::SymbolTable.new
78
- boundary_data[:variables].each do |variable|
79
- id = variable[:id].to_s
80
- tokens = segmentation_parser.tokenize_regex(variable[:value])
81
- # note: variables can be redefined (add replaces if key already exists)
82
- table.add(id, resolve_symbols(tokens, table))
83
- end
84
- table
85
- end
86
-
87
- def resolve_symbols(tokens, symbol_table)
88
- tokens.inject([]) do |ret, token|
89
- if token.type == :variable
90
- ret += symbol_table.fetch(token.value)
91
- else
92
- ret << token
93
- end
94
- ret
95
- end
96
- end
97
-
98
- def rules_for(boundary_data, symbol_table)
99
- boundary_data[:rules].map do |rule|
100
- r = parse(rule[:value], symbol_table)
101
- r.string = rule[:value]
102
- r.id = rule[:id]
103
- r
104
- end
105
- end
106
-
107
- def parse(text, symbol_table)
108
- segmentation_parser.parse(
109
- text, { symbol_table: symbol_table }
110
- )
111
- end
112
-
113
- def resource_for(boundary_name)
114
- root_resource[:segments][boundary_name.to_sym]
115
- end
116
-
117
- def segmentation_parser
118
- @segmentation_parser ||= Segmentation::Parser.new
119
- end
120
-
121
- def root_resource
122
- @root_resource ||= TwitterCldr.get_resource(
123
- 'shared', 'segments', 'segments_root'
124
- )
125
- end
126
-
127
- def rule_cache
128
- @rule_cache ||= {}
129
- end
130
-
131
- def exceptions_resource_cache
132
- @exceptions_resource_cache ||= {}
133
- end
134
-
135
- def exceptions_cache
136
- @exceptions_cache ||= {}
137
- end
138
- end
139
-
140
- end
141
- end
142
- end
@@ -1,869 +0,0 @@
1
- ---
2
- :segments:
3
- :GraphemeClusterBreak:
4
- :rules:
5
- -
6
- :id: 3
7
- :value: " $CR × $LF "
8
- -
9
- :id: 4
10
- :value: " ( $Control | $CR | $LF ) ÷ "
11
- -
12
- :id: 5
13
- :value: " ÷ ( $Control | $CR | $LF ) "
14
- -
15
- :id: 6
16
- :value: " $L × ( $L | $V | $LV | $LVT ) "
17
- -
18
- :id: 7
19
- :value: " ( $LV | $V ) × ( $V | $T ) "
20
- -
21
- :id: 8
22
- :value: " ( $LVT | $T) × $T "
23
- -
24
- :id: 9
25
- :value: " × ($Extend | $ZWJ) "
26
- -
27
- :id: 9.1
28
- :value: " × $SpacingMark "
29
- -
30
- :id: 9.2
31
- :value: " $Prepend × "
32
- -
33
- :id: 9.3
34
- :value: " $LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* × $LinkingConsonant "
35
- -
36
- :id: 11
37
- :value: " $ExtPict $Extend* $ZWJ × $ExtPict "
38
- -
39
- :id: 12
40
- :value: " ^ ($RI $RI)* $RI × $RI "
41
- -
42
- :id: 13
43
- :value: " [^$RI] ($RI $RI)* $RI × $RI "
44
- :variables:
45
- -
46
- :id: $CR
47
- :value: "\\p{Grapheme_Cluster_Break=CR}"
48
- -
49
- :id: $LF
50
- :value: "\\p{Grapheme_Cluster_Break=LF}"
51
- -
52
- :id: $Control
53
- :value: "\\p{Grapheme_Cluster_Break=Control}"
54
- -
55
- :id: $Extend
56
- :value: "\\p{Grapheme_Cluster_Break=Extend}"
57
- -
58
- :id: $ZWJ
59
- :value: "\\p{Grapheme_Cluster_Break=ZWJ}"
60
- -
61
- :id: $RI
62
- :value: "\\p{Grapheme_Cluster_Break=Regional_Indicator}"
63
- -
64
- :id: $Prepend
65
- :value: "\\p{Grapheme_Cluster_Break=Prepend}"
66
- -
67
- :id: $SpacingMark
68
- :value: "\\p{Grapheme_Cluster_Break=SpacingMark}"
69
- -
70
- :id: $L
71
- :value: "\\p{Grapheme_Cluster_Break=L}"
72
- -
73
- :id: $V
74
- :value: "\\p{Grapheme_Cluster_Break=V}"
75
- -
76
- :id: $T
77
- :value: "\\p{Grapheme_Cluster_Break=T}"
78
- -
79
- :id: $LV
80
- :value: "\\p{Grapheme_Cluster_Break=LV}"
81
- -
82
- :id: $LVT
83
- :value: "\\p{Grapheme_Cluster_Break=LVT}"
84
- -
85
- :id: $Virama
86
- :value: "[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]"
87
- -
88
- :id: $LinkingConsonant
89
- :value: "[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]"
90
- -
91
- :id: $ExtPict
92
- :value: "\\p{Extended_Pictographic}"
93
- -
94
- :id: $ExtCccZwj
95
- :value: "[[$Extend-\\p{ccc=0}] $ZWJ]"
96
- :LineBreak:
97
- :rules:
98
- -
99
- :id: 4
100
- :value: " $BK ÷ "
101
- -
102
- :id: 5.01
103
- :value: " $CR × $LF "
104
- -
105
- :id: 5.02
106
- :value: " $CR ÷ "
107
- -
108
- :id: 5.03
109
- :value: " $LF ÷ "
110
- -
111
- :id: 5.04
112
- :value: " $NL ÷ "
113
- -
114
- :id: 6
115
- :value: " × ( $BK | $CR | $LF | $NL ) "
116
- -
117
- :id: 7.01
118
- :value: " × $SP "
119
- -
120
- :id: 7.02
121
- :value: " × $ZW "
122
- -
123
- :id: 8
124
- :value: " $ZW $SP* ÷ "
125
- -
126
- :id: 8.1
127
- :value: " $ZWJ_O × "
128
- -
129
- :id: 9
130
- :value: " $Spec2_ × $CM "
131
- -
132
- :id: 11.01
133
- :value: " × $WJ "
134
- -
135
- :id: 11.02
136
- :value: " $WJ × "
137
- -
138
- :id: 12
139
- :value: " $GL × "
140
- -
141
- :id: 12.1
142
- :value: " $Spec3a_ × $GL "
143
- -
144
- :id: 12.2
145
- :value: " $Spec3b_ $CM+ × $GL "
146
- -
147
- :id: 12.3
148
- :value: " ^ $CM+ × $GL "
149
- -
150
- :id: 13.01
151
- :value: " × $EX "
152
- -
153
- :id: 13.02
154
- :value: " $Spec4_ × ($CL | $CP | $IS | $SY) "
155
- -
156
- :id: 13.03
157
- :value: " $Spec4_ $CM+ × ($CL | $CP | $IS | $SY) "
158
- -
159
- :id: 13.04
160
- :value: " ^ $CM+ × ($CL | $CP | $IS | $SY) "
161
- -
162
- :id: 14
163
- :value: " $OP $SP* × "
164
- -
165
- :id: 15
166
- :value: " $QU $SP* × $OP "
167
- -
168
- :id: 16
169
- :value: " ($CL | $CP) $SP* × $NS "
170
- -
171
- :id: 17
172
- :value: " $B2 $SP* × $B2 "
173
- -
174
- :id: 18
175
- :value: " $SP ÷ "
176
- -
177
- :id: 19.01
178
- :value: " × $QU "
179
- -
180
- :id: 19.02
181
- :value: " $QU × "
182
- -
183
- :id: 20.01
184
- :value: " ÷ $CB "
185
- -
186
- :id: 20.02
187
- :value: " $CB ÷ "
188
- -
189
- :id: 20.09
190
- :value: " $Spec5_ $HY × $AL "
191
- -
192
- :id: 21.01
193
- :value: " × $BA "
194
- -
195
- :id: 21.02
196
- :value: " × $HY "
197
- -
198
- :id: 21.03
199
- :value: " × $NS "
200
- -
201
- :id: 21.04
202
- :value: " $BB × "
203
- -
204
- :id: 21.1
205
- :value: " $HL ($HY | $BA) × "
206
- -
207
- :id: 21.2
208
- :value: " $SY × $HL "
209
- -
210
- :id: 22.01
211
- :value: " ($AL | $HL) × $IN "
212
- -
213
- :id: 22.02
214
- :value: " $EX × $IN "
215
- -
216
- :id: 22.03
217
- :value: " ($ID | $EB | $EM) × $IN "
218
- -
219
- :id: 22.04
220
- :value: " $IN × $IN "
221
- -
222
- :id: 22.05
223
- :value: " $NU × $IN "
224
- -
225
- :id: 23.02
226
- :value: " ($AL | $HL) × $NU "
227
- -
228
- :id: 23.03
229
- :value: " $NU × ($AL | $HL) "
230
- -
231
- :id: 23.12
232
- :value: " $PR × ($ID | $EB | $EM) "
233
- -
234
- :id: 23.13
235
- :value: " ($ID | $EB | $EM) × $PO "
236
- -
237
- :id: 24.02
238
- :value: " ($PR | $PO) × ($AL | $HL) "
239
- -
240
- :id: 24.03
241
- :value: " ($AL | $HL) × ($PR | $PO) "
242
- -
243
- :id: 25.01
244
- :value: " ($PR | $PO) × ( $OP | $HY )? $NU "
245
- -
246
- :id: 25.02
247
- :value: " ( $OP | $HY ) × $NU "
248
- -
249
- :id: 25.03
250
- :value: " $NU × ($NU | $SY | $IS) "
251
- -
252
- :id: 25.04
253
- :value: " $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP) "
254
- -
255
- :id: 25.05
256
- :value: " $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR) "
257
- -
258
- :id: 26.01
259
- :value: " $JL × $JL | $JV | $H2 | $H3 "
260
- -
261
- :id: 26.02
262
- :value: " $JV | $H2 × $JV | $JT "
263
- -
264
- :id: 26.03
265
- :value: " $JT | $H3 × $JT "
266
- -
267
- :id: 27.01
268
- :value: " $JL | $JV | $JT | $H2 | $H3 × $IN "
269
- -
270
- :id: 27.02
271
- :value: " $JL | $JV | $JT | $H2 | $H3 × $PO "
272
- -
273
- :id: 27.03
274
- :value: " $PR × $JL | $JV | $JT | $H2 | $H3 "
275
- -
276
- :id: 28
277
- :value: " ($AL | $HL) × ($AL | $HL) "
278
- -
279
- :id: 29
280
- :value: " $IS × ($AL | $HL) "
281
- -
282
- :id: 30.01
283
- :value: " ($AL | $HL | $NU) × $OP "
284
- -
285
- :id: 30.02
286
- :value: " $CP × ($AL | $HL | $NU) "
287
- -
288
- :id: 30.11
289
- :value: " ^ ($RI $RI)* $RI × $RI "
290
- -
291
- :id: 30.12
292
- :value: " [^$RI] ($RI $RI)* $RI × $RI "
293
- -
294
- :id: 30.13
295
- :value: " $RI ÷ $RI "
296
- -
297
- :id: 30.2
298
- :value: " $EB × $EM "
299
- :variables:
300
- -
301
- :id: $AI
302
- :value: "\\p{Line_Break=Ambiguous}"
303
- -
304
- :id: $AL
305
- :value: "\\p{Line_Break=Alphabetic}"
306
- -
307
- :id: $B2
308
- :value: "\\p{Line_Break=Break_Both}"
309
- -
310
- :id: $BA
311
- :value: "\\p{Line_Break=Break_After}"
312
- -
313
- :id: $BB
314
- :value: "\\p{Line_Break=Break_Before}"
315
- -
316
- :id: $BK
317
- :value: "\\p{Line_Break=Mandatory_Break}"
318
- -
319
- :id: $CB
320
- :value: "\\p{Line_Break=Contingent_Break}"
321
- -
322
- :id: $CL
323
- :value: "\\p{Line_Break=Close_Punctuation}"
324
- -
325
- :id: $CP
326
- :value: "\\p{Line_Break=CP}"
327
- -
328
- :id: $CM1
329
- :value: "\\p{Line_Break=Combining_Mark}"
330
- -
331
- :id: $CR
332
- :value: "\\p{Line_Break=Carriage_Return}"
333
- -
334
- :id: $EX
335
- :value: "\\p{Line_Break=Exclamation}"
336
- -
337
- :id: $GL
338
- :value: "\\p{Line_Break=Glue}"
339
- -
340
- :id: $H2
341
- :value: "\\p{Line_Break=H2}"
342
- -
343
- :id: $H3
344
- :value: "\\p{Line_Break=H3}"
345
- -
346
- :id: $HL
347
- :value: "\\p{Line_Break=HL}"
348
- -
349
- :id: $HY
350
- :value: "\\p{Line_Break=Hyphen}"
351
- -
352
- :id: $ID
353
- :value: "\\p{Line_Break=Ideographic}"
354
- -
355
- :id: $IN
356
- :value: "\\p{Line_Break=Inseparable}"
357
- -
358
- :id: $IS
359
- :value: "\\p{Line_Break=Infix_Numeric}"
360
- -
361
- :id: $JL
362
- :value: "\\p{Line_Break=JL}"
363
- -
364
- :id: $JT
365
- :value: "\\p{Line_Break=JT}"
366
- -
367
- :id: $JV
368
- :value: "\\p{Line_Break=JV}"
369
- -
370
- :id: $LF
371
- :value: "\\p{Line_Break=Line_Feed}"
372
- -
373
- :id: $NL
374
- :value: "\\p{Line_Break=Next_Line}"
375
- -
376
- :id: $NS
377
- :value: "\\p{Line_Break=Nonstarter}"
378
- -
379
- :id: $NU
380
- :value: "\\p{Line_Break=Numeric}"
381
- -
382
- :id: $OP
383
- :value: "\\p{Line_Break=Open_Punctuation}"
384
- -
385
- :id: $PO
386
- :value: "\\p{Line_Break=Postfix_Numeric}"
387
- -
388
- :id: $PR
389
- :value: "\\p{Line_Break=Prefix_Numeric}"
390
- -
391
- :id: $QU
392
- :value: "\\p{Line_Break=Quotation}"
393
- -
394
- :id: $SA
395
- :value: "\\p{Line_Break=Complex_Context}"
396
- -
397
- :id: $SG
398
- :value: "\\p{Line_Break=Surrogate}"
399
- -
400
- :id: $SP
401
- :value: "\\p{Line_Break=Space}"
402
- -
403
- :id: $SY
404
- :value: "\\p{Line_Break=Break_Symbols}"
405
- -
406
- :id: $WJ
407
- :value: "\\p{Line_Break=Word_Joiner}"
408
- -
409
- :id: $XX
410
- :value: "\\p{Line_Break=Unknown}"
411
- -
412
- :id: $ZW
413
- :value: "\\p{Line_Break=ZWSpace}"
414
- -
415
- :id: $CJ
416
- :value: "\\p{Line_Break=Conditional_Japanese_Starter}"
417
- -
418
- :id: $RI
419
- :value: "\\p{Line_Break=Regional_Indicator}"
420
- -
421
- :id: $EB
422
- :value: "\\p{Line_Break=E_Base}"
423
- -
424
- :id: $EM
425
- :value: "\\p{Line_Break=E_Modifier}"
426
- -
427
- :id: $ZWJ_O
428
- :value: "\\p{Line_Break=ZWJ}"
429
- -
430
- :id: $ZWJ
431
- :value: "\\p{Line_Break=ZWJ}"
432
- -
433
- :id: $CM
434
- :value: "[$CM1 $ZWJ]"
435
- -
436
- :id: $AL
437
- :value: "[$AI $AL $SG $XX $SA]"
438
- -
439
- :id: $NS
440
- :value: "[$NS $CJ]"
441
- -
442
- :id: $X
443
- :value: $CM*
444
- -
445
- :id: $Spec1_
446
- :value: "[$SP $BK $CR $LF $NL $ZW]"
447
- -
448
- :id: $Spec2_
449
- :value: "[^ $SP $BK $CR $LF $NL $ZW]"
450
- -
451
- :id: $Spec3a_
452
- :value: "[^ $SP $BA $HY $CM]"
453
- -
454
- :id: $Spec3b_
455
- :value: "[^ $BA $HY $CM]"
456
- -
457
- :id: $Spec4_
458
- :value: "[^ $NU $CM]"
459
- -
460
- :id: $Spec5_
461
- :value: "[$BK $CB $CR $LF $NL $SP $ZW]"
462
- -
463
- :id: $AI
464
- :value: "($AI $X)"
465
- -
466
- :id: $AL
467
- :value: "($AL $X)"
468
- -
469
- :id: $B2
470
- :value: "($B2 $X)"
471
- -
472
- :id: $BA
473
- :value: "($BA $X)"
474
- -
475
- :id: $BB
476
- :value: "($BB $X)"
477
- -
478
- :id: $CB
479
- :value: "($CB $X)"
480
- -
481
- :id: $CL
482
- :value: "($CL $X)"
483
- -
484
- :id: $CP
485
- :value: "($CP $X)"
486
- -
487
- :id: $CM
488
- :value: "($CM $X)"
489
- -
490
- :id: $EX
491
- :value: "($EX $X)"
492
- -
493
- :id: $GL
494
- :value: "($GL $X)"
495
- -
496
- :id: $H2
497
- :value: "($H2 $X)"
498
- -
499
- :id: $H3
500
- :value: "($H3 $X)"
501
- -
502
- :id: $HL
503
- :value: "($HL $X)"
504
- -
505
- :id: $HY
506
- :value: "($HY $X)"
507
- -
508
- :id: $ID
509
- :value: "($ID $X)"
510
- -
511
- :id: $IN
512
- :value: "($IN $X)"
513
- -
514
- :id: $IS
515
- :value: "($IS $X)"
516
- -
517
- :id: $JL
518
- :value: "($JL $X)"
519
- -
520
- :id: $JT
521
- :value: "($JT $X)"
522
- -
523
- :id: $JV
524
- :value: "($JV $X)"
525
- -
526
- :id: $NS
527
- :value: "($NS $X)"
528
- -
529
- :id: $NU
530
- :value: "($NU $X)"
531
- -
532
- :id: $OP
533
- :value: "($OP $X)"
534
- -
535
- :id: $PO
536
- :value: "($PO $X)"
537
- -
538
- :id: $PR
539
- :value: "($PR $X)"
540
- -
541
- :id: $QU
542
- :value: "($QU $X)"
543
- -
544
- :id: $SA
545
- :value: "($SA $X)"
546
- -
547
- :id: $SG
548
- :value: "($SG $X)"
549
- -
550
- :id: $SY
551
- :value: "($SY $X)"
552
- -
553
- :id: $WJ
554
- :value: "($WJ $X)"
555
- -
556
- :id: $XX
557
- :value: "($XX $X)"
558
- -
559
- :id: $RI
560
- :value: "($RI $X)"
561
- -
562
- :id: $EB
563
- :value: "($EB $X)"
564
- -
565
- :id: $EM
566
- :value: "($EM $X)"
567
- -
568
- :id: $ZWJ
569
- :value: "($ZWJ $X)"
570
- -
571
- :id: $AL
572
- :value: "($AL | ^ $CM | (?<=$Spec1_) $CM)"
573
- :SentenceBreak:
574
- :rules:
575
- -
576
- :id: 3
577
- :value: " $CR × $LF "
578
- -
579
- :id: 4
580
- :value: " $ParaSep ÷ "
581
- -
582
- :id: 5
583
- :value: " × [$Format $Extend] "
584
- -
585
- :id: 6
586
- :value: " $ATerm × $Numeric "
587
- -
588
- :id: 7
589
- :value: " ($Upper | $Lower) $ATerm × $Upper "
590
- -
591
- :id: 8
592
- :value: " $ATerm $Close* $Sp* × $NotPreLower_* $Lower "
593
- -
594
- :id: 8.1
595
- :value: " $SATerm $Close* $Sp* × ($SContinue | $SATerm) "
596
- -
597
- :id: 9
598
- :value: " $SATerm $Close* × ( $Close | $Sp | $ParaSep ) "
599
- -
600
- :id: 10
601
- :value: " $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) "
602
- -
603
- :id: 11
604
- :value: " $SATerm $Close* $Sp* $ParaSep? ÷ "
605
- -
606
- :id: 998
607
- :value: " × $Any "
608
- :variables:
609
- -
610
- :id: $CR
611
- :value: "\\p{Sentence_Break=CR}"
612
- -
613
- :id: $LF
614
- :value: "\\p{Sentence_Break=LF}"
615
- -
616
- :id: $Extend
617
- :value: "\\p{Sentence_Break=Extend}"
618
- -
619
- :id: $Format
620
- :value: "\\p{Sentence_Break=Format}"
621
- -
622
- :id: $Sep
623
- :value: "\\p{Sentence_Break=Sep}"
624
- -
625
- :id: $Sp
626
- :value: "\\p{Sentence_Break=Sp}"
627
- -
628
- :id: $Lower
629
- :value: "\\p{Sentence_Break=Lower}"
630
- -
631
- :id: $Upper
632
- :value: "\\p{Sentence_Break=Upper}"
633
- -
634
- :id: $OLetter
635
- :value: "\\p{Sentence_Break=OLetter}"
636
- -
637
- :id: $Numeric
638
- :value: "\\p{Sentence_Break=Numeric}"
639
- -
640
- :id: $ATerm
641
- :value: "\\p{Sentence_Break=ATerm}"
642
- -
643
- :id: $STerm
644
- :value: "\\p{Sentence_Break=STerm}"
645
- -
646
- :id: $Close
647
- :value: "\\p{Sentence_Break=Close}"
648
- -
649
- :id: $SContinue
650
- :value: "\\p{Sentence_Break=SContinue}"
651
- -
652
- :id: $Any
653
- :value: "."
654
- -
655
- :id: $FE
656
- :value: "[$Format $Extend]"
657
- -
658
- :id: $NotPreLower_
659
- :value: "[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]"
660
- -
661
- :id: $Sp
662
- :value: "($Sp $FE*)"
663
- -
664
- :id: $Lower
665
- :value: "($Lower $FE*)"
666
- -
667
- :id: $Upper
668
- :value: "($Upper $FE*)"
669
- -
670
- :id: $OLetter
671
- :value: "($OLetter $FE*)"
672
- -
673
- :id: $Numeric
674
- :value: "($Numeric $FE*)"
675
- -
676
- :id: $ATerm
677
- :value: "($ATerm $FE*)"
678
- -
679
- :id: $STerm
680
- :value: "($STerm $FE*)"
681
- -
682
- :id: $Close
683
- :value: "($Close $FE*)"
684
- -
685
- :id: $SContinue
686
- :value: "($SContinue $FE*)"
687
- -
688
- :id: $ParaSep
689
- :value: "($Sep | $CR | $LF)"
690
- -
691
- :id: $SATerm
692
- :value: "($STerm | $ATerm)"
693
- :WordBreak:
694
- :rules:
695
- -
696
- :id: 3
697
- :value: " $CR × $LF "
698
- -
699
- :id: 3.1
700
- :value: " ($Newline | $CR | $LF) ÷ "
701
- -
702
- :id: 3.2
703
- :value: " ÷ ($Newline | $CR | $LF) "
704
- -
705
- :id: 3.3
706
- :value: " $ZWJ × $ExtPict "
707
- -
708
- :id: 3.4
709
- :value: " $WSegSpace × $WSegSpace "
710
- -
711
- :id: 4
712
- :value: " $NotBreak_ × [$Format $Extend $ZWJ] "
713
- -
714
- :id: 5
715
- :value: " $AHLetter × $AHLetter "
716
- -
717
- :id: 6
718
- :value: " $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter "
719
- -
720
- :id: 7
721
- :value: " $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter "
722
- -
723
- :id: 7.1
724
- :value: " $Hebrew_Letter × $Single_Quote "
725
- -
726
- :id: 7.2
727
- :value: " $Hebrew_Letter × $Double_Quote $Hebrew_Letter "
728
- -
729
- :id: 7.3
730
- :value: " $Hebrew_Letter $Double_Quote × $Hebrew_Letter "
731
- -
732
- :id: 8
733
- :value: " $Numeric × $Numeric "
734
- -
735
- :id: 9
736
- :value: " $AHLetter × $Numeric "
737
- -
738
- :id: 10
739
- :value: " $Numeric × $AHLetter "
740
- -
741
- :id: 11
742
- :value: " $Numeric ($MidNum | $MidNumLetQ) × $Numeric "
743
- -
744
- :id: 12
745
- :value: " $Numeric × ($MidNum | $MidNumLetQ) $Numeric "
746
- -
747
- :id: 13
748
- :value: " $Katakana × $Katakana "
749
- -
750
- :id: 13.1
751
- :value: " ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet "
752
- -
753
- :id: 13.2
754
- :value: " $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) "
755
- -
756
- :id: 15
757
- :value: " ^ ($RI $RI)* $RI × $RI "
758
- -
759
- :id: 16
760
- :value: " [^$RI] ($RI $RI)* $RI × $RI "
761
- :variables:
762
- -
763
- :id: $CR
764
- :value: "\\p{Word_Break=CR}"
765
- -
766
- :id: $LF
767
- :value: "\\p{Word_Break=LF}"
768
- -
769
- :id: $Newline
770
- :value: "\\p{Word_Break=Newline}"
771
- -
772
- :id: $Extend
773
- :value: "\\p{Word_Break=Extend}"
774
- -
775
- :id: $Format
776
- :value: "\\p{Word_Break=Format}"
777
- -
778
- :id: $Katakana
779
- :value: "\\p{Word_Break=Katakana}"
780
- -
781
- :id: $ALetter
782
- :value: "\\p{Word_Break=ALetter}"
783
- -
784
- :id: $MidLetter
785
- :value: "\\p{Word_Break=MidLetter}"
786
- -
787
- :id: $MidNum
788
- :value: "\\p{Word_Break=MidNum}"
789
- -
790
- :id: $MidNumLet
791
- :value: "\\p{Word_Break=MidNumLet}"
792
- -
793
- :id: $Numeric
794
- :value: "\\p{Word_Break=Numeric}"
795
- -
796
- :id: $ExtendNumLet
797
- :value: "\\p{Word_Break=ExtendNumLet}"
798
- -
799
- :id: $RI
800
- :value: "\\p{Word_Break=Regional_Indicator}"
801
- -
802
- :id: $Hebrew_Letter
803
- :value: "\\p{Word_Break=Hebrew_Letter}"
804
- -
805
- :id: $Double_Quote
806
- :value: "\\p{Word_Break=Double_Quote}"
807
- -
808
- :id: $Single_Quote
809
- :value: "\\p{Word_Break=Single_Quote}"
810
- -
811
- :id: $ZWJ
812
- :value: "\\p{Word_Break=ZWJ}"
813
- -
814
- :id: $ExtPict
815
- :value: "\\p{Extended_Pictographic}"
816
- -
817
- :id: $WSegSpace
818
- :value: "\\p{Word_Break=WSegSpace}"
819
- -
820
- :id: $AHLetter
821
- :value: "($ALetter | $Hebrew_Letter)"
822
- -
823
- :id: $MidNumLetQ
824
- :value: "($MidNumLet | $Single_Quote)"
825
- -
826
- :id: $FE
827
- :value: "[$Format $Extend $ZWJ]"
828
- -
829
- :id: $NotBreak_
830
- :value: "[^ $Newline $CR $LF ]"
831
- -
832
- :id: $Katakana
833
- :value: "($Katakana $FE*)"
834
- -
835
- :id: $ALetter
836
- :value: "($ALetter $FE*)"
837
- -
838
- :id: $MidLetter
839
- :value: "($MidLetter $FE*)"
840
- -
841
- :id: $MidNum
842
- :value: "($MidNum $FE*)"
843
- -
844
- :id: $MidNumLet
845
- :value: "($MidNumLet $FE*)"
846
- -
847
- :id: $Numeric
848
- :value: "($Numeric $FE*)"
849
- -
850
- :id: $ExtendNumLet
851
- :value: "($ExtendNumLet $FE*)"
852
- -
853
- :id: $RI
854
- :value: "($RI $FE*)"
855
- -
856
- :id: $Hebrew_Letter
857
- :value: "($Hebrew_Letter $FE*)"
858
- -
859
- :id: $Double_Quote
860
- :value: "($Double_Quote $FE*)"
861
- -
862
- :id: $Single_Quote
863
- :value: "($Single_Quote $FE*)"
864
- -
865
- :id: $AHLetter
866
- :value: "($AHLetter $FE*)"
867
- -
868
- :id: $MidNumLetQ
869
- :value: "($MidNumLetQ $FE*)"