twitter_cldr 5.1.0 → 5.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
@@ -1,12 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Resources
8
- module Uli
9
- autoload :SegmentExceptionsImporter, 'twitter_cldr/resources/uli/segment_exceptions_importer'
10
- end
11
- end
12
- end
@@ -1,59 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- require 'fileutils'
7
- require 'open-uri'
8
- require 'json'
9
-
10
- module TwitterCldr
11
- module Resources
12
- module Uli
13
- class SegmentExceptionsImporter < Resources::Importer
14
-
15
- REPO_URL = 'https://github.com/unicode-org/uli.git'.freeze
16
- GIT_SHA = '6acce954b913b121b6ab4bd4f8395e74dce2ae7c'.freeze
17
-
18
- requirement :git, REPO_URL, GIT_SHA
19
- output_path 'uli/segments'
20
- ruby_engine :mri
21
-
22
- def execute
23
- FileUtils.mkdir_p(output_path)
24
- each_file { |file| import_file(file) }
25
- end
26
-
27
- private
28
-
29
- def output_path
30
- params.fetch(:output_path)
31
- end
32
-
33
- def import_file(file)
34
- locale = File.basename(file).chomp('.json')
35
- output_file = File.join(output_path, "#{locale}.yml")
36
- exceptions = JSON.parse(File.read(file))
37
-
38
- File.open(output_file, 'w:utf-8') do |output|
39
- output.write(
40
- TwitterCldr::Utils::YAML.dump(
41
- TwitterCldr::Utils.deep_symbolize_keys(locale => { exceptions: exceptions['data']['abbrs'] }),
42
- use_natural_symbols: true
43
- )
44
- )
45
- end
46
- end
47
-
48
- def each_file(&block)
49
- Dir.glob(File.join(input_path, 'abbrs', 'json', '*.json')).each(&block)
50
- end
51
-
52
- def input_path
53
- requirements[:git].source_path
54
- end
55
-
56
- end
57
- end
58
- end
59
- end
@@ -1,71 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Segmentation
8
- class Parser
9
-
10
- def parse(text, options = {})
11
- left_str, boundary_symbol_str, right_str = text.split(/([÷×])/)
12
- boundary_symbol = boundary_symbol_for(boundary_symbol_str)
13
- left = compile_token_list(tokenize_regex(left_str || ''), options)
14
- right = compile_token_list(tokenize_regex(right_str || ''), options)
15
- klass = class_for(boundary_symbol)
16
- klass.new(left, right)
17
- end
18
-
19
- def tokenize_regex(text)
20
- regex_tokenizer.tokenize(text).reject do |token|
21
- token.value.strip.empty?
22
- end
23
- end
24
-
25
- private
26
-
27
- def boundary_symbol_for(str)
28
- case str
29
- when '÷' then :break
30
- when '×' then :no_break
31
- end
32
- end
33
-
34
- def class_for(boundary_symbol)
35
- case boundary_symbol
36
- when :break
37
- BreakRule
38
- when :no_break
39
- NoBreakRule
40
- end
41
- end
42
-
43
- def compile_token_list(token_list, options)
44
- if token_list.empty?
45
- TwitterCldr::Shared::UnicodeRegex.compile('')
46
- else
47
- parse_regex(token_list, options)
48
- end
49
- end
50
-
51
- def parse_regex(tokens, options)
52
- unless tokens.empty?
53
- TwitterCldr::Shared::UnicodeRegex.new(
54
- regex_parser.parse(tokens, options), 'm'
55
- )
56
- end
57
- end
58
-
59
- def regex_tokenizer
60
- @tokenizer ||=
61
- TwitterCldr::Tokenizers::UnicodeRegexTokenizer.new
62
- end
63
-
64
- def regex_parser
65
- @regex_parser ||=
66
- TwitterCldr::Parsers::UnicodeRegexParser.new
67
- end
68
-
69
- end
70
- end
71
- end
@@ -1,79 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Segmentation
8
- RuleMatchData = Struct.new(
9
- :rule, :boundary_offset, :boundary_position
10
- )
11
-
12
- class Rule
13
-
14
- attr_reader :left, :right
15
- attr_accessor :string, :id
16
-
17
- def initialize(left, right)
18
- @left = left
19
- @right = right
20
- end
21
-
22
- def match(cursor)
23
- left_match = match_side(left, cursor.text, cursor.position)
24
- return nil unless left_match
25
- left_match_offset = offset(left_match, cursor.position)
26
-
27
- right_match = match_side(right, cursor.text, left_match_offset.last)
28
- return nil unless right_match
29
- right_match_offset = offset(right_match, left_match_offset.last)
30
-
31
- offset = [left_match_offset.first, right_match_offset.last]
32
- position = left_match_offset.last
33
-
34
- RuleMatchData.new(self, offset, position)
35
- end
36
-
37
- private
38
-
39
- def offset(match, default)
40
- if match
41
- match.offset(0)
42
- else
43
- [default, default]
44
- end
45
- end
46
-
47
- def match_side(side, text, position)
48
- if side
49
- side_match = side.match(text, position)
50
-
51
- if side_match && side_match.begin(0) == position
52
- side_match
53
- end
54
- end
55
- end
56
- end
57
-
58
- class BreakRule < Rule
59
- def boundary_symbol
60
- :break
61
- end
62
-
63
- def break?
64
- true
65
- end
66
- end
67
-
68
- class NoBreakRule < Rule
69
- def boundary_symbol
70
- :no_break
71
- end
72
-
73
- def break?
74
- false
75
- end
76
- end
77
-
78
- end
79
- end
@@ -1,142 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Segmentation
8
- class RuleSetBuilder
9
-
10
- class << self
11
- def load(locale, boundary_type, options = {})
12
- rules = compile_rules_for(boundary_type)
13
- RuleSet.new(locale, rules, boundary_type, options)
14
- end
15
-
16
- # See the comment above exceptions_for. Basically, we only support exceptions
17
- # for the "sentence" boundary type since the ULI JSON data doesn't distinguish
18
- # between boundary types.
19
- def exception_rule_for(locale, boundary_type)
20
- cache_key = TwitterCldr::Utils.compute_cache_key(locale, boundary_type)
21
- exceptions_cache[cache_key] ||= begin
22
- exceptions = exceptions_for(locale, boundary_type)
23
- regex_contents = exceptions.map { |exc| Regexp.escape(exc) }.join("|")
24
- parse("(?:#{regex_contents}) ×", nil).tap do |rule|
25
- rule.id = 0
26
- end
27
- end
28
- end
29
-
30
- # The implicit final rule is always "Any ÷ Any"
31
- def implicit_final_rule
32
- @implicit_final_rule ||=
33
- parse('. ÷ .', nil).tap do |rule|
34
- rule.id = 9999
35
- end
36
- end
37
-
38
- # The implicit initial rules are always "start-of-text ÷"
39
- # and "÷ end-of-text". We don't need the start-of-text one.
40
- def implicit_end_of_text_rule
41
- @implicit_end_of_text_rule ||=
42
- parse('.\z ÷', nil).tap do |rule|
43
- rule.id = 9998
44
- end
45
- end
46
-
47
- private
48
-
49
- # The boundary_type param is not currently used since the ULI JSON resource that
50
- # exceptions are generated from does not distinguish between boundary types. The
51
- # XML version does, however, so the JSON will hopefully catch up at some point and
52
- # we can make use of this second parameter. For the time being, compile_exception_rule_for
53
- # (which calls this function) assumes a "sentence" boundary type.
54
- def exceptions_for(locale, boundary_type)
55
- exceptions_resource_cache[locale] ||= begin
56
- TwitterCldr.get_resource('uli', 'segments', locale)[locale][:exceptions]
57
- rescue Resources::ResourceLoadError
58
- []
59
- end
60
- end
61
-
62
- def boundary_name_for(str)
63
- str.gsub(/(?:^|\_)([A-Za-z])/) { |s| $1.upcase } + 'Break'
64
- end
65
-
66
- # tokenizes and parses rules from segment_root
67
- def compile_rules_for(boundary_type)
68
- rule_cache[boundary_type] ||= begin
69
- boundary_name = boundary_name_for(boundary_type)
70
- boundary_data = resource_for(boundary_name)
71
- symbol_table = symbol_table_for(boundary_data)
72
- rules_for(boundary_data, symbol_table)
73
- end
74
- end
75
-
76
- def symbol_table_for(boundary_data)
77
- table = TwitterCldr::Parsers::SymbolTable.new
78
- boundary_data[:variables].each do |variable|
79
- id = variable[:id].to_s
80
- tokens = segmentation_parser.tokenize_regex(variable[:value])
81
- # note: variables can be redefined (add replaces if key already exists)
82
- table.add(id, resolve_symbols(tokens, table))
83
- end
84
- table
85
- end
86
-
87
- def resolve_symbols(tokens, symbol_table)
88
- tokens.inject([]) do |ret, token|
89
- if token.type == :variable
90
- ret += symbol_table.fetch(token.value)
91
- else
92
- ret << token
93
- end
94
- ret
95
- end
96
- end
97
-
98
- def rules_for(boundary_data, symbol_table)
99
- boundary_data[:rules].map do |rule|
100
- r = parse(rule[:value], symbol_table)
101
- r.string = rule[:value]
102
- r.id = rule[:id]
103
- r
104
- end
105
- end
106
-
107
- def parse(text, symbol_table)
108
- segmentation_parser.parse(
109
- text, { symbol_table: symbol_table }
110
- )
111
- end
112
-
113
- def resource_for(boundary_name)
114
- root_resource[:segments][boundary_name.to_sym]
115
- end
116
-
117
- def segmentation_parser
118
- @segmentation_parser ||= Segmentation::Parser.new
119
- end
120
-
121
- def root_resource
122
- @root_resource ||= TwitterCldr.get_resource(
123
- 'shared', 'segments', 'segments_root'
124
- )
125
- end
126
-
127
- def rule_cache
128
- @rule_cache ||= {}
129
- end
130
-
131
- def exceptions_resource_cache
132
- @exceptions_resource_cache ||= {}
133
- end
134
-
135
- def exceptions_cache
136
- @exceptions_cache ||= {}
137
- end
138
- end
139
-
140
- end
141
- end
142
- end
@@ -1,869 +0,0 @@
1
- ---
2
- :segments:
3
- :GraphemeClusterBreak:
4
- :rules:
5
- -
6
- :id: 3
7
- :value: " $CR × $LF "
8
- -
9
- :id: 4
10
- :value: " ( $Control | $CR | $LF ) ÷ "
11
- -
12
- :id: 5
13
- :value: " ÷ ( $Control | $CR | $LF ) "
14
- -
15
- :id: 6
16
- :value: " $L × ( $L | $V | $LV | $LVT ) "
17
- -
18
- :id: 7
19
- :value: " ( $LV | $V ) × ( $V | $T ) "
20
- -
21
- :id: 8
22
- :value: " ( $LVT | $T) × $T "
23
- -
24
- :id: 9
25
- :value: " × ($Extend | $ZWJ) "
26
- -
27
- :id: 9.1
28
- :value: " × $SpacingMark "
29
- -
30
- :id: 9.2
31
- :value: " $Prepend × "
32
- -
33
- :id: 9.3
34
- :value: " $LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* × $LinkingConsonant "
35
- -
36
- :id: 11
37
- :value: " $ExtPict $Extend* $ZWJ × $ExtPict "
38
- -
39
- :id: 12
40
- :value: " ^ ($RI $RI)* $RI × $RI "
41
- -
42
- :id: 13
43
- :value: " [^$RI] ($RI $RI)* $RI × $RI "
44
- :variables:
45
- -
46
- :id: $CR
47
- :value: "\\p{Grapheme_Cluster_Break=CR}"
48
- -
49
- :id: $LF
50
- :value: "\\p{Grapheme_Cluster_Break=LF}"
51
- -
52
- :id: $Control
53
- :value: "\\p{Grapheme_Cluster_Break=Control}"
54
- -
55
- :id: $Extend
56
- :value: "\\p{Grapheme_Cluster_Break=Extend}"
57
- -
58
- :id: $ZWJ
59
- :value: "\\p{Grapheme_Cluster_Break=ZWJ}"
60
- -
61
- :id: $RI
62
- :value: "\\p{Grapheme_Cluster_Break=Regional_Indicator}"
63
- -
64
- :id: $Prepend
65
- :value: "\\p{Grapheme_Cluster_Break=Prepend}"
66
- -
67
- :id: $SpacingMark
68
- :value: "\\p{Grapheme_Cluster_Break=SpacingMark}"
69
- -
70
- :id: $L
71
- :value: "\\p{Grapheme_Cluster_Break=L}"
72
- -
73
- :id: $V
74
- :value: "\\p{Grapheme_Cluster_Break=V}"
75
- -
76
- :id: $T
77
- :value: "\\p{Grapheme_Cluster_Break=T}"
78
- -
79
- :id: $LV
80
- :value: "\\p{Grapheme_Cluster_Break=LV}"
81
- -
82
- :id: $LVT
83
- :value: "\\p{Grapheme_Cluster_Break=LVT}"
84
- -
85
- :id: $Virama
86
- :value: "[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]"
87
- -
88
- :id: $LinkingConsonant
89
- :value: "[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]"
90
- -
91
- :id: $ExtPict
92
- :value: "\\p{Extended_Pictographic}"
93
- -
94
- :id: $ExtCccZwj
95
- :value: "[[$Extend-\\p{ccc=0}] $ZWJ]"
96
- :LineBreak:
97
- :rules:
98
- -
99
- :id: 4
100
- :value: " $BK ÷ "
101
- -
102
- :id: 5.01
103
- :value: " $CR × $LF "
104
- -
105
- :id: 5.02
106
- :value: " $CR ÷ "
107
- -
108
- :id: 5.03
109
- :value: " $LF ÷ "
110
- -
111
- :id: 5.04
112
- :value: " $NL ÷ "
113
- -
114
- :id: 6
115
- :value: " × ( $BK | $CR | $LF | $NL ) "
116
- -
117
- :id: 7.01
118
- :value: " × $SP "
119
- -
120
- :id: 7.02
121
- :value: " × $ZW "
122
- -
123
- :id: 8
124
- :value: " $ZW $SP* ÷ "
125
- -
126
- :id: 8.1
127
- :value: " $ZWJ_O × "
128
- -
129
- :id: 9
130
- :value: " $Spec2_ × $CM "
131
- -
132
- :id: 11.01
133
- :value: " × $WJ "
134
- -
135
- :id: 11.02
136
- :value: " $WJ × "
137
- -
138
- :id: 12
139
- :value: " $GL × "
140
- -
141
- :id: 12.1
142
- :value: " $Spec3a_ × $GL "
143
- -
144
- :id: 12.2
145
- :value: " $Spec3b_ $CM+ × $GL "
146
- -
147
- :id: 12.3
148
- :value: " ^ $CM+ × $GL "
149
- -
150
- :id: 13.01
151
- :value: " × $EX "
152
- -
153
- :id: 13.02
154
- :value: " $Spec4_ × ($CL | $CP | $IS | $SY) "
155
- -
156
- :id: 13.03
157
- :value: " $Spec4_ $CM+ × ($CL | $CP | $IS | $SY) "
158
- -
159
- :id: 13.04
160
- :value: " ^ $CM+ × ($CL | $CP | $IS | $SY) "
161
- -
162
- :id: 14
163
- :value: " $OP $SP* × "
164
- -
165
- :id: 15
166
- :value: " $QU $SP* × $OP "
167
- -
168
- :id: 16
169
- :value: " ($CL | $CP) $SP* × $NS "
170
- -
171
- :id: 17
172
- :value: " $B2 $SP* × $B2 "
173
- -
174
- :id: 18
175
- :value: " $SP ÷ "
176
- -
177
- :id: 19.01
178
- :value: " × $QU "
179
- -
180
- :id: 19.02
181
- :value: " $QU × "
182
- -
183
- :id: 20.01
184
- :value: " ÷ $CB "
185
- -
186
- :id: 20.02
187
- :value: " $CB ÷ "
188
- -
189
- :id: 20.09
190
- :value: " $Spec5_ $HY × $AL "
191
- -
192
- :id: 21.01
193
- :value: " × $BA "
194
- -
195
- :id: 21.02
196
- :value: " × $HY "
197
- -
198
- :id: 21.03
199
- :value: " × $NS "
200
- -
201
- :id: 21.04
202
- :value: " $BB × "
203
- -
204
- :id: 21.1
205
- :value: " $HL ($HY | $BA) × "
206
- -
207
- :id: 21.2
208
- :value: " $SY × $HL "
209
- -
210
- :id: 22.01
211
- :value: " ($AL | $HL) × $IN "
212
- -
213
- :id: 22.02
214
- :value: " $EX × $IN "
215
- -
216
- :id: 22.03
217
- :value: " ($ID | $EB | $EM) × $IN "
218
- -
219
- :id: 22.04
220
- :value: " $IN × $IN "
221
- -
222
- :id: 22.05
223
- :value: " $NU × $IN "
224
- -
225
- :id: 23.02
226
- :value: " ($AL | $HL) × $NU "
227
- -
228
- :id: 23.03
229
- :value: " $NU × ($AL | $HL) "
230
- -
231
- :id: 23.12
232
- :value: " $PR × ($ID | $EB | $EM) "
233
- -
234
- :id: 23.13
235
- :value: " ($ID | $EB | $EM) × $PO "
236
- -
237
- :id: 24.02
238
- :value: " ($PR | $PO) × ($AL | $HL) "
239
- -
240
- :id: 24.03
241
- :value: " ($AL | $HL) × ($PR | $PO) "
242
- -
243
- :id: 25.01
244
- :value: " ($PR | $PO) × ( $OP | $HY )? $NU "
245
- -
246
- :id: 25.02
247
- :value: " ( $OP | $HY ) × $NU "
248
- -
249
- :id: 25.03
250
- :value: " $NU × ($NU | $SY | $IS) "
251
- -
252
- :id: 25.04
253
- :value: " $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP) "
254
- -
255
- :id: 25.05
256
- :value: " $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR) "
257
- -
258
- :id: 26.01
259
- :value: " $JL × $JL | $JV | $H2 | $H3 "
260
- -
261
- :id: 26.02
262
- :value: " $JV | $H2 × $JV | $JT "
263
- -
264
- :id: 26.03
265
- :value: " $JT | $H3 × $JT "
266
- -
267
- :id: 27.01
268
- :value: " $JL | $JV | $JT | $H2 | $H3 × $IN "
269
- -
270
- :id: 27.02
271
- :value: " $JL | $JV | $JT | $H2 | $H3 × $PO "
272
- -
273
- :id: 27.03
274
- :value: " $PR × $JL | $JV | $JT | $H2 | $H3 "
275
- -
276
- :id: 28
277
- :value: " ($AL | $HL) × ($AL | $HL) "
278
- -
279
- :id: 29
280
- :value: " $IS × ($AL | $HL) "
281
- -
282
- :id: 30.01
283
- :value: " ($AL | $HL | $NU) × $OP "
284
- -
285
- :id: 30.02
286
- :value: " $CP × ($AL | $HL | $NU) "
287
- -
288
- :id: 30.11
289
- :value: " ^ ($RI $RI)* $RI × $RI "
290
- -
291
- :id: 30.12
292
- :value: " [^$RI] ($RI $RI)* $RI × $RI "
293
- -
294
- :id: 30.13
295
- :value: " $RI ÷ $RI "
296
- -
297
- :id: 30.2
298
- :value: " $EB × $EM "
299
- :variables:
300
- -
301
- :id: $AI
302
- :value: "\\p{Line_Break=Ambiguous}"
303
- -
304
- :id: $AL
305
- :value: "\\p{Line_Break=Alphabetic}"
306
- -
307
- :id: $B2
308
- :value: "\\p{Line_Break=Break_Both}"
309
- -
310
- :id: $BA
311
- :value: "\\p{Line_Break=Break_After}"
312
- -
313
- :id: $BB
314
- :value: "\\p{Line_Break=Break_Before}"
315
- -
316
- :id: $BK
317
- :value: "\\p{Line_Break=Mandatory_Break}"
318
- -
319
- :id: $CB
320
- :value: "\\p{Line_Break=Contingent_Break}"
321
- -
322
- :id: $CL
323
- :value: "\\p{Line_Break=Close_Punctuation}"
324
- -
325
- :id: $CP
326
- :value: "\\p{Line_Break=CP}"
327
- -
328
- :id: $CM1
329
- :value: "\\p{Line_Break=Combining_Mark}"
330
- -
331
- :id: $CR
332
- :value: "\\p{Line_Break=Carriage_Return}"
333
- -
334
- :id: $EX
335
- :value: "\\p{Line_Break=Exclamation}"
336
- -
337
- :id: $GL
338
- :value: "\\p{Line_Break=Glue}"
339
- -
340
- :id: $H2
341
- :value: "\\p{Line_Break=H2}"
342
- -
343
- :id: $H3
344
- :value: "\\p{Line_Break=H3}"
345
- -
346
- :id: $HL
347
- :value: "\\p{Line_Break=HL}"
348
- -
349
- :id: $HY
350
- :value: "\\p{Line_Break=Hyphen}"
351
- -
352
- :id: $ID
353
- :value: "\\p{Line_Break=Ideographic}"
354
- -
355
- :id: $IN
356
- :value: "\\p{Line_Break=Inseparable}"
357
- -
358
- :id: $IS
359
- :value: "\\p{Line_Break=Infix_Numeric}"
360
- -
361
- :id: $JL
362
- :value: "\\p{Line_Break=JL}"
363
- -
364
- :id: $JT
365
- :value: "\\p{Line_Break=JT}"
366
- -
367
- :id: $JV
368
- :value: "\\p{Line_Break=JV}"
369
- -
370
- :id: $LF
371
- :value: "\\p{Line_Break=Line_Feed}"
372
- -
373
- :id: $NL
374
- :value: "\\p{Line_Break=Next_Line}"
375
- -
376
- :id: $NS
377
- :value: "\\p{Line_Break=Nonstarter}"
378
- -
379
- :id: $NU
380
- :value: "\\p{Line_Break=Numeric}"
381
- -
382
- :id: $OP
383
- :value: "\\p{Line_Break=Open_Punctuation}"
384
- -
385
- :id: $PO
386
- :value: "\\p{Line_Break=Postfix_Numeric}"
387
- -
388
- :id: $PR
389
- :value: "\\p{Line_Break=Prefix_Numeric}"
390
- -
391
- :id: $QU
392
- :value: "\\p{Line_Break=Quotation}"
393
- -
394
- :id: $SA
395
- :value: "\\p{Line_Break=Complex_Context}"
396
- -
397
- :id: $SG
398
- :value: "\\p{Line_Break=Surrogate}"
399
- -
400
- :id: $SP
401
- :value: "\\p{Line_Break=Space}"
402
- -
403
- :id: $SY
404
- :value: "\\p{Line_Break=Break_Symbols}"
405
- -
406
- :id: $WJ
407
- :value: "\\p{Line_Break=Word_Joiner}"
408
- -
409
- :id: $XX
410
- :value: "\\p{Line_Break=Unknown}"
411
- -
412
- :id: $ZW
413
- :value: "\\p{Line_Break=ZWSpace}"
414
- -
415
- :id: $CJ
416
- :value: "\\p{Line_Break=Conditional_Japanese_Starter}"
417
- -
418
- :id: $RI
419
- :value: "\\p{Line_Break=Regional_Indicator}"
420
- -
421
- :id: $EB
422
- :value: "\\p{Line_Break=E_Base}"
423
- -
424
- :id: $EM
425
- :value: "\\p{Line_Break=E_Modifier}"
426
- -
427
- :id: $ZWJ_O
428
- :value: "\\p{Line_Break=ZWJ}"
429
- -
430
- :id: $ZWJ
431
- :value: "\\p{Line_Break=ZWJ}"
432
- -
433
- :id: $CM
434
- :value: "[$CM1 $ZWJ]"
435
- -
436
- :id: $AL
437
- :value: "[$AI $AL $SG $XX $SA]"
438
- -
439
- :id: $NS
440
- :value: "[$NS $CJ]"
441
- -
442
- :id: $X
443
- :value: $CM*
444
- -
445
- :id: $Spec1_
446
- :value: "[$SP $BK $CR $LF $NL $ZW]"
447
- -
448
- :id: $Spec2_
449
- :value: "[^ $SP $BK $CR $LF $NL $ZW]"
450
- -
451
- :id: $Spec3a_
452
- :value: "[^ $SP $BA $HY $CM]"
453
- -
454
- :id: $Spec3b_
455
- :value: "[^ $BA $HY $CM]"
456
- -
457
- :id: $Spec4_
458
- :value: "[^ $NU $CM]"
459
- -
460
- :id: $Spec5_
461
- :value: "[$BK $CB $CR $LF $NL $SP $ZW]"
462
- -
463
- :id: $AI
464
- :value: "($AI $X)"
465
- -
466
- :id: $AL
467
- :value: "($AL $X)"
468
- -
469
- :id: $B2
470
- :value: "($B2 $X)"
471
- -
472
- :id: $BA
473
- :value: "($BA $X)"
474
- -
475
- :id: $BB
476
- :value: "($BB $X)"
477
- -
478
- :id: $CB
479
- :value: "($CB $X)"
480
- -
481
- :id: $CL
482
- :value: "($CL $X)"
483
- -
484
- :id: $CP
485
- :value: "($CP $X)"
486
- -
487
- :id: $CM
488
- :value: "($CM $X)"
489
- -
490
- :id: $EX
491
- :value: "($EX $X)"
492
- -
493
- :id: $GL
494
- :value: "($GL $X)"
495
- -
496
- :id: $H2
497
- :value: "($H2 $X)"
498
- -
499
- :id: $H3
500
- :value: "($H3 $X)"
501
- -
502
- :id: $HL
503
- :value: "($HL $X)"
504
- -
505
- :id: $HY
506
- :value: "($HY $X)"
507
- -
508
- :id: $ID
509
- :value: "($ID $X)"
510
- -
511
- :id: $IN
512
- :value: "($IN $X)"
513
- -
514
- :id: $IS
515
- :value: "($IS $X)"
516
- -
517
- :id: $JL
518
- :value: "($JL $X)"
519
- -
520
- :id: $JT
521
- :value: "($JT $X)"
522
- -
523
- :id: $JV
524
- :value: "($JV $X)"
525
- -
526
- :id: $NS
527
- :value: "($NS $X)"
528
- -
529
- :id: $NU
530
- :value: "($NU $X)"
531
- -
532
- :id: $OP
533
- :value: "($OP $X)"
534
- -
535
- :id: $PO
536
- :value: "($PO $X)"
537
- -
538
- :id: $PR
539
- :value: "($PR $X)"
540
- -
541
- :id: $QU
542
- :value: "($QU $X)"
543
- -
544
- :id: $SA
545
- :value: "($SA $X)"
546
- -
547
- :id: $SG
548
- :value: "($SG $X)"
549
- -
550
- :id: $SY
551
- :value: "($SY $X)"
552
- -
553
- :id: $WJ
554
- :value: "($WJ $X)"
555
- -
556
- :id: $XX
557
- :value: "($XX $X)"
558
- -
559
- :id: $RI
560
- :value: "($RI $X)"
561
- -
562
- :id: $EB
563
- :value: "($EB $X)"
564
- -
565
- :id: $EM
566
- :value: "($EM $X)"
567
- -
568
- :id: $ZWJ
569
- :value: "($ZWJ $X)"
570
- -
571
- :id: $AL
572
- :value: "($AL | ^ $CM | (?<=$Spec1_) $CM)"
573
- :SentenceBreak:
574
- :rules:
575
- -
576
- :id: 3
577
- :value: " $CR × $LF "
578
- -
579
- :id: 4
580
- :value: " $ParaSep ÷ "
581
- -
582
- :id: 5
583
- :value: " × [$Format $Extend] "
584
- -
585
- :id: 6
586
- :value: " $ATerm × $Numeric "
587
- -
588
- :id: 7
589
- :value: " ($Upper | $Lower) $ATerm × $Upper "
590
- -
591
- :id: 8
592
- :value: " $ATerm $Close* $Sp* × $NotPreLower_* $Lower "
593
- -
594
- :id: 8.1
595
- :value: " $SATerm $Close* $Sp* × ($SContinue | $SATerm) "
596
- -
597
- :id: 9
598
- :value: " $SATerm $Close* × ( $Close | $Sp | $ParaSep ) "
599
- -
600
- :id: 10
601
- :value: " $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) "
602
- -
603
- :id: 11
604
- :value: " $SATerm $Close* $Sp* $ParaSep? ÷ "
605
- -
606
- :id: 998
607
- :value: " × $Any "
608
- :variables:
609
- -
610
- :id: $CR
611
- :value: "\\p{Sentence_Break=CR}"
612
- -
613
- :id: $LF
614
- :value: "\\p{Sentence_Break=LF}"
615
- -
616
- :id: $Extend
617
- :value: "\\p{Sentence_Break=Extend}"
618
- -
619
- :id: $Format
620
- :value: "\\p{Sentence_Break=Format}"
621
- -
622
- :id: $Sep
623
- :value: "\\p{Sentence_Break=Sep}"
624
- -
625
- :id: $Sp
626
- :value: "\\p{Sentence_Break=Sp}"
627
- -
628
- :id: $Lower
629
- :value: "\\p{Sentence_Break=Lower}"
630
- -
631
- :id: $Upper
632
- :value: "\\p{Sentence_Break=Upper}"
633
- -
634
- :id: $OLetter
635
- :value: "\\p{Sentence_Break=OLetter}"
636
- -
637
- :id: $Numeric
638
- :value: "\\p{Sentence_Break=Numeric}"
639
- -
640
- :id: $ATerm
641
- :value: "\\p{Sentence_Break=ATerm}"
642
- -
643
- :id: $STerm
644
- :value: "\\p{Sentence_Break=STerm}"
645
- -
646
- :id: $Close
647
- :value: "\\p{Sentence_Break=Close}"
648
- -
649
- :id: $SContinue
650
- :value: "\\p{Sentence_Break=SContinue}"
651
- -
652
- :id: $Any
653
- :value: "."
654
- -
655
- :id: $FE
656
- :value: "[$Format $Extend]"
657
- -
658
- :id: $NotPreLower_
659
- :value: "[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]"
660
- -
661
- :id: $Sp
662
- :value: "($Sp $FE*)"
663
- -
664
- :id: $Lower
665
- :value: "($Lower $FE*)"
666
- -
667
- :id: $Upper
668
- :value: "($Upper $FE*)"
669
- -
670
- :id: $OLetter
671
- :value: "($OLetter $FE*)"
672
- -
673
- :id: $Numeric
674
- :value: "($Numeric $FE*)"
675
- -
676
- :id: $ATerm
677
- :value: "($ATerm $FE*)"
678
- -
679
- :id: $STerm
680
- :value: "($STerm $FE*)"
681
- -
682
- :id: $Close
683
- :value: "($Close $FE*)"
684
- -
685
- :id: $SContinue
686
- :value: "($SContinue $FE*)"
687
- -
688
- :id: $ParaSep
689
- :value: "($Sep | $CR | $LF)"
690
- -
691
- :id: $SATerm
692
- :value: "($STerm | $ATerm)"
693
- :WordBreak:
694
- :rules:
695
- -
696
- :id: 3
697
- :value: " $CR × $LF "
698
- -
699
- :id: 3.1
700
- :value: " ($Newline | $CR | $LF) ÷ "
701
- -
702
- :id: 3.2
703
- :value: " ÷ ($Newline | $CR | $LF) "
704
- -
705
- :id: 3.3
706
- :value: " $ZWJ × $ExtPict "
707
- -
708
- :id: 3.4
709
- :value: " $WSegSpace × $WSegSpace "
710
- -
711
- :id: 4
712
- :value: " $NotBreak_ × [$Format $Extend $ZWJ] "
713
- -
714
- :id: 5
715
- :value: " $AHLetter × $AHLetter "
716
- -
717
- :id: 6
718
- :value: " $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter "
719
- -
720
- :id: 7
721
- :value: " $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter "
722
- -
723
- :id: 7.1
724
- :value: " $Hebrew_Letter × $Single_Quote "
725
- -
726
- :id: 7.2
727
- :value: " $Hebrew_Letter × $Double_Quote $Hebrew_Letter "
728
- -
729
- :id: 7.3
730
- :value: " $Hebrew_Letter $Double_Quote × $Hebrew_Letter "
731
- -
732
- :id: 8
733
- :value: " $Numeric × $Numeric "
734
- -
735
- :id: 9
736
- :value: " $AHLetter × $Numeric "
737
- -
738
- :id: 10
739
- :value: " $Numeric × $AHLetter "
740
- -
741
- :id: 11
742
- :value: " $Numeric ($MidNum | $MidNumLetQ) × $Numeric "
743
- -
744
- :id: 12
745
- :value: " $Numeric × ($MidNum | $MidNumLetQ) $Numeric "
746
- -
747
- :id: 13
748
- :value: " $Katakana × $Katakana "
749
- -
750
- :id: 13.1
751
- :value: " ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet "
752
- -
753
- :id: 13.2
754
- :value: " $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) "
755
- -
756
- :id: 15
757
- :value: " ^ ($RI $RI)* $RI × $RI "
758
- -
759
- :id: 16
760
- :value: " [^$RI] ($RI $RI)* $RI × $RI "
761
- :variables:
762
- -
763
- :id: $CR
764
- :value: "\\p{Word_Break=CR}"
765
- -
766
- :id: $LF
767
- :value: "\\p{Word_Break=LF}"
768
- -
769
- :id: $Newline
770
- :value: "\\p{Word_Break=Newline}"
771
- -
772
- :id: $Extend
773
- :value: "\\p{Word_Break=Extend}"
774
- -
775
- :id: $Format
776
- :value: "\\p{Word_Break=Format}"
777
- -
778
- :id: $Katakana
779
- :value: "\\p{Word_Break=Katakana}"
780
- -
781
- :id: $ALetter
782
- :value: "\\p{Word_Break=ALetter}"
783
- -
784
- :id: $MidLetter
785
- :value: "\\p{Word_Break=MidLetter}"
786
- -
787
- :id: $MidNum
788
- :value: "\\p{Word_Break=MidNum}"
789
- -
790
- :id: $MidNumLet
791
- :value: "\\p{Word_Break=MidNumLet}"
792
- -
793
- :id: $Numeric
794
- :value: "\\p{Word_Break=Numeric}"
795
- -
796
- :id: $ExtendNumLet
797
- :value: "\\p{Word_Break=ExtendNumLet}"
798
- -
799
- :id: $RI
800
- :value: "\\p{Word_Break=Regional_Indicator}"
801
- -
802
- :id: $Hebrew_Letter
803
- :value: "\\p{Word_Break=Hebrew_Letter}"
804
- -
805
- :id: $Double_Quote
806
- :value: "\\p{Word_Break=Double_Quote}"
807
- -
808
- :id: $Single_Quote
809
- :value: "\\p{Word_Break=Single_Quote}"
810
- -
811
- :id: $ZWJ
812
- :value: "\\p{Word_Break=ZWJ}"
813
- -
814
- :id: $ExtPict
815
- :value: "\\p{Extended_Pictographic}"
816
- -
817
- :id: $WSegSpace
818
- :value: "\\p{Word_Break=WSegSpace}"
819
- -
820
- :id: $AHLetter
821
- :value: "($ALetter | $Hebrew_Letter)"
822
- -
823
- :id: $MidNumLetQ
824
- :value: "($MidNumLet | $Single_Quote)"
825
- -
826
- :id: $FE
827
- :value: "[$Format $Extend $ZWJ]"
828
- -
829
- :id: $NotBreak_
830
- :value: "[^ $Newline $CR $LF ]"
831
- -
832
- :id: $Katakana
833
- :value: "($Katakana $FE*)"
834
- -
835
- :id: $ALetter
836
- :value: "($ALetter $FE*)"
837
- -
838
- :id: $MidLetter
839
- :value: "($MidLetter $FE*)"
840
- -
841
- :id: $MidNum
842
- :value: "($MidNum $FE*)"
843
- -
844
- :id: $MidNumLet
845
- :value: "($MidNumLet $FE*)"
846
- -
847
- :id: $Numeric
848
- :value: "($Numeric $FE*)"
849
- -
850
- :id: $ExtendNumLet
851
- :value: "($ExtendNumLet $FE*)"
852
- -
853
- :id: $RI
854
- :value: "($RI $FE*)"
855
- -
856
- :id: $Hebrew_Letter
857
- :value: "($Hebrew_Letter $FE*)"
858
- -
859
- :id: $Double_Quote
860
- :value: "($Double_Quote $FE*)"
861
- -
862
- :id: $Single_Quote
863
- :value: "($Single_Quote $FE*)"
864
- -
865
- :id: $AHLetter
866
- :value: "($AHLetter $FE*)"
867
- -
868
- :id: $MidNumLetQ
869
- :value: "($MidNumLetQ $FE*)"