twitter_cldr 5.1.0 → 5.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +5 -5
- data/lib/twitter_cldr.rb +1 -0
- data/lib/twitter_cldr/resources.rb +2 -8
- data/lib/twitter_cldr/resources/loader.rb +6 -4
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
- data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
- data/lib/twitter_cldr/segmentation.rb +10 -8
- data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
- data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
- data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
- data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
- data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
- data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
- data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
- data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
- data/lib/twitter_cldr/shared/caser.rb +1 -1
- data/lib/twitter_cldr/shared/locale.rb +6 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/segments/rules/el/sentence.yml +723 -0
- data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
- data/resources/shared/segments/rules/ja/line.yml +964 -0
- data/resources/shared/segments/rules/ja/word.yml +527 -0
- data/resources/shared/segments/rules/root/grapheme.yml +463 -0
- data/resources/shared/segments/rules/root/line.yml +964 -0
- data/resources/shared/segments/rules/root/sentence.yml +723 -0
- data/resources/shared/segments/rules/root/word.yml +527 -0
- data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
- data/resources/shared/segments/rules/zh/line.yml +964 -0
- data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
- data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
- data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
- data/resources/shared/segments/tests/line_break_test.yml +7348 -0
- data/resources/uli/segments/de.yml +5 -230
- data/resources/uli/segments/en.yml +3 -154
- data/resources/uli/segments/es.yml +5 -145
- data/resources/uli/segments/fr.yml +5 -68
- data/resources/uli/segments/it.yml +3 -48
- data/resources/uli/segments/pt.yml +5 -173
- data/resources/uli/segments/ru.yml +3 -10
- data/spec/segmentation/rule_set_spec.rb +54 -27
- metadata +29 -9
- data/lib/twitter_cldr/resources/uli.rb +0 -12
- data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
- data/lib/twitter_cldr/segmentation/parser.rb +0 -71
- data/lib/twitter_cldr/segmentation/rule.rb +0 -79
- data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
- data/resources/shared/segments/segments_root.yml +0 -869
- data/spec/segmentation/parser_spec.rb +0 -104
@@ -1,12 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
module TwitterCldr
|
7
|
-
module Resources
|
8
|
-
module Uli
|
9
|
-
autoload :SegmentExceptionsImporter, 'twitter_cldr/resources/uli/segment_exceptions_importer'
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
@@ -1,59 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'fileutils'
|
7
|
-
require 'open-uri'
|
8
|
-
require 'json'
|
9
|
-
|
10
|
-
module TwitterCldr
|
11
|
-
module Resources
|
12
|
-
module Uli
|
13
|
-
class SegmentExceptionsImporter < Resources::Importer
|
14
|
-
|
15
|
-
REPO_URL = 'https://github.com/unicode-org/uli.git'.freeze
|
16
|
-
GIT_SHA = '6acce954b913b121b6ab4bd4f8395e74dce2ae7c'.freeze
|
17
|
-
|
18
|
-
requirement :git, REPO_URL, GIT_SHA
|
19
|
-
output_path 'uli/segments'
|
20
|
-
ruby_engine :mri
|
21
|
-
|
22
|
-
def execute
|
23
|
-
FileUtils.mkdir_p(output_path)
|
24
|
-
each_file { |file| import_file(file) }
|
25
|
-
end
|
26
|
-
|
27
|
-
private
|
28
|
-
|
29
|
-
def output_path
|
30
|
-
params.fetch(:output_path)
|
31
|
-
end
|
32
|
-
|
33
|
-
def import_file(file)
|
34
|
-
locale = File.basename(file).chomp('.json')
|
35
|
-
output_file = File.join(output_path, "#{locale}.yml")
|
36
|
-
exceptions = JSON.parse(File.read(file))
|
37
|
-
|
38
|
-
File.open(output_file, 'w:utf-8') do |output|
|
39
|
-
output.write(
|
40
|
-
TwitterCldr::Utils::YAML.dump(
|
41
|
-
TwitterCldr::Utils.deep_symbolize_keys(locale => { exceptions: exceptions['data']['abbrs'] }),
|
42
|
-
use_natural_symbols: true
|
43
|
-
)
|
44
|
-
)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def each_file(&block)
|
49
|
-
Dir.glob(File.join(input_path, 'abbrs', 'json', '*.json')).each(&block)
|
50
|
-
end
|
51
|
-
|
52
|
-
def input_path
|
53
|
-
requirements[:git].source_path
|
54
|
-
end
|
55
|
-
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
@@ -1,71 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
module TwitterCldr
|
7
|
-
module Segmentation
|
8
|
-
class Parser
|
9
|
-
|
10
|
-
def parse(text, options = {})
|
11
|
-
left_str, boundary_symbol_str, right_str = text.split(/([÷×])/)
|
12
|
-
boundary_symbol = boundary_symbol_for(boundary_symbol_str)
|
13
|
-
left = compile_token_list(tokenize_regex(left_str || ''), options)
|
14
|
-
right = compile_token_list(tokenize_regex(right_str || ''), options)
|
15
|
-
klass = class_for(boundary_symbol)
|
16
|
-
klass.new(left, right)
|
17
|
-
end
|
18
|
-
|
19
|
-
def tokenize_regex(text)
|
20
|
-
regex_tokenizer.tokenize(text).reject do |token|
|
21
|
-
token.value.strip.empty?
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
private
|
26
|
-
|
27
|
-
def boundary_symbol_for(str)
|
28
|
-
case str
|
29
|
-
when '÷' then :break
|
30
|
-
when '×' then :no_break
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def class_for(boundary_symbol)
|
35
|
-
case boundary_symbol
|
36
|
-
when :break
|
37
|
-
BreakRule
|
38
|
-
when :no_break
|
39
|
-
NoBreakRule
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def compile_token_list(token_list, options)
|
44
|
-
if token_list.empty?
|
45
|
-
TwitterCldr::Shared::UnicodeRegex.compile('')
|
46
|
-
else
|
47
|
-
parse_regex(token_list, options)
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
def parse_regex(tokens, options)
|
52
|
-
unless tokens.empty?
|
53
|
-
TwitterCldr::Shared::UnicodeRegex.new(
|
54
|
-
regex_parser.parse(tokens, options), 'm'
|
55
|
-
)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
def regex_tokenizer
|
60
|
-
@tokenizer ||=
|
61
|
-
TwitterCldr::Tokenizers::UnicodeRegexTokenizer.new
|
62
|
-
end
|
63
|
-
|
64
|
-
def regex_parser
|
65
|
-
@regex_parser ||=
|
66
|
-
TwitterCldr::Parsers::UnicodeRegexParser.new
|
67
|
-
end
|
68
|
-
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
@@ -1,79 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
module TwitterCldr
|
7
|
-
module Segmentation
|
8
|
-
RuleMatchData = Struct.new(
|
9
|
-
:rule, :boundary_offset, :boundary_position
|
10
|
-
)
|
11
|
-
|
12
|
-
class Rule
|
13
|
-
|
14
|
-
attr_reader :left, :right
|
15
|
-
attr_accessor :string, :id
|
16
|
-
|
17
|
-
def initialize(left, right)
|
18
|
-
@left = left
|
19
|
-
@right = right
|
20
|
-
end
|
21
|
-
|
22
|
-
def match(cursor)
|
23
|
-
left_match = match_side(left, cursor.text, cursor.position)
|
24
|
-
return nil unless left_match
|
25
|
-
left_match_offset = offset(left_match, cursor.position)
|
26
|
-
|
27
|
-
right_match = match_side(right, cursor.text, left_match_offset.last)
|
28
|
-
return nil unless right_match
|
29
|
-
right_match_offset = offset(right_match, left_match_offset.last)
|
30
|
-
|
31
|
-
offset = [left_match_offset.first, right_match_offset.last]
|
32
|
-
position = left_match_offset.last
|
33
|
-
|
34
|
-
RuleMatchData.new(self, offset, position)
|
35
|
-
end
|
36
|
-
|
37
|
-
private
|
38
|
-
|
39
|
-
def offset(match, default)
|
40
|
-
if match
|
41
|
-
match.offset(0)
|
42
|
-
else
|
43
|
-
[default, default]
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def match_side(side, text, position)
|
48
|
-
if side
|
49
|
-
side_match = side.match(text, position)
|
50
|
-
|
51
|
-
if side_match && side_match.begin(0) == position
|
52
|
-
side_match
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
class BreakRule < Rule
|
59
|
-
def boundary_symbol
|
60
|
-
:break
|
61
|
-
end
|
62
|
-
|
63
|
-
def break?
|
64
|
-
true
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
class NoBreakRule < Rule
|
69
|
-
def boundary_symbol
|
70
|
-
:no_break
|
71
|
-
end
|
72
|
-
|
73
|
-
def break?
|
74
|
-
false
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
end
|
79
|
-
end
|
@@ -1,142 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
module TwitterCldr
|
7
|
-
module Segmentation
|
8
|
-
class RuleSetBuilder
|
9
|
-
|
10
|
-
class << self
|
11
|
-
def load(locale, boundary_type, options = {})
|
12
|
-
rules = compile_rules_for(boundary_type)
|
13
|
-
RuleSet.new(locale, rules, boundary_type, options)
|
14
|
-
end
|
15
|
-
|
16
|
-
# See the comment above exceptions_for. Basically, we only support exceptions
|
17
|
-
# for the "sentence" boundary type since the ULI JSON data doesn't distinguish
|
18
|
-
# between boundary types.
|
19
|
-
def exception_rule_for(locale, boundary_type)
|
20
|
-
cache_key = TwitterCldr::Utils.compute_cache_key(locale, boundary_type)
|
21
|
-
exceptions_cache[cache_key] ||= begin
|
22
|
-
exceptions = exceptions_for(locale, boundary_type)
|
23
|
-
regex_contents = exceptions.map { |exc| Regexp.escape(exc) }.join("|")
|
24
|
-
parse("(?:#{regex_contents}) ×", nil).tap do |rule|
|
25
|
-
rule.id = 0
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
# The implicit final rule is always "Any ÷ Any"
|
31
|
-
def implicit_final_rule
|
32
|
-
@implicit_final_rule ||=
|
33
|
-
parse('. ÷ .', nil).tap do |rule|
|
34
|
-
rule.id = 9999
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# The implicit initial rules are always "start-of-text ÷"
|
39
|
-
# and "÷ end-of-text". We don't need the start-of-text one.
|
40
|
-
def implicit_end_of_text_rule
|
41
|
-
@implicit_end_of_text_rule ||=
|
42
|
-
parse('.\z ÷', nil).tap do |rule|
|
43
|
-
rule.id = 9998
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
private
|
48
|
-
|
49
|
-
# The boundary_type param is not currently used since the ULI JSON resource that
|
50
|
-
# exceptions are generated from does not distinguish between boundary types. The
|
51
|
-
# XML version does, however, so the JSON will hopefully catch up at some point and
|
52
|
-
# we can make use of this second parameter. For the time being, compile_exception_rule_for
|
53
|
-
# (which calls this function) assumes a "sentence" boundary type.
|
54
|
-
def exceptions_for(locale, boundary_type)
|
55
|
-
exceptions_resource_cache[locale] ||= begin
|
56
|
-
TwitterCldr.get_resource('uli', 'segments', locale)[locale][:exceptions]
|
57
|
-
rescue Resources::ResourceLoadError
|
58
|
-
[]
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
def boundary_name_for(str)
|
63
|
-
str.gsub(/(?:^|\_)([A-Za-z])/) { |s| $1.upcase } + 'Break'
|
64
|
-
end
|
65
|
-
|
66
|
-
# tokenizes and parses rules from segment_root
|
67
|
-
def compile_rules_for(boundary_type)
|
68
|
-
rule_cache[boundary_type] ||= begin
|
69
|
-
boundary_name = boundary_name_for(boundary_type)
|
70
|
-
boundary_data = resource_for(boundary_name)
|
71
|
-
symbol_table = symbol_table_for(boundary_data)
|
72
|
-
rules_for(boundary_data, symbol_table)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
def symbol_table_for(boundary_data)
|
77
|
-
table = TwitterCldr::Parsers::SymbolTable.new
|
78
|
-
boundary_data[:variables].each do |variable|
|
79
|
-
id = variable[:id].to_s
|
80
|
-
tokens = segmentation_parser.tokenize_regex(variable[:value])
|
81
|
-
# note: variables can be redefined (add replaces if key already exists)
|
82
|
-
table.add(id, resolve_symbols(tokens, table))
|
83
|
-
end
|
84
|
-
table
|
85
|
-
end
|
86
|
-
|
87
|
-
def resolve_symbols(tokens, symbol_table)
|
88
|
-
tokens.inject([]) do |ret, token|
|
89
|
-
if token.type == :variable
|
90
|
-
ret += symbol_table.fetch(token.value)
|
91
|
-
else
|
92
|
-
ret << token
|
93
|
-
end
|
94
|
-
ret
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def rules_for(boundary_data, symbol_table)
|
99
|
-
boundary_data[:rules].map do |rule|
|
100
|
-
r = parse(rule[:value], symbol_table)
|
101
|
-
r.string = rule[:value]
|
102
|
-
r.id = rule[:id]
|
103
|
-
r
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def parse(text, symbol_table)
|
108
|
-
segmentation_parser.parse(
|
109
|
-
text, { symbol_table: symbol_table }
|
110
|
-
)
|
111
|
-
end
|
112
|
-
|
113
|
-
def resource_for(boundary_name)
|
114
|
-
root_resource[:segments][boundary_name.to_sym]
|
115
|
-
end
|
116
|
-
|
117
|
-
def segmentation_parser
|
118
|
-
@segmentation_parser ||= Segmentation::Parser.new
|
119
|
-
end
|
120
|
-
|
121
|
-
def root_resource
|
122
|
-
@root_resource ||= TwitterCldr.get_resource(
|
123
|
-
'shared', 'segments', 'segments_root'
|
124
|
-
)
|
125
|
-
end
|
126
|
-
|
127
|
-
def rule_cache
|
128
|
-
@rule_cache ||= {}
|
129
|
-
end
|
130
|
-
|
131
|
-
def exceptions_resource_cache
|
132
|
-
@exceptions_resource_cache ||= {}
|
133
|
-
end
|
134
|
-
|
135
|
-
def exceptions_cache
|
136
|
-
@exceptions_cache ||= {}
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
end
|
141
|
-
end
|
142
|
-
end
|
@@ -1,869 +0,0 @@
|
|
1
|
-
---
|
2
|
-
:segments:
|
3
|
-
:GraphemeClusterBreak:
|
4
|
-
:rules:
|
5
|
-
-
|
6
|
-
:id: 3
|
7
|
-
:value: " $CR × $LF "
|
8
|
-
-
|
9
|
-
:id: 4
|
10
|
-
:value: " ( $Control | $CR | $LF ) ÷ "
|
11
|
-
-
|
12
|
-
:id: 5
|
13
|
-
:value: " ÷ ( $Control | $CR | $LF ) "
|
14
|
-
-
|
15
|
-
:id: 6
|
16
|
-
:value: " $L × ( $L | $V | $LV | $LVT ) "
|
17
|
-
-
|
18
|
-
:id: 7
|
19
|
-
:value: " ( $LV | $V ) × ( $V | $T ) "
|
20
|
-
-
|
21
|
-
:id: 8
|
22
|
-
:value: " ( $LVT | $T) × $T "
|
23
|
-
-
|
24
|
-
:id: 9
|
25
|
-
:value: " × ($Extend | $ZWJ) "
|
26
|
-
-
|
27
|
-
:id: 9.1
|
28
|
-
:value: " × $SpacingMark "
|
29
|
-
-
|
30
|
-
:id: 9.2
|
31
|
-
:value: " $Prepend × "
|
32
|
-
-
|
33
|
-
:id: 9.3
|
34
|
-
:value: " $LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* × $LinkingConsonant "
|
35
|
-
-
|
36
|
-
:id: 11
|
37
|
-
:value: " $ExtPict $Extend* $ZWJ × $ExtPict "
|
38
|
-
-
|
39
|
-
:id: 12
|
40
|
-
:value: " ^ ($RI $RI)* $RI × $RI "
|
41
|
-
-
|
42
|
-
:id: 13
|
43
|
-
:value: " [^$RI] ($RI $RI)* $RI × $RI "
|
44
|
-
:variables:
|
45
|
-
-
|
46
|
-
:id: $CR
|
47
|
-
:value: "\\p{Grapheme_Cluster_Break=CR}"
|
48
|
-
-
|
49
|
-
:id: $LF
|
50
|
-
:value: "\\p{Grapheme_Cluster_Break=LF}"
|
51
|
-
-
|
52
|
-
:id: $Control
|
53
|
-
:value: "\\p{Grapheme_Cluster_Break=Control}"
|
54
|
-
-
|
55
|
-
:id: $Extend
|
56
|
-
:value: "\\p{Grapheme_Cluster_Break=Extend}"
|
57
|
-
-
|
58
|
-
:id: $ZWJ
|
59
|
-
:value: "\\p{Grapheme_Cluster_Break=ZWJ}"
|
60
|
-
-
|
61
|
-
:id: $RI
|
62
|
-
:value: "\\p{Grapheme_Cluster_Break=Regional_Indicator}"
|
63
|
-
-
|
64
|
-
:id: $Prepend
|
65
|
-
:value: "\\p{Grapheme_Cluster_Break=Prepend}"
|
66
|
-
-
|
67
|
-
:id: $SpacingMark
|
68
|
-
:value: "\\p{Grapheme_Cluster_Break=SpacingMark}"
|
69
|
-
-
|
70
|
-
:id: $L
|
71
|
-
:value: "\\p{Grapheme_Cluster_Break=L}"
|
72
|
-
-
|
73
|
-
:id: $V
|
74
|
-
:value: "\\p{Grapheme_Cluster_Break=V}"
|
75
|
-
-
|
76
|
-
:id: $T
|
77
|
-
:value: "\\p{Grapheme_Cluster_Break=T}"
|
78
|
-
-
|
79
|
-
:id: $LV
|
80
|
-
:value: "\\p{Grapheme_Cluster_Break=LV}"
|
81
|
-
-
|
82
|
-
:id: $LVT
|
83
|
-
:value: "\\p{Grapheme_Cluster_Break=LVT}"
|
84
|
-
-
|
85
|
-
:id: $Virama
|
86
|
-
:value: "[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]"
|
87
|
-
-
|
88
|
-
:id: $LinkingConsonant
|
89
|
-
:value: "[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]"
|
90
|
-
-
|
91
|
-
:id: $ExtPict
|
92
|
-
:value: "\\p{Extended_Pictographic}"
|
93
|
-
-
|
94
|
-
:id: $ExtCccZwj
|
95
|
-
:value: "[[$Extend-\\p{ccc=0}] $ZWJ]"
|
96
|
-
:LineBreak:
|
97
|
-
:rules:
|
98
|
-
-
|
99
|
-
:id: 4
|
100
|
-
:value: " $BK ÷ "
|
101
|
-
-
|
102
|
-
:id: 5.01
|
103
|
-
:value: " $CR × $LF "
|
104
|
-
-
|
105
|
-
:id: 5.02
|
106
|
-
:value: " $CR ÷ "
|
107
|
-
-
|
108
|
-
:id: 5.03
|
109
|
-
:value: " $LF ÷ "
|
110
|
-
-
|
111
|
-
:id: 5.04
|
112
|
-
:value: " $NL ÷ "
|
113
|
-
-
|
114
|
-
:id: 6
|
115
|
-
:value: " × ( $BK | $CR | $LF | $NL ) "
|
116
|
-
-
|
117
|
-
:id: 7.01
|
118
|
-
:value: " × $SP "
|
119
|
-
-
|
120
|
-
:id: 7.02
|
121
|
-
:value: " × $ZW "
|
122
|
-
-
|
123
|
-
:id: 8
|
124
|
-
:value: " $ZW $SP* ÷ "
|
125
|
-
-
|
126
|
-
:id: 8.1
|
127
|
-
:value: " $ZWJ_O × "
|
128
|
-
-
|
129
|
-
:id: 9
|
130
|
-
:value: " $Spec2_ × $CM "
|
131
|
-
-
|
132
|
-
:id: 11.01
|
133
|
-
:value: " × $WJ "
|
134
|
-
-
|
135
|
-
:id: 11.02
|
136
|
-
:value: " $WJ × "
|
137
|
-
-
|
138
|
-
:id: 12
|
139
|
-
:value: " $GL × "
|
140
|
-
-
|
141
|
-
:id: 12.1
|
142
|
-
:value: " $Spec3a_ × $GL "
|
143
|
-
-
|
144
|
-
:id: 12.2
|
145
|
-
:value: " $Spec3b_ $CM+ × $GL "
|
146
|
-
-
|
147
|
-
:id: 12.3
|
148
|
-
:value: " ^ $CM+ × $GL "
|
149
|
-
-
|
150
|
-
:id: 13.01
|
151
|
-
:value: " × $EX "
|
152
|
-
-
|
153
|
-
:id: 13.02
|
154
|
-
:value: " $Spec4_ × ($CL | $CP | $IS | $SY) "
|
155
|
-
-
|
156
|
-
:id: 13.03
|
157
|
-
:value: " $Spec4_ $CM+ × ($CL | $CP | $IS | $SY) "
|
158
|
-
-
|
159
|
-
:id: 13.04
|
160
|
-
:value: " ^ $CM+ × ($CL | $CP | $IS | $SY) "
|
161
|
-
-
|
162
|
-
:id: 14
|
163
|
-
:value: " $OP $SP* × "
|
164
|
-
-
|
165
|
-
:id: 15
|
166
|
-
:value: " $QU $SP* × $OP "
|
167
|
-
-
|
168
|
-
:id: 16
|
169
|
-
:value: " ($CL | $CP) $SP* × $NS "
|
170
|
-
-
|
171
|
-
:id: 17
|
172
|
-
:value: " $B2 $SP* × $B2 "
|
173
|
-
-
|
174
|
-
:id: 18
|
175
|
-
:value: " $SP ÷ "
|
176
|
-
-
|
177
|
-
:id: 19.01
|
178
|
-
:value: " × $QU "
|
179
|
-
-
|
180
|
-
:id: 19.02
|
181
|
-
:value: " $QU × "
|
182
|
-
-
|
183
|
-
:id: 20.01
|
184
|
-
:value: " ÷ $CB "
|
185
|
-
-
|
186
|
-
:id: 20.02
|
187
|
-
:value: " $CB ÷ "
|
188
|
-
-
|
189
|
-
:id: 20.09
|
190
|
-
:value: " $Spec5_ $HY × $AL "
|
191
|
-
-
|
192
|
-
:id: 21.01
|
193
|
-
:value: " × $BA "
|
194
|
-
-
|
195
|
-
:id: 21.02
|
196
|
-
:value: " × $HY "
|
197
|
-
-
|
198
|
-
:id: 21.03
|
199
|
-
:value: " × $NS "
|
200
|
-
-
|
201
|
-
:id: 21.04
|
202
|
-
:value: " $BB × "
|
203
|
-
-
|
204
|
-
:id: 21.1
|
205
|
-
:value: " $HL ($HY | $BA) × "
|
206
|
-
-
|
207
|
-
:id: 21.2
|
208
|
-
:value: " $SY × $HL "
|
209
|
-
-
|
210
|
-
:id: 22.01
|
211
|
-
:value: " ($AL | $HL) × $IN "
|
212
|
-
-
|
213
|
-
:id: 22.02
|
214
|
-
:value: " $EX × $IN "
|
215
|
-
-
|
216
|
-
:id: 22.03
|
217
|
-
:value: " ($ID | $EB | $EM) × $IN "
|
218
|
-
-
|
219
|
-
:id: 22.04
|
220
|
-
:value: " $IN × $IN "
|
221
|
-
-
|
222
|
-
:id: 22.05
|
223
|
-
:value: " $NU × $IN "
|
224
|
-
-
|
225
|
-
:id: 23.02
|
226
|
-
:value: " ($AL | $HL) × $NU "
|
227
|
-
-
|
228
|
-
:id: 23.03
|
229
|
-
:value: " $NU × ($AL | $HL) "
|
230
|
-
-
|
231
|
-
:id: 23.12
|
232
|
-
:value: " $PR × ($ID | $EB | $EM) "
|
233
|
-
-
|
234
|
-
:id: 23.13
|
235
|
-
:value: " ($ID | $EB | $EM) × $PO "
|
236
|
-
-
|
237
|
-
:id: 24.02
|
238
|
-
:value: " ($PR | $PO) × ($AL | $HL) "
|
239
|
-
-
|
240
|
-
:id: 24.03
|
241
|
-
:value: " ($AL | $HL) × ($PR | $PO) "
|
242
|
-
-
|
243
|
-
:id: 25.01
|
244
|
-
:value: " ($PR | $PO) × ( $OP | $HY )? $NU "
|
245
|
-
-
|
246
|
-
:id: 25.02
|
247
|
-
:value: " ( $OP | $HY ) × $NU "
|
248
|
-
-
|
249
|
-
:id: 25.03
|
250
|
-
:value: " $NU × ($NU | $SY | $IS) "
|
251
|
-
-
|
252
|
-
:id: 25.04
|
253
|
-
:value: " $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP) "
|
254
|
-
-
|
255
|
-
:id: 25.05
|
256
|
-
:value: " $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR) "
|
257
|
-
-
|
258
|
-
:id: 26.01
|
259
|
-
:value: " $JL × $JL | $JV | $H2 | $H3 "
|
260
|
-
-
|
261
|
-
:id: 26.02
|
262
|
-
:value: " $JV | $H2 × $JV | $JT "
|
263
|
-
-
|
264
|
-
:id: 26.03
|
265
|
-
:value: " $JT | $H3 × $JT "
|
266
|
-
-
|
267
|
-
:id: 27.01
|
268
|
-
:value: " $JL | $JV | $JT | $H2 | $H3 × $IN "
|
269
|
-
-
|
270
|
-
:id: 27.02
|
271
|
-
:value: " $JL | $JV | $JT | $H2 | $H3 × $PO "
|
272
|
-
-
|
273
|
-
:id: 27.03
|
274
|
-
:value: " $PR × $JL | $JV | $JT | $H2 | $H3 "
|
275
|
-
-
|
276
|
-
:id: 28
|
277
|
-
:value: " ($AL | $HL) × ($AL | $HL) "
|
278
|
-
-
|
279
|
-
:id: 29
|
280
|
-
:value: " $IS × ($AL | $HL) "
|
281
|
-
-
|
282
|
-
:id: 30.01
|
283
|
-
:value: " ($AL | $HL | $NU) × $OP "
|
284
|
-
-
|
285
|
-
:id: 30.02
|
286
|
-
:value: " $CP × ($AL | $HL | $NU) "
|
287
|
-
-
|
288
|
-
:id: 30.11
|
289
|
-
:value: " ^ ($RI $RI)* $RI × $RI "
|
290
|
-
-
|
291
|
-
:id: 30.12
|
292
|
-
:value: " [^$RI] ($RI $RI)* $RI × $RI "
|
293
|
-
-
|
294
|
-
:id: 30.13
|
295
|
-
:value: " $RI ÷ $RI "
|
296
|
-
-
|
297
|
-
:id: 30.2
|
298
|
-
:value: " $EB × $EM "
|
299
|
-
:variables:
|
300
|
-
-
|
301
|
-
:id: $AI
|
302
|
-
:value: "\\p{Line_Break=Ambiguous}"
|
303
|
-
-
|
304
|
-
:id: $AL
|
305
|
-
:value: "\\p{Line_Break=Alphabetic}"
|
306
|
-
-
|
307
|
-
:id: $B2
|
308
|
-
:value: "\\p{Line_Break=Break_Both}"
|
309
|
-
-
|
310
|
-
:id: $BA
|
311
|
-
:value: "\\p{Line_Break=Break_After}"
|
312
|
-
-
|
313
|
-
:id: $BB
|
314
|
-
:value: "\\p{Line_Break=Break_Before}"
|
315
|
-
-
|
316
|
-
:id: $BK
|
317
|
-
:value: "\\p{Line_Break=Mandatory_Break}"
|
318
|
-
-
|
319
|
-
:id: $CB
|
320
|
-
:value: "\\p{Line_Break=Contingent_Break}"
|
321
|
-
-
|
322
|
-
:id: $CL
|
323
|
-
:value: "\\p{Line_Break=Close_Punctuation}"
|
324
|
-
-
|
325
|
-
:id: $CP
|
326
|
-
:value: "\\p{Line_Break=CP}"
|
327
|
-
-
|
328
|
-
:id: $CM1
|
329
|
-
:value: "\\p{Line_Break=Combining_Mark}"
|
330
|
-
-
|
331
|
-
:id: $CR
|
332
|
-
:value: "\\p{Line_Break=Carriage_Return}"
|
333
|
-
-
|
334
|
-
:id: $EX
|
335
|
-
:value: "\\p{Line_Break=Exclamation}"
|
336
|
-
-
|
337
|
-
:id: $GL
|
338
|
-
:value: "\\p{Line_Break=Glue}"
|
339
|
-
-
|
340
|
-
:id: $H2
|
341
|
-
:value: "\\p{Line_Break=H2}"
|
342
|
-
-
|
343
|
-
:id: $H3
|
344
|
-
:value: "\\p{Line_Break=H3}"
|
345
|
-
-
|
346
|
-
:id: $HL
|
347
|
-
:value: "\\p{Line_Break=HL}"
|
348
|
-
-
|
349
|
-
:id: $HY
|
350
|
-
:value: "\\p{Line_Break=Hyphen}"
|
351
|
-
-
|
352
|
-
:id: $ID
|
353
|
-
:value: "\\p{Line_Break=Ideographic}"
|
354
|
-
-
|
355
|
-
:id: $IN
|
356
|
-
:value: "\\p{Line_Break=Inseparable}"
|
357
|
-
-
|
358
|
-
:id: $IS
|
359
|
-
:value: "\\p{Line_Break=Infix_Numeric}"
|
360
|
-
-
|
361
|
-
:id: $JL
|
362
|
-
:value: "\\p{Line_Break=JL}"
|
363
|
-
-
|
364
|
-
:id: $JT
|
365
|
-
:value: "\\p{Line_Break=JT}"
|
366
|
-
-
|
367
|
-
:id: $JV
|
368
|
-
:value: "\\p{Line_Break=JV}"
|
369
|
-
-
|
370
|
-
:id: $LF
|
371
|
-
:value: "\\p{Line_Break=Line_Feed}"
|
372
|
-
-
|
373
|
-
:id: $NL
|
374
|
-
:value: "\\p{Line_Break=Next_Line}"
|
375
|
-
-
|
376
|
-
:id: $NS
|
377
|
-
:value: "\\p{Line_Break=Nonstarter}"
|
378
|
-
-
|
379
|
-
:id: $NU
|
380
|
-
:value: "\\p{Line_Break=Numeric}"
|
381
|
-
-
|
382
|
-
:id: $OP
|
383
|
-
:value: "\\p{Line_Break=Open_Punctuation}"
|
384
|
-
-
|
385
|
-
:id: $PO
|
386
|
-
:value: "\\p{Line_Break=Postfix_Numeric}"
|
387
|
-
-
|
388
|
-
:id: $PR
|
389
|
-
:value: "\\p{Line_Break=Prefix_Numeric}"
|
390
|
-
-
|
391
|
-
:id: $QU
|
392
|
-
:value: "\\p{Line_Break=Quotation}"
|
393
|
-
-
|
394
|
-
:id: $SA
|
395
|
-
:value: "\\p{Line_Break=Complex_Context}"
|
396
|
-
-
|
397
|
-
:id: $SG
|
398
|
-
:value: "\\p{Line_Break=Surrogate}"
|
399
|
-
-
|
400
|
-
:id: $SP
|
401
|
-
:value: "\\p{Line_Break=Space}"
|
402
|
-
-
|
403
|
-
:id: $SY
|
404
|
-
:value: "\\p{Line_Break=Break_Symbols}"
|
405
|
-
-
|
406
|
-
:id: $WJ
|
407
|
-
:value: "\\p{Line_Break=Word_Joiner}"
|
408
|
-
-
|
409
|
-
:id: $XX
|
410
|
-
:value: "\\p{Line_Break=Unknown}"
|
411
|
-
-
|
412
|
-
:id: $ZW
|
413
|
-
:value: "\\p{Line_Break=ZWSpace}"
|
414
|
-
-
|
415
|
-
:id: $CJ
|
416
|
-
:value: "\\p{Line_Break=Conditional_Japanese_Starter}"
|
417
|
-
-
|
418
|
-
:id: $RI
|
419
|
-
:value: "\\p{Line_Break=Regional_Indicator}"
|
420
|
-
-
|
421
|
-
:id: $EB
|
422
|
-
:value: "\\p{Line_Break=E_Base}"
|
423
|
-
-
|
424
|
-
:id: $EM
|
425
|
-
:value: "\\p{Line_Break=E_Modifier}"
|
426
|
-
-
|
427
|
-
:id: $ZWJ_O
|
428
|
-
:value: "\\p{Line_Break=ZWJ}"
|
429
|
-
-
|
430
|
-
:id: $ZWJ
|
431
|
-
:value: "\\p{Line_Break=ZWJ}"
|
432
|
-
-
|
433
|
-
:id: $CM
|
434
|
-
:value: "[$CM1 $ZWJ]"
|
435
|
-
-
|
436
|
-
:id: $AL
|
437
|
-
:value: "[$AI $AL $SG $XX $SA]"
|
438
|
-
-
|
439
|
-
:id: $NS
|
440
|
-
:value: "[$NS $CJ]"
|
441
|
-
-
|
442
|
-
:id: $X
|
443
|
-
:value: $CM*
|
444
|
-
-
|
445
|
-
:id: $Spec1_
|
446
|
-
:value: "[$SP $BK $CR $LF $NL $ZW]"
|
447
|
-
-
|
448
|
-
:id: $Spec2_
|
449
|
-
:value: "[^ $SP $BK $CR $LF $NL $ZW]"
|
450
|
-
-
|
451
|
-
:id: $Spec3a_
|
452
|
-
:value: "[^ $SP $BA $HY $CM]"
|
453
|
-
-
|
454
|
-
:id: $Spec3b_
|
455
|
-
:value: "[^ $BA $HY $CM]"
|
456
|
-
-
|
457
|
-
:id: $Spec4_
|
458
|
-
:value: "[^ $NU $CM]"
|
459
|
-
-
|
460
|
-
:id: $Spec5_
|
461
|
-
:value: "[$BK $CB $CR $LF $NL $SP $ZW]"
|
462
|
-
-
|
463
|
-
:id: $AI
|
464
|
-
:value: "($AI $X)"
|
465
|
-
-
|
466
|
-
:id: $AL
|
467
|
-
:value: "($AL $X)"
|
468
|
-
-
|
469
|
-
:id: $B2
|
470
|
-
:value: "($B2 $X)"
|
471
|
-
-
|
472
|
-
:id: $BA
|
473
|
-
:value: "($BA $X)"
|
474
|
-
-
|
475
|
-
:id: $BB
|
476
|
-
:value: "($BB $X)"
|
477
|
-
-
|
478
|
-
:id: $CB
|
479
|
-
:value: "($CB $X)"
|
480
|
-
-
|
481
|
-
:id: $CL
|
482
|
-
:value: "($CL $X)"
|
483
|
-
-
|
484
|
-
:id: $CP
|
485
|
-
:value: "($CP $X)"
|
486
|
-
-
|
487
|
-
:id: $CM
|
488
|
-
:value: "($CM $X)"
|
489
|
-
-
|
490
|
-
:id: $EX
|
491
|
-
:value: "($EX $X)"
|
492
|
-
-
|
493
|
-
:id: $GL
|
494
|
-
:value: "($GL $X)"
|
495
|
-
-
|
496
|
-
:id: $H2
|
497
|
-
:value: "($H2 $X)"
|
498
|
-
-
|
499
|
-
:id: $H3
|
500
|
-
:value: "($H3 $X)"
|
501
|
-
-
|
502
|
-
:id: $HL
|
503
|
-
:value: "($HL $X)"
|
504
|
-
-
|
505
|
-
:id: $HY
|
506
|
-
:value: "($HY $X)"
|
507
|
-
-
|
508
|
-
:id: $ID
|
509
|
-
:value: "($ID $X)"
|
510
|
-
-
|
511
|
-
:id: $IN
|
512
|
-
:value: "($IN $X)"
|
513
|
-
-
|
514
|
-
:id: $IS
|
515
|
-
:value: "($IS $X)"
|
516
|
-
-
|
517
|
-
:id: $JL
|
518
|
-
:value: "($JL $X)"
|
519
|
-
-
|
520
|
-
:id: $JT
|
521
|
-
:value: "($JT $X)"
|
522
|
-
-
|
523
|
-
:id: $JV
|
524
|
-
:value: "($JV $X)"
|
525
|
-
-
|
526
|
-
:id: $NS
|
527
|
-
:value: "($NS $X)"
|
528
|
-
-
|
529
|
-
:id: $NU
|
530
|
-
:value: "($NU $X)"
|
531
|
-
-
|
532
|
-
:id: $OP
|
533
|
-
:value: "($OP $X)"
|
534
|
-
-
|
535
|
-
:id: $PO
|
536
|
-
:value: "($PO $X)"
|
537
|
-
-
|
538
|
-
:id: $PR
|
539
|
-
:value: "($PR $X)"
|
540
|
-
-
|
541
|
-
:id: $QU
|
542
|
-
:value: "($QU $X)"
|
543
|
-
-
|
544
|
-
:id: $SA
|
545
|
-
:value: "($SA $X)"
|
546
|
-
-
|
547
|
-
:id: $SG
|
548
|
-
:value: "($SG $X)"
|
549
|
-
-
|
550
|
-
:id: $SY
|
551
|
-
:value: "($SY $X)"
|
552
|
-
-
|
553
|
-
:id: $WJ
|
554
|
-
:value: "($WJ $X)"
|
555
|
-
-
|
556
|
-
:id: $XX
|
557
|
-
:value: "($XX $X)"
|
558
|
-
-
|
559
|
-
:id: $RI
|
560
|
-
:value: "($RI $X)"
|
561
|
-
-
|
562
|
-
:id: $EB
|
563
|
-
:value: "($EB $X)"
|
564
|
-
-
|
565
|
-
:id: $EM
|
566
|
-
:value: "($EM $X)"
|
567
|
-
-
|
568
|
-
:id: $ZWJ
|
569
|
-
:value: "($ZWJ $X)"
|
570
|
-
-
|
571
|
-
:id: $AL
|
572
|
-
:value: "($AL | ^ $CM | (?<=$Spec1_) $CM)"
|
573
|
-
:SentenceBreak:
|
574
|
-
:rules:
|
575
|
-
-
|
576
|
-
:id: 3
|
577
|
-
:value: " $CR × $LF "
|
578
|
-
-
|
579
|
-
:id: 4
|
580
|
-
:value: " $ParaSep ÷ "
|
581
|
-
-
|
582
|
-
:id: 5
|
583
|
-
:value: " × [$Format $Extend] "
|
584
|
-
-
|
585
|
-
:id: 6
|
586
|
-
:value: " $ATerm × $Numeric "
|
587
|
-
-
|
588
|
-
:id: 7
|
589
|
-
:value: " ($Upper | $Lower) $ATerm × $Upper "
|
590
|
-
-
|
591
|
-
:id: 8
|
592
|
-
:value: " $ATerm $Close* $Sp* × $NotPreLower_* $Lower "
|
593
|
-
-
|
594
|
-
:id: 8.1
|
595
|
-
:value: " $SATerm $Close* $Sp* × ($SContinue | $SATerm) "
|
596
|
-
-
|
597
|
-
:id: 9
|
598
|
-
:value: " $SATerm $Close* × ( $Close | $Sp | $ParaSep ) "
|
599
|
-
-
|
600
|
-
:id: 10
|
601
|
-
:value: " $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) "
|
602
|
-
-
|
603
|
-
:id: 11
|
604
|
-
:value: " $SATerm $Close* $Sp* $ParaSep? ÷ "
|
605
|
-
-
|
606
|
-
:id: 998
|
607
|
-
:value: " × $Any "
|
608
|
-
:variables:
|
609
|
-
-
|
610
|
-
:id: $CR
|
611
|
-
:value: "\\p{Sentence_Break=CR}"
|
612
|
-
-
|
613
|
-
:id: $LF
|
614
|
-
:value: "\\p{Sentence_Break=LF}"
|
615
|
-
-
|
616
|
-
:id: $Extend
|
617
|
-
:value: "\\p{Sentence_Break=Extend}"
|
618
|
-
-
|
619
|
-
:id: $Format
|
620
|
-
:value: "\\p{Sentence_Break=Format}"
|
621
|
-
-
|
622
|
-
:id: $Sep
|
623
|
-
:value: "\\p{Sentence_Break=Sep}"
|
624
|
-
-
|
625
|
-
:id: $Sp
|
626
|
-
:value: "\\p{Sentence_Break=Sp}"
|
627
|
-
-
|
628
|
-
:id: $Lower
|
629
|
-
:value: "\\p{Sentence_Break=Lower}"
|
630
|
-
-
|
631
|
-
:id: $Upper
|
632
|
-
:value: "\\p{Sentence_Break=Upper}"
|
633
|
-
-
|
634
|
-
:id: $OLetter
|
635
|
-
:value: "\\p{Sentence_Break=OLetter}"
|
636
|
-
-
|
637
|
-
:id: $Numeric
|
638
|
-
:value: "\\p{Sentence_Break=Numeric}"
|
639
|
-
-
|
640
|
-
:id: $ATerm
|
641
|
-
:value: "\\p{Sentence_Break=ATerm}"
|
642
|
-
-
|
643
|
-
:id: $STerm
|
644
|
-
:value: "\\p{Sentence_Break=STerm}"
|
645
|
-
-
|
646
|
-
:id: $Close
|
647
|
-
:value: "\\p{Sentence_Break=Close}"
|
648
|
-
-
|
649
|
-
:id: $SContinue
|
650
|
-
:value: "\\p{Sentence_Break=SContinue}"
|
651
|
-
-
|
652
|
-
:id: $Any
|
653
|
-
:value: "."
|
654
|
-
-
|
655
|
-
:id: $FE
|
656
|
-
:value: "[$Format $Extend]"
|
657
|
-
-
|
658
|
-
:id: $NotPreLower_
|
659
|
-
:value: "[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]"
|
660
|
-
-
|
661
|
-
:id: $Sp
|
662
|
-
:value: "($Sp $FE*)"
|
663
|
-
-
|
664
|
-
:id: $Lower
|
665
|
-
:value: "($Lower $FE*)"
|
666
|
-
-
|
667
|
-
:id: $Upper
|
668
|
-
:value: "($Upper $FE*)"
|
669
|
-
-
|
670
|
-
:id: $OLetter
|
671
|
-
:value: "($OLetter $FE*)"
|
672
|
-
-
|
673
|
-
:id: $Numeric
|
674
|
-
:value: "($Numeric $FE*)"
|
675
|
-
-
|
676
|
-
:id: $ATerm
|
677
|
-
:value: "($ATerm $FE*)"
|
678
|
-
-
|
679
|
-
:id: $STerm
|
680
|
-
:value: "($STerm $FE*)"
|
681
|
-
-
|
682
|
-
:id: $Close
|
683
|
-
:value: "($Close $FE*)"
|
684
|
-
-
|
685
|
-
:id: $SContinue
|
686
|
-
:value: "($SContinue $FE*)"
|
687
|
-
-
|
688
|
-
:id: $ParaSep
|
689
|
-
:value: "($Sep | $CR | $LF)"
|
690
|
-
-
|
691
|
-
:id: $SATerm
|
692
|
-
:value: "($STerm | $ATerm)"
|
693
|
-
:WordBreak:
|
694
|
-
:rules:
|
695
|
-
-
|
696
|
-
:id: 3
|
697
|
-
:value: " $CR × $LF "
|
698
|
-
-
|
699
|
-
:id: 3.1
|
700
|
-
:value: " ($Newline | $CR | $LF) ÷ "
|
701
|
-
-
|
702
|
-
:id: 3.2
|
703
|
-
:value: " ÷ ($Newline | $CR | $LF) "
|
704
|
-
-
|
705
|
-
:id: 3.3
|
706
|
-
:value: " $ZWJ × $ExtPict "
|
707
|
-
-
|
708
|
-
:id: 3.4
|
709
|
-
:value: " $WSegSpace × $WSegSpace "
|
710
|
-
-
|
711
|
-
:id: 4
|
712
|
-
:value: " $NotBreak_ × [$Format $Extend $ZWJ] "
|
713
|
-
-
|
714
|
-
:id: 5
|
715
|
-
:value: " $AHLetter × $AHLetter "
|
716
|
-
-
|
717
|
-
:id: 6
|
718
|
-
:value: " $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter "
|
719
|
-
-
|
720
|
-
:id: 7
|
721
|
-
:value: " $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter "
|
722
|
-
-
|
723
|
-
:id: 7.1
|
724
|
-
:value: " $Hebrew_Letter × $Single_Quote "
|
725
|
-
-
|
726
|
-
:id: 7.2
|
727
|
-
:value: " $Hebrew_Letter × $Double_Quote $Hebrew_Letter "
|
728
|
-
-
|
729
|
-
:id: 7.3
|
730
|
-
:value: " $Hebrew_Letter $Double_Quote × $Hebrew_Letter "
|
731
|
-
-
|
732
|
-
:id: 8
|
733
|
-
:value: " $Numeric × $Numeric "
|
734
|
-
-
|
735
|
-
:id: 9
|
736
|
-
:value: " $AHLetter × $Numeric "
|
737
|
-
-
|
738
|
-
:id: 10
|
739
|
-
:value: " $Numeric × $AHLetter "
|
740
|
-
-
|
741
|
-
:id: 11
|
742
|
-
:value: " $Numeric ($MidNum | $MidNumLetQ) × $Numeric "
|
743
|
-
-
|
744
|
-
:id: 12
|
745
|
-
:value: " $Numeric × ($MidNum | $MidNumLetQ) $Numeric "
|
746
|
-
-
|
747
|
-
:id: 13
|
748
|
-
:value: " $Katakana × $Katakana "
|
749
|
-
-
|
750
|
-
:id: 13.1
|
751
|
-
:value: " ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet "
|
752
|
-
-
|
753
|
-
:id: 13.2
|
754
|
-
:value: " $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) "
|
755
|
-
-
|
756
|
-
:id: 15
|
757
|
-
:value: " ^ ($RI $RI)* $RI × $RI "
|
758
|
-
-
|
759
|
-
:id: 16
|
760
|
-
:value: " [^$RI] ($RI $RI)* $RI × $RI "
|
761
|
-
:variables:
|
762
|
-
-
|
763
|
-
:id: $CR
|
764
|
-
:value: "\\p{Word_Break=CR}"
|
765
|
-
-
|
766
|
-
:id: $LF
|
767
|
-
:value: "\\p{Word_Break=LF}"
|
768
|
-
-
|
769
|
-
:id: $Newline
|
770
|
-
:value: "\\p{Word_Break=Newline}"
|
771
|
-
-
|
772
|
-
:id: $Extend
|
773
|
-
:value: "\\p{Word_Break=Extend}"
|
774
|
-
-
|
775
|
-
:id: $Format
|
776
|
-
:value: "\\p{Word_Break=Format}"
|
777
|
-
-
|
778
|
-
:id: $Katakana
|
779
|
-
:value: "\\p{Word_Break=Katakana}"
|
780
|
-
-
|
781
|
-
:id: $ALetter
|
782
|
-
:value: "\\p{Word_Break=ALetter}"
|
783
|
-
-
|
784
|
-
:id: $MidLetter
|
785
|
-
:value: "\\p{Word_Break=MidLetter}"
|
786
|
-
-
|
787
|
-
:id: $MidNum
|
788
|
-
:value: "\\p{Word_Break=MidNum}"
|
789
|
-
-
|
790
|
-
:id: $MidNumLet
|
791
|
-
:value: "\\p{Word_Break=MidNumLet}"
|
792
|
-
-
|
793
|
-
:id: $Numeric
|
794
|
-
:value: "\\p{Word_Break=Numeric}"
|
795
|
-
-
|
796
|
-
:id: $ExtendNumLet
|
797
|
-
:value: "\\p{Word_Break=ExtendNumLet}"
|
798
|
-
-
|
799
|
-
:id: $RI
|
800
|
-
:value: "\\p{Word_Break=Regional_Indicator}"
|
801
|
-
-
|
802
|
-
:id: $Hebrew_Letter
|
803
|
-
:value: "\\p{Word_Break=Hebrew_Letter}"
|
804
|
-
-
|
805
|
-
:id: $Double_Quote
|
806
|
-
:value: "\\p{Word_Break=Double_Quote}"
|
807
|
-
-
|
808
|
-
:id: $Single_Quote
|
809
|
-
:value: "\\p{Word_Break=Single_Quote}"
|
810
|
-
-
|
811
|
-
:id: $ZWJ
|
812
|
-
:value: "\\p{Word_Break=ZWJ}"
|
813
|
-
-
|
814
|
-
:id: $ExtPict
|
815
|
-
:value: "\\p{Extended_Pictographic}"
|
816
|
-
-
|
817
|
-
:id: $WSegSpace
|
818
|
-
:value: "\\p{Word_Break=WSegSpace}"
|
819
|
-
-
|
820
|
-
:id: $AHLetter
|
821
|
-
:value: "($ALetter | $Hebrew_Letter)"
|
822
|
-
-
|
823
|
-
:id: $MidNumLetQ
|
824
|
-
:value: "($MidNumLet | $Single_Quote)"
|
825
|
-
-
|
826
|
-
:id: $FE
|
827
|
-
:value: "[$Format $Extend $ZWJ]"
|
828
|
-
-
|
829
|
-
:id: $NotBreak_
|
830
|
-
:value: "[^ $Newline $CR $LF ]"
|
831
|
-
-
|
832
|
-
:id: $Katakana
|
833
|
-
:value: "($Katakana $FE*)"
|
834
|
-
-
|
835
|
-
:id: $ALetter
|
836
|
-
:value: "($ALetter $FE*)"
|
837
|
-
-
|
838
|
-
:id: $MidLetter
|
839
|
-
:value: "($MidLetter $FE*)"
|
840
|
-
-
|
841
|
-
:id: $MidNum
|
842
|
-
:value: "($MidNum $FE*)"
|
843
|
-
-
|
844
|
-
:id: $MidNumLet
|
845
|
-
:value: "($MidNumLet $FE*)"
|
846
|
-
-
|
847
|
-
:id: $Numeric
|
848
|
-
:value: "($Numeric $FE*)"
|
849
|
-
-
|
850
|
-
:id: $ExtendNumLet
|
851
|
-
:value: "($ExtendNumLet $FE*)"
|
852
|
-
-
|
853
|
-
:id: $RI
|
854
|
-
:value: "($RI $FE*)"
|
855
|
-
-
|
856
|
-
:id: $Hebrew_Letter
|
857
|
-
:value: "($Hebrew_Letter $FE*)"
|
858
|
-
-
|
859
|
-
:id: $Double_Quote
|
860
|
-
:value: "($Double_Quote $FE*)"
|
861
|
-
-
|
862
|
-
:id: $Single_Quote
|
863
|
-
:value: "($Single_Quote $FE*)"
|
864
|
-
-
|
865
|
-
:id: $AHLetter
|
866
|
-
:value: "($AHLetter $FE*)"
|
867
|
-
-
|
868
|
-
:id: $MidNumLetQ
|
869
|
-
:value: "($MidNumLetQ $FE*)"
|