annex_29 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yml +26 -0
- data/.github/workflows/cla.yml +22 -0
- data/.gitignore +2 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +38 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +35 -0
- data/LICENSE.md +64 -0
- data/README.md +3 -0
- data/Rakefile +28 -0
- data/annex_29.gemspec +28 -0
- data/bin/rake +17 -0
- data/bin/rspec +17 -0
- data/data/Blocks.txt +309 -0
- data/data/LineBreak.txt +3269 -0
- data/data/Scripts.txt +2632 -0
- data/data/WordBreakProperty.txt +1298 -0
- data/data/WordBreakTest.txt +2084 -0
- data/lib/annex_29/version.rb +5 -0
- data/lib/annex_29/word_segmentation.rl.erb +190 -0
- data/lib/annex_29.rb +4 -0
- metadata +32 -12
@@ -0,0 +1,190 @@
|
|
1
|
+
<%
|
2
|
+
require("pathname")
|
3
|
+
|
4
|
+
# Matches one data line of a Unicode data file laid out as
#   "<hex>[..<hex>] ; <value> # comment"
# e.g. "0041..005A    ; ALetter # L&  [26] ...".
# Captures the first code point, the optional end of the range, and the
# property value (everything after the ";" up to a "#" or the end of the line).
# Lines that do not start with a hex code point (comments, blanks) do not match.
property_regex = %r{
  ^
  (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
  \s*
  ;
  \s*
  (?<category>[^\#]+)
}x

# Same line layout, but skips the property value and instead captures the
# two-letter code (one uppercase, one lowercase letter) that follows the "#"
# and one further character, e.g. "Nd" in "... ; Common # Nd  [10] ...".
# NOTE(review): presumably this is the General_Category abbreviation carried in
# the comment column of data/Scripts.txt - confirm against the data file.
# Lines whose code is not of that shape (e.g. "L&") do not match.
general_category_regex = %r{
  ^
  (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
  \s*
  ;
  \s*
  (?:[^\#]+)
  \#.
  (?<category>[A-Z][a-z])
}x

# Reads data/<file_name> (relative to the current working directory) and
# groups its code points by property value.
#
# @param file_name [String] name of a file inside the "data" directory
# @param regex [Regexp] pattern with the named groups lower_bound, upper_bound
#   and category; defaults to property_regex
# @return [Hash{Symbol => Array<String>}] normalised property name (stripped,
#   downcased, spaces replaced by "_") mapped to "0xXXXX" or "0xXXXX..0xYYYY"
#   strings in file order. Unknown keys yield a new empty array.
# @raise [Errno::ENOENT] when the data file does not exist
parse_unicode_data = ->(file_name, regex = property_regex) do
  data = Hash.new { |hash, key| hash[key] = [] }

  # File.foreach closes the file when iteration ends or raises; the previous
  # File.open(...).each_line left the handle open until garbage collection.
  File.foreach(Pathname.new("data").join(file_name)) do |line|
    match = line.match(regex)
    next unless match

    lower_bound = match[:lower_bound]
    upper_bound = match[:upper_bound]
    category = match[:category].strip.downcase.tr(" ", "_").to_sym

    data[category] << (upper_bound ? "0x#{lower_bound}..0x#{upper_bound}" : "0x#{lower_bound}")
  end

  data
end
|
38
|
+
|
39
|
+
# Load the Unicode data tables that the grammar interpolates further down.
# Each map goes from a normalised property name (a Symbol) to the list of
# "0x..." code point / range strings parsed from the corresponding data file.
block_map = parse_unicode_data.call("Blocks.txt")
line_break_map = parse_unicode_data.call("LineBreak.txt")
script_map = parse_unicode_data.call("Scripts.txt")
# Scripts.txt is parsed a second time, keyed by the code captured by
# general_category_regex instead of by the script name.
general_category_map = parse_unicode_data.call("Scripts.txt", general_category_regex)
word_break_property_map = parse_unicode_data.call("WordBreakProperty.txt")
|
44
|
+
%>
|
45
|
+
|
46
|
+
# Word segmentation template. The ERB tags expand the Unicode character
# classes loaded above into the machine definition; judging by the .rl.erb
# file name the expanded text is then compiled by Ragel - confirm in the Rakefile.
module Annex29
|
47
|
+
# Exposes a single entry point, WordSegmentation.call(input).
module WordSegmentation
|
48
|
+
%%{
|
49
|
+
|
50
|
+
# The machine works on integers (Unicode code points), not bytes.
machine segmenter;
|
51
|
+
alphtype int;
|
52
|
+
|
53
|
+
# Runs for every token matched in main: re-encodes the matched slice of
# code points (data[ts...te]) as a string and appends it to words.
action word {
|
54
|
+
words << data[ts...te].pack("U*")
|
55
|
+
}
|
56
|
+
|
57
|
+
# Character classes built from the parsed data files. Each expands to an
# alternation of code points and code point ranges.
nd = (<%= general_category_map[:nd].join("|") %>);
|
58
|
+
|
59
|
+
block_half_and_full_forms = (<%= block_map[:halfwidth_and_fullwidth_forms].join("|") %>);
|
60
|
+
|
61
|
+
lb_complex_context = (<%= line_break_map[:sa].join("|") %>);
|
62
|
+
|
63
|
+
script_han = (<%= script_map[:han].join("|") %>);
|
64
|
+
|
65
|
+
script_hangul = (<%= script_map[:hangul].join("|") %>);
|
66
|
+
|
67
|
+
script_hiragana = (<%= script_map[:hiragana].join("|") %>);
|
68
|
+
|
69
|
+
# Emits one wb_<name> class per key of word_break_property_map; the wb_*
# names used below presumably all come from this loop - confirm against
# data/WordBreakProperty.txt.
<% word_break_property_map.each do |category, code_point_range| %>
|
70
|
+
<%= "wb_#{category}" %> = (<%= code_point_range.join("|") %>);
|
71
|
+
<% end %>
|
72
|
+
|
73
|
+
# Any run of Format / Extend / ZWJ characters. Every *_ex class below is a
# base class followed by such a run, so these characters are consumed
# together with the character they follow.
sticky = (wb_format | wb_extend | wb_zwj)*;
|
74
|
+
|
75
|
+
complex_context_ex = lb_complex_context sticky;
|
76
|
+
|
77
|
+
double_quote_ex = wb_double_quote sticky;
|
78
|
+
|
79
|
+
e_base_ex = wb_e_base sticky;
|
80
|
+
|
81
|
+
e_base_gaz_ex = wb_e_base_gaz sticky;
|
82
|
+
|
83
|
+
e_modifier_ex = wb_e_modifier sticky;
|
84
|
+
|
85
|
+
extend_num_let_ex = wb_extendnumlet sticky;
|
86
|
+
|
87
|
+
han_ex = script_han sticky;
|
88
|
+
|
89
|
+
# Hangul-script characters that are also ALetter or Hebrew_Letter.
hangul_ex = (script_hangul & (wb_aletter | wb_hebrew_letter)) sticky;
|
90
|
+
|
91
|
+
hebrew_letter_ex = wb_hebrew_letter sticky;
|
92
|
+
|
93
|
+
hebrew_or_aletter_ex = (wb_aletter | wb_hebrew_letter) sticky;
|
94
|
+
|
95
|
+
hiragana_ex = script_hiragana sticky;
|
96
|
+
|
97
|
+
katakana_ex = wb_katakana sticky;
|
98
|
+
|
99
|
+
# Characters allowed between two letters / between two digits respectively.
mid_letter_ex = (wb_midletter | wb_midnumlet | wb_single_quote) sticky;
|
100
|
+
|
101
|
+
mid_numeric_ex = (wb_midnum | wb_midnumlet | wb_single_quote) sticky;
|
102
|
+
|
103
|
+
# Numeric characters, plus nd characters from the halfwidth/fullwidth block.
numeric_ex = (wb_numeric | (block_half_and_full_forms & nd)) sticky;
|
104
|
+
|
105
|
+
regional_indicator_ex = wb_regional_indicator sticky;
|
106
|
+
|
107
|
+
single_quote_ex = wb_single_quote sticky;
|
108
|
+
|
109
|
+
# Token shapes. A number is a run of digits that may contain single
# mid-numeric characters or ExtendNumLet runs between digits, and may be
# wrapped in ExtendNumLet characters.
numeric =
|
110
|
+
extend_num_let_ex*
|
111
|
+
numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*
|
112
|
+
extend_num_let_ex*;
|
113
|
+
|
114
|
+
hangul = hangul_ex+;
|
115
|
+
|
116
|
+
katakana = katakana_ex+;
|
117
|
+
|
118
|
+
south_east_asian = complex_context_ex+;
|
119
|
+
|
120
|
+
# Han and Hiragana characters are emitted one at a time (no repetition).
ideographic = han_ex;
|
121
|
+
|
122
|
+
hiragana = hiragana_ex;
|
123
|
+
|
124
|
+
extend_num_let = extend_num_let_ex+;
|
125
|
+
|
126
|
+
# Core of a word: either a Katakana run, or one or more of the Hebrew
# quote forms, digit runs and letter runs concatenated together.
inner_word =
|
127
|
+
(katakana_ex (extend_num_let_ex* katakana_ex)*) |
|
128
|
+
(
|
129
|
+
(hebrew_letter_ex (single_quote_ex | (double_quote_ex hebrew_letter_ex))) |
|
130
|
+
(numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*) |
|
131
|
+
(hebrew_or_aletter_ex ((extend_num_let_ex* | mid_letter_ex) hebrew_or_aletter_ex)*)
|
132
|
+
)+;
|
133
|
+
|
134
|
+
# Inner words joined by ExtendNumLet runs, optionally wrapped in them.
word =
|
135
|
+
extend_num_let_ex*
|
136
|
+
inner_word
|
137
|
+
(extend_num_let_ex+ inner_word)*
|
138
|
+
extend_num_let_ex*;
|
139
|
+
|
140
|
+
# CR LF is listed first so the pair can match as one token.
newline =
|
141
|
+
wb_cr wb_lf |
|
142
|
+
wb_lf |
|
143
|
+
wb_cr |
|
144
|
+
wb_newline;
|
145
|
+
|
146
|
+
# Exactly two regional indicators form one token.
flags = regional_indicator_ex regional_indicator_ex;
|
147
|
+
|
148
|
+
emoji =
|
149
|
+
e_base_ex e_modifier_ex? |
|
150
|
+
wb_zwj? e_base_gaz_ex e_modifier_ex? |
|
151
|
+
wb_zwj wb_glue_after_zwj sticky;
|
152
|
+
|
153
|
+
# Everything the scanner can emit. The trailing alternatives (a non-newline
# character with its sticky run, a bare sticky run, any single code point)
# mean that input matching none of the shapes above is still emitted as a
# token rather than dropped.
word_like =
|
154
|
+
numeric |
|
155
|
+
hangul |
|
156
|
+
katakana |
|
157
|
+
word |
|
158
|
+
south_east_asian |
|
159
|
+
ideographic |
|
160
|
+
hiragana |
|
161
|
+
extend_num_let |
|
162
|
+
flags |
|
163
|
+
emoji |
|
164
|
+
newline |
|
165
|
+
^(newline) sticky |
|
166
|
+
sticky |
|
167
|
+
any;
|
168
|
+
|
169
|
+
# Scanner: every matched word_like token triggers the word action.
main := |*
|
170
|
+
word_like => word;
|
171
|
+
*|;
|
172
|
+
|
173
|
+
}%%
|
174
|
+
|
175
|
+
# The generated state machine tables are written at this point.
%% write data;
|
176
|
+
|
177
|
+
class << self
|
178
|
+
# Splits input into segments.
#
# @param input [String] text to segment
# @return [Array<String>] the matched tokens in input order, one string
#   per firing of the word action
def call(input)
|
179
|
+
# The machine consumes code points, so convert the string up front.
data = input.each_char.map(&:ord)
|
180
|
+
# NOTE(review): eof is presumably read by the generated exec code to
# detect the end of input - confirm against Ragel's Ruby output.
eof = data.length
|
181
|
+
words = []
|
182
|
+
|
183
|
+
# Generated initialisation and execution code is written here; the word
# action appends to words while it runs.
%% write init;
|
184
|
+
%% write exec;
|
185
|
+
|
186
|
+
words
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
data/lib/annex_29.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: annex_29
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- Shopify
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-12-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '13.1'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '13.1'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,17 +39,38 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '3.5'
|
41
41
|
description:
|
42
|
-
email:
|
42
|
+
email: developers@shopify.com
|
43
43
|
executables: []
|
44
44
|
extensions: []
|
45
45
|
extra_rdoc_files: []
|
46
46
|
files:
|
47
|
+
- ".github/workflows/ci.yml"
|
48
|
+
- ".github/workflows/cla.yml"
|
49
|
+
- ".gitignore"
|
50
|
+
- ".rspec"
|
51
|
+
- ".ruby-version"
|
52
|
+
- CHANGELOG.md
|
53
|
+
- Gemfile
|
54
|
+
- Gemfile.lock
|
55
|
+
- LICENSE.md
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- annex_29.gemspec
|
59
|
+
- bin/rake
|
60
|
+
- bin/rspec
|
61
|
+
- data/Blocks.txt
|
62
|
+
- data/LineBreak.txt
|
63
|
+
- data/Scripts.txt
|
64
|
+
- data/WordBreakProperty.txt
|
65
|
+
- data/WordBreakTest.txt
|
47
66
|
- lib/annex_29.rb
|
67
|
+
- lib/annex_29/version.rb
|
48
68
|
- lib/annex_29/word_segmentation.rb
|
69
|
+
- lib/annex_29/word_segmentation.rl.erb
|
49
70
|
homepage: https://github.com/Shopify/annex-29
|
50
|
-
licenses:
|
51
|
-
|
52
|
-
|
71
|
+
licenses: []
|
72
|
+
metadata:
|
73
|
+
allowed_push_host: https://rubygems.org/
|
53
74
|
post_install_message:
|
54
75
|
rdoc_options: []
|
55
76
|
require_paths:
|
@@ -58,15 +79,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
58
79
|
requirements:
|
59
80
|
- - ">="
|
60
81
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
82
|
+
version: 3.2.0
|
62
83
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
84
|
requirements:
|
64
85
|
- - ">="
|
65
86
|
- !ruby/object:Gem::Version
|
66
87
|
version: '0'
|
67
88
|
requirements: []
|
68
|
-
|
69
|
-
rubygems_version: 2.5.1
|
89
|
+
rubygems_version: 3.4.22
|
70
90
|
signing_key:
|
71
91
|
specification_version: 4
|
72
92
|
summary: Unicode annex 29 compliant word segmentation
|