annex_29 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,190 @@
1
+ <%
2
+ require("pathname")
3
+
4
# Extended-mode pattern for one data line of the form
# "LOW[..HIGH] ; value [# trailing comment]": lower_bound and the optional
# upper_bound capture runs of hex digits, and category captures everything
# after the semicolon up to (not including) the first "#".
+ property_regex = %r{
5
+ ^
6
+ (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
7
+ \s*
8
+ ;
9
+ \s*
10
+ (?<category>[^\#]+)
11
+ }x
12
+
13
# Same line shape as property_regex, but the property value before the "#" is
# skipped and category instead captures the two-letter token (one uppercase
# letter followed by one lowercase letter) found one character after the "#".
# Lines whose comment does not start with such a token do not match.
+ general_category_regex = %r{
14
+ ^
15
+ (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
16
+ \s*
17
+ ;
18
+ \s*
19
+ (?:[^\#]+)
20
+ \#.
21
+ (?<category>[A-Z][a-z])
22
+ }x
23
+
24
# Parses one data file into a table of code point literals, grouped by category.
#
# file_name - name of the file, resolved relative to the "data" directory.
# regex     - pattern exposing the named captures lower_bound, upper_bound and
#             category; defaults to property_regex.
#
# Returns a Hash whose keys are Symbols (the captured category stripped,
# downcased, with spaces replaced by underscores) and whose values are Arrays
# of "0xLOW" or "0xLOW..0xHIGH" Strings in file order. Looking up a key that was
# never seen stores and returns a fresh empty Array.
# Raises the usual Errno error when the file cannot be read.
parse_unicode_data = ->(file_name, regex = property_regex) do
  data = Hash.new { |hash, key| hash[key] = [] }
  path = Pathname.new("data").join(file_name)

  # File.foreach closes the file when iteration finishes (or raises), so no
  # handle is left open for the garbage collector to reclaim.
  File.foreach(path) do |line|
    match = line.match(regex)
    next unless match # Comment lines, blank lines and anything else unparsable.

    category = match[:category].strip.downcase.tr(" ", "_").to_sym
    lower_bound = match[:lower_bound]
    upper_bound = match[:upper_bound]

    # A single code point has no upper bound and is emitted on its own.
    data[category] << (upper_bound ? "0x#{lower_bound}..0x#{upper_bound}" : "0x#{lower_bound}")
  end

  data
end
38
+
39
# Build the code point tables that the machine definition further down
# interpolates. Each call reads one file from the data directory through
# parse_unicode_data.
+ block_map = parse_unicode_data.("Blocks.txt")
40
+ line_break_map = parse_unicode_data.("LineBreak.txt")
41
+ script_map = parse_unicode_data.("Scripts.txt")
42
# Scripts.txt is read a second time on purpose: general_category_regex picks up
# the two-letter code that follows the "#" on each of its lines.
+ general_category_map = parse_unicode_data.("Scripts.txt", general_category_regex)
43
+ word_break_property_map = parse_unicode_data.("WordBreakProperty.txt")
44
+ %>
45
+
46
# Word segmentation scanner. The part between the Ragel section markers is a
# Ragel machine definition; the ERB tags inside it splice in the code point
# tables computed at the top of this template.
+ module Annex29
47
+ module WordSegmentation
48
+ %%{
49
+
50
+ machine segmenter;
51
+ alphtype int;
52
+
53
# Scanner action: packs the code points of the current token (data[ts...te])
# into a UTF-8 String and appends it to words.
+ action word {
54
+ words << data[ts...te].pack("U*")
55
+ }
56
+
57
# Character classes generated from the parsed data tables.
+ nd = (<%= general_category_map[:nd].join("|") %>);
58
+
59
+ block_half_and_full_forms = (<%= block_map[:halfwidth_and_fullwidth_forms].join("|") %>);
60
+
61
+ lb_complex_context = (<%= line_break_map[:sa].join("|") %>);
62
+
63
+ script_han = (<%= script_map[:han].join("|") %>);
64
+
65
+ script_hangul = (<%= script_map[:hangul].join("|") %>);
66
+
67
+ script_hiragana = (<%= script_map[:hiragana].join("|") %>);
68
+
69
# One wb_<category> class per category found in WordBreakProperty.txt.
+ <% word_break_property_map.each do |category, code_point_range| %>
70
+ <%= "wb_#{category}" %> = (<%= code_point_range.join("|") %>);
71
+ <% end %>
72
+
73
# Any run (possibly empty) of wb_format, wb_extend or wb_zwj code points. Every
# *_ex definition below is a base class followed by such a run.
+ sticky = (wb_format | wb_extend | wb_zwj)*;
74
+
75
+ complex_context_ex = lb_complex_context sticky;
76
+
77
+ double_quote_ex = wb_double_quote sticky;
78
+
79
+ e_base_ex = wb_e_base sticky;
80
+
81
+ e_base_gaz_ex = wb_e_base_gaz sticky;
82
+
83
+ e_modifier_ex = wb_e_modifier sticky;
84
+
85
+ extend_num_let_ex = wb_extendnumlet sticky;
86
+
87
+ han_ex = script_han sticky;
88
+
89
+ hangul_ex = (script_hangul & (wb_aletter | wb_hebrew_letter)) sticky;
90
+
91
+ hebrew_letter_ex = wb_hebrew_letter sticky;
92
+
93
+ hebrew_or_aletter_ex = (wb_aletter | wb_hebrew_letter) sticky;
94
+
95
+ hiragana_ex = script_hiragana sticky;
96
+
97
+ katakana_ex = wb_katakana sticky;
98
+
99
+ mid_letter_ex = (wb_midletter | wb_midnumlet | wb_single_quote) sticky;
100
+
101
+ mid_numeric_ex = (wb_midnum | wb_midnumlet | wb_single_quote) sticky;
102
+
103
+ numeric_ex = (wb_numeric | (block_half_and_full_forms & nd)) sticky;
104
+
105
+ regional_indicator_ex = wb_regional_indicator sticky;
106
+
107
+ single_quote_ex = wb_single_quote sticky;
108
+
109
# Token shapes assembled from the classes above.
+ numeric =
110
+ extend_num_let_ex*
111
+ numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*
112
+ extend_num_let_ex*;
113
+
114
+ hangul = hangul_ex+;
115
+
116
+ katakana = katakana_ex+;
117
+
118
+ south_east_asian = complex_context_ex+;
119
+
120
+ ideographic = han_ex;
121
+
122
+ hiragana = hiragana_ex;
123
+
124
+ extend_num_let = extend_num_let_ex+;
125
+
126
+ inner_word =
127
+ (katakana_ex (extend_num_let_ex* katakana_ex)*) |
128
+ (
129
+ (hebrew_letter_ex (single_quote_ex | (double_quote_ex hebrew_letter_ex))) |
130
+ (numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*) |
131
+ (hebrew_or_aletter_ex ((extend_num_let_ex* | mid_letter_ex) hebrew_or_aletter_ex)*)
132
+ )+;
133
+
134
+ word =
135
+ extend_num_let_ex*
136
+ inner_word
137
+ (extend_num_let_ex+ inner_word)*
138
+ extend_num_let_ex*;
139
+
140
+ newline =
141
+ wb_cr wb_lf |
142
+ wb_lf |
143
+ wb_cr |
144
+ wb_newline;
145
+
146
+ flags = regional_indicator_ex regional_indicator_ex;
147
+
148
+ emoji =
149
+ e_base_ex e_modifier_ex? |
150
+ wb_zwj? e_base_gaz_ex e_modifier_ex? |
151
+ wb_zwj wb_glue_after_zwj sticky;
152
+
153
# Everything the scanner may emit as one token. The final `any` alternative
# lets a single code point that fits no other shape still form a token.
+ word_like =
154
+ numeric |
155
+ hangul |
156
+ katakana |
157
+ word |
158
+ south_east_asian |
159
+ ideographic |
160
+ hiragana |
161
+ extend_num_let |
162
+ flags |
163
+ emoji |
164
+ newline |
165
+ ^(newline) sticky |
166
+ sticky |
167
+ any;
168
+
169
# Scanner entry point: every word_like match runs the word action.
+ main := |*
170
+ word_like => word;
171
+ *|;
172
+
173
+ }%%
174
+
175
+ %% write data;
176
+
177
+ class << self
178
# Splits input into tokens using the machine defined above.
#
# input - the String to segment.
#
# Returns an Array of Strings, one per token, in the order they were matched.
+ def call(input)
179
# The machine's alphabet is int (see alphtype), so it is fed code points.
+ data = input.each_char.map(&:ord)
180
+ eof = data.length
181
# Filled by the word action while the generated code runs.
+ words = []
182
+
183
+ %% write init;
184
+ %% write exec;
185
+
186
+ words
187
+ end
188
+ end
189
+ end
190
+ end
data/lib/annex_29.rb CHANGED
@@ -1,3 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
1
5
  module Annex29
2
6
  require("annex_29/word_segmentation")
3
7
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: annex_29
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
- - Simon Génier
7
+ - Shopify
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-18 00:00:00.000000000 Z
11
+ date: 2024-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '11.3'
19
+ version: '13.1'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '11.3'
26
+ version: '13.1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,17 +39,39 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '3.5'
41
41
  description:
42
- email: simon.genier@shopify.com
42
+ email: developers@shopify.com
43
43
  executables: []
44
44
  extensions: []
45
45
  extra_rdoc_files: []
46
46
  files:
47
+ - ".github/dependabot.yml"
48
+ - ".github/workflows/ci.yml"
49
+ - ".github/workflows/cla.yml"
50
+ - ".gitignore"
51
+ - ".rspec"
52
+ - ".ruby-version"
53
+ - CHANGELOG.md
54
+ - Gemfile
55
+ - Gemfile.lock
56
+ - LICENSE.md
57
+ - README.md
58
+ - Rakefile
59
+ - annex_29.gemspec
60
+ - bin/rake
61
+ - bin/rspec
62
+ - data/Blocks.txt
63
+ - data/LineBreak.txt
64
+ - data/Scripts.txt
65
+ - data/WordBreakProperty.txt
66
+ - data/WordBreakTest.txt
47
67
  - lib/annex_29.rb
68
+ - lib/annex_29/version.rb
48
69
  - lib/annex_29/word_segmentation.rb
70
+ - lib/annex_29/word_segmentation.rl.erb
49
71
  homepage: https://github.com/Shopify/annex-29
50
- licenses:
51
- - Apache-2.0
52
- metadata: {}
72
+ licenses: []
73
+ metadata:
74
+ allowed_push_host: https://rubygems.org/
53
75
  post_install_message:
54
76
  rdoc_options: []
55
77
  require_paths:
@@ -58,15 +80,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
58
80
  requirements:
59
81
  - - ">="
60
82
  - !ruby/object:Gem::Version
61
- version: '0'
83
+ version: 3.2.0
62
84
  required_rubygems_version: !ruby/object:Gem::Requirement
63
85
  requirements:
64
86
  - - ">="
65
87
  - !ruby/object:Gem::Version
66
88
  version: '0'
67
89
  requirements: []
68
- rubyforge_project:
69
- rubygems_version: 2.5.1
90
+ rubygems_version: 3.5.13
70
91
  signing_key:
71
92
  specification_version: 4
72
93
  summary: Unicode annex 29 compliant word segmentation