annex_29 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ <%
2
+ require("pathname")
3
+
4
+ property_regex = %r{
5
+ ^
6
+ (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
7
+ \s*
8
+ ;
9
+ \s*
10
+ (?<category>[^\#]+)
11
+ }x
12
+
13
+ general_category_regex = %r{
14
+ ^
15
+ (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
16
+ \s*
17
+ ;
18
+ \s*
19
+ (?:[^\#]+)
20
+ \#.
21
+ (?<category>[A-Z][a-z])
22
+ }x
23
+
24
# Parses a Unicode Character Database style property file found under the
# local "data" directory into a map of category => code point ranges.
#
# file_name - name of the file inside "data/" (e.g. "Scripts.txt").
# regex     - pattern applied to each line. It must define the named groups
#             "lower_bound", "upper_bound" and "category"; "upper_bound" may
#             be left unmatched for single code points. Lines that do not
#             match are skipped.
#
# Returns a Hash (with an empty-Array default per key) keyed by the category
# as a Symbol (stripped, downcased, spaces replaced by underscores). Values
# are Arrays of Strings such as "0x00AA" or "0x0041..0x005A", in file order.
parse_unicode_data = ->(file_name, regex = property_regex) do
  data = Hash.new { |hash, key| hash[key] = [] }

  # File.foreach closes the file as soon as iteration finishes or raises.
  # The previous File.open(...).each_line left the handle open until GC.
  File.foreach(Pathname.new("data").join(file_name)) do |line|
    next unless (match = line.match(regex))

    lower_bound = match["lower_bound"]
    upper_bound = match["upper_bound"]
    category = match["category"].strip.downcase.tr(" ", "_").to_sym

    data[category] <<
      if upper_bound
        "0x#{lower_bound}..0x#{upper_bound}"
      else
        "0x#{lower_bound}"
      end
  end

  data
end
38
+
39
+ block_map = parse_unicode_data.("Blocks.txt")
40
+ line_break_map = parse_unicode_data.("LineBreak.txt")
41
+ script_map = parse_unicode_data.("Scripts.txt")
42
+ general_category_map = parse_unicode_data.("Scripts.txt", general_category_regex)
43
+ word_break_property_map = parse_unicode_data.("WordBreakProperty.txt")
44
+ %>
45
+
46
+ module Annex29
47
+ module WordSegmentation
48
+ %%{
49
+
50
+ machine segmenter;
51
+ alphtype int;
52
+
53
+ action word {
54
+ words << data[ts...te].pack("U*")
55
+ }
56
+
57
+ nd = (<%= general_category_map[:nd].join("|") %>);
58
+
59
+ block_half_and_full_forms = (<%= block_map[:halfwidth_and_fullwidth_forms].join("|") %>);
60
+
61
+ lb_complex_context = (<%= line_break_map[:sa].join("|") %>);
62
+
63
+ script_han = (<%= script_map[:han].join("|") %>);
64
+
65
+ script_hangul = (<%= script_map[:hangul].join("|") %>);
66
+
67
+ script_hiragana = (<%= script_map[:hiragana].join("|") %>);
68
+
69
+ <% word_break_property_map.each do |category, code_point_range| %>
70
+ <%= "wb_#{category}" %> = (<%= code_point_range.join("|") %>);
71
+ <% end %>
72
+
73
+ sticky = (wb_format | wb_extend | wb_zwj)*;
74
+
75
+ complex_context_ex = lb_complex_context sticky;
76
+
77
+ double_quote_ex = wb_double_quote sticky;
78
+
79
+ e_base_ex = wb_e_base sticky;
80
+
81
+ e_base_gaz_ex = wb_e_base_gaz sticky;
82
+
83
+ e_modifier_ex = wb_e_modifier sticky;
84
+
85
+ extend_num_let_ex = wb_extendnumlet sticky;
86
+
87
+ han_ex = script_han sticky;
88
+
89
+ hangul_ex = (script_hangul & (wb_aletter | wb_hebrew_letter)) sticky;
90
+
91
+ hebrew_letter_ex = wb_hebrew_letter sticky;
92
+
93
+ hebrew_or_aletter_ex = (wb_aletter | wb_hebrew_letter) sticky;
94
+
95
+ hiragana_ex = script_hiragana sticky;
96
+
97
+ katakana_ex = wb_katakana sticky;
98
+
99
+ mid_letter_ex = (wb_midletter | wb_midnumlet | wb_single_quote) sticky;
100
+
101
+ mid_numeric_ex = (wb_midnum | wb_midnumlet | wb_single_quote) sticky;
102
+
103
+ numeric_ex = (wb_numeric | (block_half_and_full_forms & nd)) sticky;
104
+
105
+ regional_indicator_ex = wb_regional_indicator sticky;
106
+
107
+ single_quote_ex = wb_single_quote sticky;
108
+
109
+ numeric =
110
+ extend_num_let_ex*
111
+ numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*
112
+ extend_num_let_ex*;
113
+
114
+ hangul = hangul_ex+;
115
+
116
+ katakana = katakana_ex+;
117
+
118
+ south_east_asian = complex_context_ex+;
119
+
120
+ ideographic = han_ex;
121
+
122
+ hiragana = hiragana_ex;
123
+
124
+ extend_num_let = extend_num_let_ex+;
125
+
126
+ inner_word =
127
+ (katakana_ex (extend_num_let_ex* katakana_ex)*) |
128
+ (
129
+ (hebrew_letter_ex (single_quote_ex | (double_quote_ex hebrew_letter_ex))) |
130
+ (numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*) |
131
+ (hebrew_or_aletter_ex ((extend_num_let_ex* | mid_letter_ex) hebrew_or_aletter_ex)*)
132
+ )+;
133
+
134
+ word =
135
+ extend_num_let_ex*
136
+ inner_word
137
+ (extend_num_let_ex+ inner_word)*
138
+ extend_num_let_ex*;
139
+
140
+ newline =
141
+ wb_cr wb_lf |
142
+ wb_lf |
143
+ wb_cr |
144
+ wb_newline;
145
+
146
+ flags = regional_indicator_ex regional_indicator_ex;
147
+
148
+ emoji =
149
+ e_base_ex e_modifier_ex? |
150
+ wb_zwj? e_base_gaz_ex e_modifier_ex? |
151
+ wb_zwj wb_glue_after_zwj sticky;
152
+
153
+ word_like =
154
+ numeric |
155
+ hangul |
156
+ katakana |
157
+ word |
158
+ south_east_asian |
159
+ ideographic |
160
+ hiragana |
161
+ extend_num_let |
162
+ flags |
163
+ emoji |
164
+ newline |
165
+ ^(newline) sticky |
166
+ sticky |
167
+ any;
168
+
169
+ main := |*
170
+ word_like => word;
171
+ *|;
172
+
173
+ }%%
174
+
175
+ %% write data;
176
+
177
+ class << self
178
+ def call(input)
179
+ data = input.each_char.map(&:ord)
180
+ eof = data.length
181
+ words = []
182
+
183
+ %% write init;
184
+ %% write exec;
185
+
186
+ words
187
+ end
188
+ end
189
+ end
190
+ end
data/lib/annex_29.rb CHANGED
@@ -1,3 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
1
5
  module Annex29
2
6
  require("annex_29/word_segmentation")
3
7
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: annex_29
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
- - Simon Génier
7
+ - Shopify
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-18 00:00:00.000000000 Z
11
+ date: 2024-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '11.3'
19
+ version: '13.1'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '11.3'
26
+ version: '13.1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,17 +39,39 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '3.5'
41
41
  description:
42
- email: simon.genier@shopify.com
42
+ email: developers@shopify.com
43
43
  executables: []
44
44
  extensions: []
45
45
  extra_rdoc_files: []
46
46
  files:
47
+ - ".github/dependabot.yml"
48
+ - ".github/workflows/ci.yml"
49
+ - ".github/workflows/cla.yml"
50
+ - ".gitignore"
51
+ - ".rspec"
52
+ - ".ruby-version"
53
+ - CHANGELOG.md
54
+ - Gemfile
55
+ - Gemfile.lock
56
+ - LICENSE.md
57
+ - README.md
58
+ - Rakefile
59
+ - annex_29.gemspec
60
+ - bin/rake
61
+ - bin/rspec
62
+ - data/Blocks.txt
63
+ - data/LineBreak.txt
64
+ - data/Scripts.txt
65
+ - data/WordBreakProperty.txt
66
+ - data/WordBreakTest.txt
47
67
  - lib/annex_29.rb
68
+ - lib/annex_29/version.rb
48
69
  - lib/annex_29/word_segmentation.rb
70
+ - lib/annex_29/word_segmentation.rl.erb
49
71
  homepage: https://github.com/Shopify/annex-29
50
- licenses:
51
- - Apache-2.0
52
- metadata: {}
72
+ licenses: []
73
+ metadata:
74
+ allowed_push_host: https://rubygems.org/
53
75
  post_install_message:
54
76
  rdoc_options: []
55
77
  require_paths:
@@ -58,15 +80,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
58
80
  requirements:
59
81
  - - ">="
60
82
  - !ruby/object:Gem::Version
61
- version: '0'
83
+ version: 3.2.0
62
84
  required_rubygems_version: !ruby/object:Gem::Requirement
63
85
  requirements:
64
86
  - - ">="
65
87
  - !ruby/object:Gem::Version
66
88
  version: '0'
67
89
  requirements: []
68
- rubyforge_project:
69
- rubygems_version: 2.5.1
90
+ rubygems_version: 3.5.13
70
91
  signing_key:
71
92
  specification_version: 4
72
93
  summary: Unicode annex 29 compliant word segmentation