character_set 1.1.1 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +11 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +47 -0
- data/README.md +38 -14
- data/Rakefile +60 -36
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -6
- data/ext/character_set/character_set.c +963 -414
- data/ext/character_set/unicode_casefold_table.h +10 -2
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +25 -27
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/shared_methods.rb +60 -49
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +88 -22
- data/.travis.yml +0 -11
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
data/lib/character_set/writer.rb
CHANGED
@@ -1,37 +1,108 @@
|
|
1
1
|
class CharacterSet
|
2
2
|
module Writer
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
class << self
|
4
|
+
def write(codepoint_ranges, opts = {}, &block)
|
5
|
+
content = codepoint_ranges.map do |range|
|
6
|
+
if range.size > 2 && opts[:abbreviate] != false
|
7
|
+
bounds = [range.min, range.max]
|
8
|
+
bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
|
9
|
+
else
|
10
|
+
range.map { |cp| write_codepoint(cp, opts, &block) }.join
|
11
|
+
end
|
12
|
+
end.join
|
13
|
+
opts[:in_brackets] ? "[#{content}]" : content
|
14
|
+
end
|
15
|
+
|
16
|
+
def write_codepoint(codepoint, opts = {}, &block)
|
17
|
+
Character.new(codepoint).escape(opts, &block)
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_surrogate_ranges(bmp_ranges, astral_ranges)
|
21
|
+
astral_branches = surrogate_range_expressions(astral_ranges)
|
22
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
23
|
+
end
|
24
|
+
|
25
|
+
def write_surrogate_alternation(bmp_ranges, astral_ranges)
|
26
|
+
astral_branches = surrogate_pairs(astral_ranges)
|
27
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def surrogate_range_expressions(astral_ranges)
|
33
|
+
compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
|
34
|
+
[hi_ranges, lo_ranges].map do |ranges|
|
35
|
+
use_brackets = ranges.size > 1 || ranges.first.size > 1
|
36
|
+
write(ranges, format: :js, in_brackets: use_brackets)
|
37
|
+
end.join
|
11
38
|
end
|
12
|
-
end
|
13
|
-
|
14
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
def compressed_surrogate_range_pairs(astral_ranges)
|
42
|
+
halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
|
15
43
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
44
|
+
# compress high surrogate codepoint ranges with common low range half
|
45
|
+
with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
|
46
|
+
hi_ranges = pairs.map(&:first)
|
47
|
+
compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
|
48
|
+
prev = arr.last
|
49
|
+
if prev.nil? || prev.max + 1 < range.min # first or gap
|
50
|
+
arr << range
|
51
|
+
else # continuous codepoints, expand previous range
|
52
|
+
arr[-1] = (prev.min)..(range.max)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
[compressed_hi_ranges, lo_range]
|
56
|
+
end
|
57
|
+
|
58
|
+
# compress low surrogate codepoint ranges with common high ranges
|
59
|
+
with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
|
60
|
+
(hash[hi_ranges] ||= []) << lo_range
|
61
|
+
end
|
23
62
|
end
|
24
|
-
end
|
25
63
|
|
26
|
-
|
27
|
-
|
28
|
-
|
64
|
+
def surrogate_half_ranges(astral_range)
|
65
|
+
hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
|
66
|
+
hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
|
67
|
+
hi_count = 1 + hi_max - hi_min
|
68
|
+
return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
|
69
|
+
|
70
|
+
ranges = []
|
71
|
+
|
72
|
+
# first high surrogate might be partially covered (if lo_min > 0xDC00)
|
73
|
+
ranges << [hi_min..hi_min, lo_min..0xDFFF]
|
74
|
+
|
75
|
+
# any high surrogates in between are fully covered
|
76
|
+
ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
|
29
77
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
78
|
+
# last high surrogate might be partially covered (if lo_max < 0xDFFF)
|
79
|
+
ranges << [hi_max..hi_max, 0xDC00..lo_max]
|
80
|
+
|
81
|
+
ranges
|
82
|
+
end
|
83
|
+
|
84
|
+
def surrogate_pair_codepoints(astral_codepoint)
|
85
|
+
base = astral_codepoint - 0x10000
|
86
|
+
high = base / 1024 + 0xD800
|
87
|
+
low = base % 1024 + 0xDC00
|
88
|
+
[high, low]
|
89
|
+
end
|
90
|
+
|
91
|
+
def bmp_set_with_alternatives(bmp_ranges, alternatives)
|
92
|
+
bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
|
93
|
+
return bmp_set if alternatives.empty? && bmp_ranges.any?
|
94
|
+
|
95
|
+
"(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
|
96
|
+
end
|
97
|
+
|
98
|
+
def surrogate_pairs(astral_ranges)
|
99
|
+
astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
|
100
|
+
end
|
101
|
+
|
102
|
+
def surrogate_pair(astral_codepoint)
|
103
|
+
surrogate_pair_codepoints(astral_codepoint)
|
104
|
+
.map { |half| write_codepoint(half, format: :js) }.join
|
105
|
+
end
|
35
106
|
end
|
36
107
|
end
|
37
108
|
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: character_set
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Janosch Müller
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: sorted_set
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: benchmark-ips
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -25,47 +39,47 @@ dependencies:
|
|
25
39
|
- !ruby/object:Gem::Version
|
26
40
|
version: '2.7'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
42
|
+
name: get_process_mem
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - "~>"
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
47
|
+
version: 0.2.3
|
34
48
|
type: :development
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
52
|
- - "~>"
|
39
53
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
54
|
+
version: 0.2.3
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
61
|
+
version: '13.0'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
68
|
+
version: '13.0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake-compiler
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
75
|
+
version: '1.1'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
82
|
+
version: '1.1'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: range_compressor
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,28 +100,28 @@ dependencies:
|
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '1.
|
103
|
+
version: '1.6'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '1.
|
110
|
+
version: '1.6'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: regexp_property_values
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: 0
|
117
|
+
version: '1.0'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: 0
|
124
|
+
version: '1.0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: rspec
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,7 +136,35 @@ dependencies:
|
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '3.8'
|
125
|
-
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: codecov
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.2.12
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.2.12
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: rubocop
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.8'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '1.8'
|
167
|
+
description:
|
126
168
|
email:
|
127
169
|
- janosch84@gmail.com
|
128
170
|
executables: []
|
@@ -130,26 +172,36 @@ extensions:
|
|
130
172
|
- ext/character_set/extconf.rb
|
131
173
|
extra_rdoc_files: []
|
132
174
|
files:
|
175
|
+
- ".gitattributes"
|
176
|
+
- ".github/workflows/lint.yml"
|
177
|
+
- ".github/workflows/tests.yml"
|
133
178
|
- ".gitignore"
|
134
179
|
- ".rspec"
|
135
|
-
- ".
|
180
|
+
- ".rubocop.yml"
|
136
181
|
- BENCHMARK.md
|
137
182
|
- CHANGELOG.md
|
138
183
|
- Gemfile
|
139
184
|
- LICENSE.txt
|
140
185
|
- README.md
|
141
186
|
- Rakefile
|
187
|
+
- benchmarks/count_in.rb
|
142
188
|
- benchmarks/cover.rb
|
143
189
|
- benchmarks/delete_in.rb
|
144
190
|
- benchmarks/keep_in.rb
|
191
|
+
- benchmarks/scan.rb
|
145
192
|
- benchmarks/shared.rb
|
146
193
|
- benchmarks/used_by.rb
|
194
|
+
- benchmarks/z_add.rb
|
195
|
+
- benchmarks/z_delete.rb
|
196
|
+
- benchmarks/z_merge.rb
|
197
|
+
- benchmarks/z_minmax.rb
|
147
198
|
- bin/console
|
148
199
|
- bin/setup
|
149
200
|
- character_set.gemspec
|
150
201
|
- ext/character_set/character_set.c
|
151
202
|
- ext/character_set/extconf.rb
|
152
203
|
- ext/character_set/unicode_casefold_table.h
|
204
|
+
- ext/character_set/unicode_casefold_table.h.tmpl
|
153
205
|
- lib/character_set.rb
|
154
206
|
- lib/character_set/character.rb
|
155
207
|
- lib/character_set/core_ext.rb
|
@@ -158,20 +210,35 @@ files:
|
|
158
210
|
- lib/character_set/expression_converter.rb
|
159
211
|
- lib/character_set/parser.rb
|
160
212
|
- lib/character_set/predefined_sets.rb
|
213
|
+
- lib/character_set/predefined_sets/any.cps
|
214
|
+
- lib/character_set/predefined_sets/ascii.cps
|
215
|
+
- lib/character_set/predefined_sets/ascii_alnum.cps
|
216
|
+
- lib/character_set/predefined_sets/ascii_letter.cps
|
217
|
+
- lib/character_set/predefined_sets/assigned.cps
|
218
|
+
- lib/character_set/predefined_sets/bmp.cps
|
219
|
+
- lib/character_set/predefined_sets/crypt.cps
|
220
|
+
- lib/character_set/predefined_sets/emoji.cps
|
221
|
+
- lib/character_set/predefined_sets/newline.cps
|
222
|
+
- lib/character_set/predefined_sets/surrogate.cps
|
223
|
+
- lib/character_set/predefined_sets/unicode.cps
|
224
|
+
- lib/character_set/predefined_sets/url_fragment.cps
|
225
|
+
- lib/character_set/predefined_sets/url_host.cps
|
226
|
+
- lib/character_set/predefined_sets/url_path.cps
|
227
|
+
- lib/character_set/predefined_sets/url_query.cps
|
228
|
+
- lib/character_set/predefined_sets/whitespace.cps
|
161
229
|
- lib/character_set/pure.rb
|
162
230
|
- lib/character_set/ruby_fallback.rb
|
163
231
|
- lib/character_set/ruby_fallback/character_set_methods.rb
|
164
|
-
- lib/character_set/ruby_fallback/plane_methods.rb
|
165
232
|
- lib/character_set/ruby_fallback/set_methods.rb
|
166
233
|
- lib/character_set/set_method_adapters.rb
|
167
234
|
- lib/character_set/shared_methods.rb
|
168
235
|
- lib/character_set/version.rb
|
169
236
|
- lib/character_set/writer.rb
|
170
|
-
homepage: https://github.com/
|
237
|
+
homepage: https://github.com/jaynetics/character_set
|
171
238
|
licenses:
|
172
239
|
- MIT
|
173
240
|
metadata: {}
|
174
|
-
post_install_message:
|
241
|
+
post_install_message:
|
175
242
|
rdoc_options: []
|
176
243
|
require_paths:
|
177
244
|
- lib
|
@@ -186,9 +253,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
186
253
|
- !ruby/object:Gem::Version
|
187
254
|
version: '0'
|
188
255
|
requirements: []
|
189
|
-
|
190
|
-
|
191
|
-
signing_key:
|
256
|
+
rubygems_version: 3.2.3
|
257
|
+
signing_key:
|
192
258
|
specification_version: 4
|
193
259
|
summary: Build, read, write and compare sets of Unicode codepoints.
|
194
260
|
test_files: []
|
data/.travis.yml
DELETED
data/lib/character_set/ruby_fallback/plane_methods.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
class CharacterSet
|
2
|
-
module RubyFallback
|
3
|
-
module PlaneMethods
|
4
|
-
def bmp_part
|
5
|
-
dup.keep_if { |cp| cp < 0x10000 }
|
6
|
-
end
|
7
|
-
|
8
|
-
def astral_part
|
9
|
-
dup.keep_if { |cp| cp >= 0x10000 }
|
10
|
-
end
|
11
|
-
|
12
|
-
def planes
|
13
|
-
plane_set = {}
|
14
|
-
plane_size = 0x10000.to_f
|
15
|
-
each do |cp|
|
16
|
-
plane = (cp / plane_size).floor
|
17
|
-
plane_set[plane] = true
|
18
|
-
end
|
19
|
-
plane_set.keys
|
20
|
-
end
|
21
|
-
|
22
|
-
def member_in_plane?(num)
|
23
|
-
((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|