character_set 1.1.1-java → 1.4.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +11 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +47 -0
- data/README.md +38 -14
- data/Rakefile +60 -36
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -6
- data/ext/character_set/character_set.c +963 -414
- data/ext/character_set/unicode_casefold_table.h +10 -2
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +25 -27
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/shared_methods.rb +60 -49
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +102 -22
- data/.travis.yml +0 -11
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
data/lib/character_set/writer.rb
CHANGED
@@ -1,37 +1,108 @@
|
|
1
1
|
class CharacterSet
|
2
2
|
module Writer
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
class << self
|
4
|
+
def write(codepoint_ranges, opts = {}, &block)
|
5
|
+
content = codepoint_ranges.map do |range|
|
6
|
+
if range.size > 2 && opts[:abbreviate] != false
|
7
|
+
bounds = [range.min, range.max]
|
8
|
+
bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
|
9
|
+
else
|
10
|
+
range.map { |cp| write_codepoint(cp, opts, &block) }.join
|
11
|
+
end
|
12
|
+
end.join
|
13
|
+
opts[:in_brackets] ? "[#{content}]" : content
|
14
|
+
end
|
15
|
+
|
16
|
+
def write_codepoint(codepoint, opts = {}, &block)
|
17
|
+
Character.new(codepoint).escape(opts, &block)
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_surrogate_ranges(bmp_ranges, astral_ranges)
|
21
|
+
astral_branches = surrogate_range_expressions(astral_ranges)
|
22
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
23
|
+
end
|
24
|
+
|
25
|
+
def write_surrogate_alternation(bmp_ranges, astral_ranges)
|
26
|
+
astral_branches = surrogate_pairs(astral_ranges)
|
27
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def surrogate_range_expressions(astral_ranges)
|
33
|
+
compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
|
34
|
+
[hi_ranges, lo_ranges].map do |ranges|
|
35
|
+
use_brackets = ranges.size > 1 || ranges.first.size > 1
|
36
|
+
write(ranges, format: :js, in_brackets: use_brackets)
|
37
|
+
end.join
|
11
38
|
end
|
12
|
-
end
|
13
|
-
|
14
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
def compressed_surrogate_range_pairs(astral_ranges)
|
42
|
+
halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
|
15
43
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
44
|
+
# compress high surrogate codepoint ranges with common low range half
|
45
|
+
with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
|
46
|
+
hi_ranges = pairs.map(&:first)
|
47
|
+
compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
|
48
|
+
prev = arr.last
|
49
|
+
if prev.nil? || prev.max + 1 < range.min # first or gap
|
50
|
+
arr << range
|
51
|
+
else # continuous codepoints, expand previous range
|
52
|
+
arr[-1] = (prev.min)..(range.max)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
[compressed_hi_ranges, lo_range]
|
56
|
+
end
|
57
|
+
|
58
|
+
# compress low surrogate codepoint ranges with common high ranges
|
59
|
+
with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
|
60
|
+
(hash[hi_ranges] ||= []) << lo_range
|
61
|
+
end
|
23
62
|
end
|
24
|
-
end
|
25
63
|
|
26
|
-
|
27
|
-
|
28
|
-
|
64
|
+
def surrogate_half_ranges(astral_range)
|
65
|
+
hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
|
66
|
+
hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
|
67
|
+
hi_count = 1 + hi_max - hi_min
|
68
|
+
return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
|
69
|
+
|
70
|
+
ranges = []
|
71
|
+
|
72
|
+
# first high surrogate might be partially covered (if lo_min > 0xDC00)
|
73
|
+
ranges << [hi_min..hi_min, lo_min..0xDFFF]
|
74
|
+
|
75
|
+
# any high surrogates in between are fully covered
|
76
|
+
ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
|
29
77
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
78
|
+
# last high surrogate might be partially covered (if lo_max < 0xDFFF)
|
79
|
+
ranges << [hi_max..hi_max, 0xDC00..lo_max]
|
80
|
+
|
81
|
+
ranges
|
82
|
+
end
|
83
|
+
|
84
|
+
def surrogate_pair_codepoints(astral_codepoint)
|
85
|
+
base = astral_codepoint - 0x10000
|
86
|
+
high = base / 1024 + 0xD800
|
87
|
+
low = base % 1024 + 0xDC00
|
88
|
+
[high, low]
|
89
|
+
end
|
90
|
+
|
91
|
+
def bmp_set_with_alternatives(bmp_ranges, alternatives)
|
92
|
+
bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
|
93
|
+
return bmp_set if alternatives.empty? && bmp_ranges.any?
|
94
|
+
|
95
|
+
"(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
|
96
|
+
end
|
97
|
+
|
98
|
+
def surrogate_pairs(astral_ranges)
|
99
|
+
astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
|
100
|
+
end
|
101
|
+
|
102
|
+
def surrogate_pair(astral_codepoint)
|
103
|
+
surrogate_pair_codepoints(astral_codepoint)
|
104
|
+
.map { |half| write_codepoint(half, format: :js) }.join
|
105
|
+
end
|
35
106
|
end
|
36
107
|
end
|
37
108
|
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: character_set
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.4.1
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Janosch Müller
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: sorted_set
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: benchmark-ips
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -25,47 +39,47 @@ dependencies:
|
|
25
39
|
- !ruby/object:Gem::Version
|
26
40
|
version: '2.7'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
42
|
+
name: get_process_mem
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - "~>"
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
47
|
+
version: 0.2.3
|
34
48
|
type: :development
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
52
|
- - "~>"
|
39
53
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
54
|
+
version: 0.2.3
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
61
|
+
version: '13.0'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
68
|
+
version: '13.0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake-compiler
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
75
|
+
version: '1.1'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
82
|
+
version: '1.1'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: range_compressor
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,28 +100,28 @@ dependencies:
|
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '1.
|
103
|
+
version: '1.6'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '1.
|
110
|
+
version: '1.6'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: regexp_property_values
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: 0
|
117
|
+
version: '1.0'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: 0
|
124
|
+
version: '1.0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: rspec
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,33 +136,85 @@ dependencies:
|
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '3.8'
|
125
|
-
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: codecov
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.2.12
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.2.12
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: rubocop
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.8'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '1.8'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: range_compressor
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '1.0'
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '1.0'
|
181
|
+
description:
|
126
182
|
email:
|
127
183
|
- janosch84@gmail.com
|
128
184
|
executables: []
|
129
185
|
extensions: []
|
130
186
|
extra_rdoc_files: []
|
131
187
|
files:
|
188
|
+
- ".gitattributes"
|
189
|
+
- ".github/workflows/lint.yml"
|
190
|
+
- ".github/workflows/tests.yml"
|
132
191
|
- ".gitignore"
|
133
192
|
- ".rspec"
|
134
|
-
- ".travis.yml"
|
193
|
+
- ".rubocop.yml"
|
135
194
|
- BENCHMARK.md
|
136
195
|
- CHANGELOG.md
|
137
196
|
- Gemfile
|
138
197
|
- LICENSE.txt
|
139
198
|
- README.md
|
140
199
|
- Rakefile
|
200
|
+
- benchmarks/count_in.rb
|
141
201
|
- benchmarks/cover.rb
|
142
202
|
- benchmarks/delete_in.rb
|
143
203
|
- benchmarks/keep_in.rb
|
204
|
+
- benchmarks/scan.rb
|
144
205
|
- benchmarks/shared.rb
|
145
206
|
- benchmarks/used_by.rb
|
207
|
+
- benchmarks/z_add.rb
|
208
|
+
- benchmarks/z_delete.rb
|
209
|
+
- benchmarks/z_merge.rb
|
210
|
+
- benchmarks/z_minmax.rb
|
146
211
|
- bin/console
|
147
212
|
- bin/setup
|
148
213
|
- character_set.gemspec
|
149
214
|
- ext/character_set/character_set.c
|
150
215
|
- ext/character_set/extconf.rb
|
151
216
|
- ext/character_set/unicode_casefold_table.h
|
217
|
+
- ext/character_set/unicode_casefold_table.h.tmpl
|
152
218
|
- lib/character_set.rb
|
153
219
|
- lib/character_set/character.rb
|
154
220
|
- lib/character_set/core_ext.rb
|
@@ -157,20 +223,35 @@ files:
|
|
157
223
|
- lib/character_set/expression_converter.rb
|
158
224
|
- lib/character_set/parser.rb
|
159
225
|
- lib/character_set/predefined_sets.rb
|
226
|
+
- lib/character_set/predefined_sets/any.cps
|
227
|
+
- lib/character_set/predefined_sets/ascii.cps
|
228
|
+
- lib/character_set/predefined_sets/ascii_alnum.cps
|
229
|
+
- lib/character_set/predefined_sets/ascii_letter.cps
|
230
|
+
- lib/character_set/predefined_sets/assigned.cps
|
231
|
+
- lib/character_set/predefined_sets/bmp.cps
|
232
|
+
- lib/character_set/predefined_sets/crypt.cps
|
233
|
+
- lib/character_set/predefined_sets/emoji.cps
|
234
|
+
- lib/character_set/predefined_sets/newline.cps
|
235
|
+
- lib/character_set/predefined_sets/surrogate.cps
|
236
|
+
- lib/character_set/predefined_sets/unicode.cps
|
237
|
+
- lib/character_set/predefined_sets/url_fragment.cps
|
238
|
+
- lib/character_set/predefined_sets/url_host.cps
|
239
|
+
- lib/character_set/predefined_sets/url_path.cps
|
240
|
+
- lib/character_set/predefined_sets/url_query.cps
|
241
|
+
- lib/character_set/predefined_sets/whitespace.cps
|
160
242
|
- lib/character_set/pure.rb
|
161
243
|
- lib/character_set/ruby_fallback.rb
|
162
244
|
- lib/character_set/ruby_fallback/character_set_methods.rb
|
163
|
-
- lib/character_set/ruby_fallback/plane_methods.rb
|
164
245
|
- lib/character_set/ruby_fallback/set_methods.rb
|
165
246
|
- lib/character_set/set_method_adapters.rb
|
166
247
|
- lib/character_set/shared_methods.rb
|
167
248
|
- lib/character_set/version.rb
|
168
249
|
- lib/character_set/writer.rb
|
169
|
-
homepage: https://github.com/
|
250
|
+
homepage: https://github.com/jaynetics/character_set
|
170
251
|
licenses:
|
171
252
|
- MIT
|
172
253
|
metadata: {}
|
173
|
-
post_install_message:
|
254
|
+
post_install_message:
|
174
255
|
rdoc_options: []
|
175
256
|
require_paths:
|
176
257
|
- lib
|
@@ -185,9 +266,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
185
266
|
- !ruby/object:Gem::Version
|
186
267
|
version: '0'
|
187
268
|
requirements: []
|
188
|
-
|
189
|
-
|
190
|
-
signing_key:
|
269
|
+
rubygems_version: 3.2.3
|
270
|
+
signing_key:
|
191
271
|
specification_version: 4
|
192
272
|
summary: Build, read, write and compare sets of Unicode codepoints.
|
193
273
|
test_files: []
|
data/lib/character_set/ruby_fallback/plane_methods.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
class CharacterSet
|
2
|
-
module RubyFallback
|
3
|
-
module PlaneMethods
|
4
|
-
def bmp_part
|
5
|
-
dup.keep_if { |cp| cp < 0x10000 }
|
6
|
-
end
|
7
|
-
|
8
|
-
def astral_part
|
9
|
-
dup.keep_if { |cp| cp >= 0x10000 }
|
10
|
-
end
|
11
|
-
|
12
|
-
def planes
|
13
|
-
plane_set = {}
|
14
|
-
plane_size = 0x10000.to_f
|
15
|
-
each do |cp|
|
16
|
-
plane = (cp / plane_size).floor
|
17
|
-
plane_set[plane] = true
|
18
|
-
end
|
19
|
-
plane_set.keys
|
20
|
-
end
|
21
|
-
|
22
|
-
def member_in_plane?(num)
|
23
|
-
((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|