character_set 1.1.1 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +88 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -1,3 +1,3 @@
1
1
  class CharacterSet
2
- VERSION = '1.1.1'
2
+ VERSION = '1.4.1'
3
3
  end
@@ -1,37 +1,108 @@
1
1
  class CharacterSet
2
2
  module Writer
3
- module_function
4
-
5
- def write(codepoint_ranges, opts = {}, &block)
6
- content = codepoint_ranges.map do |range|
7
- if range.size > 2 && opts[:abbreviate] != false
8
- range.minmax.map { |cp| Character.new(cp).escape(opts, &block) }.join('-')
9
- else
10
- range.map { |cp| Character.new(cp).escape(opts, &block) }.join
3
+ class << self
4
+ def write(codepoint_ranges, opts = {}, &block)
5
+ content = codepoint_ranges.map do |range|
6
+ if range.size > 2 && opts[:abbreviate] != false
7
+ bounds = [range.min, range.max]
8
+ bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
9
+ else
10
+ range.map { |cp| write_codepoint(cp, opts, &block) }.join
11
+ end
12
+ end.join
13
+ opts[:in_brackets] ? "[#{content}]" : content
14
+ end
15
+
16
+ def write_codepoint(codepoint, opts = {}, &block)
17
+ Character.new(codepoint).escape(opts, &block)
18
+ end
19
+
20
+ def write_surrogate_ranges(bmp_ranges, astral_ranges)
21
+ astral_branches = surrogate_range_expressions(astral_ranges)
22
+ bmp_set_with_alternatives(bmp_ranges, astral_branches)
23
+ end
24
+
25
+ def write_surrogate_alternation(bmp_ranges, astral_ranges)
26
+ astral_branches = surrogate_pairs(astral_ranges)
27
+ bmp_set_with_alternatives(bmp_ranges, astral_branches)
28
+ end
29
+
30
+ private
31
+
32
+ def surrogate_range_expressions(astral_ranges)
33
+ compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
34
+ [hi_ranges, lo_ranges].map do |ranges|
35
+ use_brackets = ranges.size > 1 || ranges.first.size > 1
36
+ write(ranges, format: :js, in_brackets: use_brackets)
37
+ end.join
11
38
  end
12
- end.join
13
- opts[:in_brackets] ? "[#{content}]" : content
14
- end
39
+ end
40
+
41
+ def compressed_surrogate_range_pairs(astral_ranges)
42
+ halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
15
43
 
16
- def write_surrogate_alternation(bmp_ranges, astral_ranges)
17
- bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
18
- if astral_ranges.empty?
19
- bmp_set
20
- else
21
- surrogate_pairs = surrogate_pairs(astral_ranges)
22
- "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + surrogate_pairs) * '|'})"
44
+ # compress high surrogate codepoint ranges with common low range half
45
+ with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
46
+ hi_ranges = pairs.map(&:first)
47
+ compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
48
+ prev = arr.last
49
+ if prev.nil? || prev.max + 1 < range.min # first or gap
50
+ arr << range
51
+ else # continuous codepoints, expand previous range
52
+ arr[-1] = (prev.min)..(range.max)
53
+ end
54
+ end
55
+ [compressed_hi_ranges, lo_range]
56
+ end
57
+
58
+ # compress low surrogate codepoint ranges with common high ranges
59
+ with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
60
+ (hash[hi_ranges] ||= []) << lo_range
61
+ end
23
62
  end
24
- end
25
63
 
26
- def surrogate_pairs(astral_ranges)
27
- astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
28
- end
64
+ def surrogate_half_ranges(astral_range)
65
+ hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
66
+ hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
67
+ hi_count = 1 + hi_max - hi_min
68
+ return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
69
+
70
+ ranges = []
71
+
72
+ # first high surrogate might be partially covered (if lo_min > 0xDC00)
73
+ ranges << [hi_min..hi_min, lo_min..0xDFFF]
74
+
75
+ # any high surrogates in between are fully covered
76
+ ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
29
77
 
30
- def surrogate_pair(astral_codepoint)
31
- base = astral_codepoint - 0x10000
32
- high = ((base / 1024).floor + 0xD800).to_s(16)
33
- low = (base % 1024 + 0xDC00).to_s(16)
34
- "\\u#{high}\\u#{low}"
78
+ # last high surrogate might be partially covered (if lo_max < 0xDFFF)
79
+ ranges << [hi_max..hi_max, 0xDC00..lo_max]
80
+
81
+ ranges
82
+ end
83
+
84
+ def surrogate_pair_codepoints(astral_codepoint)
85
+ base = astral_codepoint - 0x10000
86
+ high = base / 1024 + 0xD800
87
+ low = base % 1024 + 0xDC00
88
+ [high, low]
89
+ end
90
+
91
+ def bmp_set_with_alternatives(bmp_ranges, alternatives)
92
+ bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
93
+ return bmp_set if alternatives.empty? && bmp_ranges.any?
94
+
95
+ "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
96
+ end
97
+
98
+ def surrogate_pairs(astral_ranges)
99
+ astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
100
+ end
101
+
102
+ def surrogate_pair(astral_codepoint)
103
+ surrogate_pair_codepoints(astral_codepoint)
104
+ .map { |half| write_codepoint(half, format: :js) }.join
105
+ end
35
106
  end
36
107
  end
37
108
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: character_set
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Janosch Müller
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-24 00:00:00.000000000 Z
11
+ date: 2021-01-11 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sorted_set
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: benchmark-ips
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -25,47 +39,47 @@ dependencies:
25
39
  - !ruby/object:Gem::Version
26
40
  version: '2.7'
27
41
  - !ruby/object:Gem::Dependency
28
- name: bundler
42
+ name: get_process_mem
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - "~>"
32
46
  - !ruby/object:Gem::Version
33
- version: '1.16'
47
+ version: 0.2.3
34
48
  type: :development
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
52
  - - "~>"
39
53
  - !ruby/object:Gem::Version
40
- version: '1.16'
54
+ version: 0.2.3
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: '12.0'
61
+ version: '13.0'
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: '12.0'
68
+ version: '13.0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: rake-compiler
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: '1.0'
75
+ version: '1.1'
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
- version: '1.0'
82
+ version: '1.1'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: range_compressor
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -86,28 +100,28 @@ dependencies:
86
100
  requirements:
87
101
  - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '1.1'
103
+ version: '1.6'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
108
  - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '1.1'
110
+ version: '1.6'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: regexp_property_values
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: 0.3.4
117
+ version: '1.0'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: 0.3.4
124
+ version: '1.0'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: rspec
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -122,7 +136,35 @@ dependencies:
122
136
  - - "~>"
123
137
  - !ruby/object:Gem::Version
124
138
  version: '3.8'
125
- description:
139
+ - !ruby/object:Gem::Dependency
140
+ name: codecov
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.2.12
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.2.12
153
+ - !ruby/object:Gem::Dependency
154
+ name: rubocop
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '1.8'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '1.8'
167
+ description:
126
168
  email:
127
169
  - janosch84@gmail.com
128
170
  executables: []
@@ -130,26 +172,36 @@ extensions:
130
172
  - ext/character_set/extconf.rb
131
173
  extra_rdoc_files: []
132
174
  files:
175
+ - ".gitattributes"
176
+ - ".github/workflows/lint.yml"
177
+ - ".github/workflows/tests.yml"
133
178
  - ".gitignore"
134
179
  - ".rspec"
135
- - ".travis.yml"
180
+ - ".rubocop.yml"
136
181
  - BENCHMARK.md
137
182
  - CHANGELOG.md
138
183
  - Gemfile
139
184
  - LICENSE.txt
140
185
  - README.md
141
186
  - Rakefile
187
+ - benchmarks/count_in.rb
142
188
  - benchmarks/cover.rb
143
189
  - benchmarks/delete_in.rb
144
190
  - benchmarks/keep_in.rb
191
+ - benchmarks/scan.rb
145
192
  - benchmarks/shared.rb
146
193
  - benchmarks/used_by.rb
194
+ - benchmarks/z_add.rb
195
+ - benchmarks/z_delete.rb
196
+ - benchmarks/z_merge.rb
197
+ - benchmarks/z_minmax.rb
147
198
  - bin/console
148
199
  - bin/setup
149
200
  - character_set.gemspec
150
201
  - ext/character_set/character_set.c
151
202
  - ext/character_set/extconf.rb
152
203
  - ext/character_set/unicode_casefold_table.h
204
+ - ext/character_set/unicode_casefold_table.h.tmpl
153
205
  - lib/character_set.rb
154
206
  - lib/character_set/character.rb
155
207
  - lib/character_set/core_ext.rb
@@ -158,20 +210,35 @@ files:
158
210
  - lib/character_set/expression_converter.rb
159
211
  - lib/character_set/parser.rb
160
212
  - lib/character_set/predefined_sets.rb
213
+ - lib/character_set/predefined_sets/any.cps
214
+ - lib/character_set/predefined_sets/ascii.cps
215
+ - lib/character_set/predefined_sets/ascii_alnum.cps
216
+ - lib/character_set/predefined_sets/ascii_letter.cps
217
+ - lib/character_set/predefined_sets/assigned.cps
218
+ - lib/character_set/predefined_sets/bmp.cps
219
+ - lib/character_set/predefined_sets/crypt.cps
220
+ - lib/character_set/predefined_sets/emoji.cps
221
+ - lib/character_set/predefined_sets/newline.cps
222
+ - lib/character_set/predefined_sets/surrogate.cps
223
+ - lib/character_set/predefined_sets/unicode.cps
224
+ - lib/character_set/predefined_sets/url_fragment.cps
225
+ - lib/character_set/predefined_sets/url_host.cps
226
+ - lib/character_set/predefined_sets/url_path.cps
227
+ - lib/character_set/predefined_sets/url_query.cps
228
+ - lib/character_set/predefined_sets/whitespace.cps
161
229
  - lib/character_set/pure.rb
162
230
  - lib/character_set/ruby_fallback.rb
163
231
  - lib/character_set/ruby_fallback/character_set_methods.rb
164
- - lib/character_set/ruby_fallback/plane_methods.rb
165
232
  - lib/character_set/ruby_fallback/set_methods.rb
166
233
  - lib/character_set/set_method_adapters.rb
167
234
  - lib/character_set/shared_methods.rb
168
235
  - lib/character_set/version.rb
169
236
  - lib/character_set/writer.rb
170
- homepage: https://github.com/janosch-x/character_set
237
+ homepage: https://github.com/jaynetics/character_set
171
238
  licenses:
172
239
  - MIT
173
240
  metadata: {}
174
- post_install_message:
241
+ post_install_message:
175
242
  rdoc_options: []
176
243
  require_paths:
177
244
  - lib
@@ -186,9 +253,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
186
253
  - !ruby/object:Gem::Version
187
254
  version: '0'
188
255
  requirements: []
189
- rubyforge_project:
190
- rubygems_version: 2.7.6
191
- signing_key:
256
+ rubygems_version: 3.2.3
257
+ signing_key:
192
258
  specification_version: 4
193
259
  summary: Build, read, write and compare sets of Unicode codepoints.
194
260
  test_files: []
@@ -1,11 +0,0 @@
1
- sudo: false
2
- language: ruby
3
- rvm:
4
- - 2.1
5
- - 2.4
6
- - 2.5
7
- - 2.6
8
- - jruby-9.1.9.0
9
- before_install:
10
- - gem update --system
11
- - gem install bundler
@@ -1,27 +0,0 @@
1
- class CharacterSet
2
- module RubyFallback
3
- module PlaneMethods
4
- def bmp_part
5
- dup.keep_if { |cp| cp < 0x10000 }
6
- end
7
-
8
- def astral_part
9
- dup.keep_if { |cp| cp >= 0x10000 }
10
- end
11
-
12
- def planes
13
- plane_set = {}
14
- plane_size = 0x10000.to_f
15
- each do |cp|
16
- plane = (cp / plane_size).floor
17
- plane_set[plane] = true
18
- end
19
- plane_set.keys
20
- end
21
-
22
- def member_in_plane?(num)
23
- ((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
24
- end
25
- end
26
- end
27
- end