character_set 1.6.0 → 1.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/gouteur.yml +1 -1
- data/.github/workflows/lint.yml +1 -1
- data/.github/workflows/tests.yml +3 -1
- data/.rubocop.yml +3 -0
- data/BENCHMARK.md +32 -32
- data/CHANGELOG.md +24 -1
- data/Gemfile +7 -6
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/Rakefile +2 -123
- data/character_set.gemspec +0 -7
- data/ext/character_set/character_set.c +77 -43
- data/lib/character_set/core_ext/regexp_ext.rb +8 -0
- data/lib/character_set/expression_converter.rb +37 -54
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +73 -52
- data/lib/character_set/predefined_sets/emoji.cps +10 -9
- data/lib/character_set/ruby_fallback/character_set_methods.rb +14 -17
- data/lib/character_set/ruby_fallback/set_methods.rb +6 -21
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
- data/lib/character_set/ruby_fallback.rb +18 -6
- data/lib/character_set/set_method_adapters.rb +1 -1
- data/lib/character_set/shared_methods.rb +6 -2
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +20 -29
- data/benchmarks/shared.rb +0 -30
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebb6792f685df02534f1ef04a92d7f0c5fdcb482e5aaa4856d7a39726e17f007
|
4
|
+
data.tar.gz: c6630aab9b6506c46a970ba83c257cd753f8f76760b6ce8d2639f51efba83eeb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c773a0546d05939d0b295e50355c6efe870a1ed74901d63c24097ff598d4a43bcd00ce2d03fb492a48fd9c03968a79ee78b789d92836843d6621dca3e8f313c
|
7
|
+
data.tar.gz: 560d3c3aa3f7e4daac3b6d2c89fb9dd6840777fa4d5896fb33564023ef745d81a7e4d0e51fe0ba42f6cd4504bc0b088657cd4ef1ab15d213aa1bb096ba404542
|
data/.github/workflows/lint.yml
CHANGED
data/.github/workflows/tests.yml
CHANGED
@@ -12,7 +12,7 @@ jobs:
|
|
12
12
|
|
13
13
|
strategy:
|
14
14
|
matrix:
|
15
|
-
ruby: [ '2.
|
15
|
+
ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
|
16
16
|
|
17
17
|
steps:
|
18
18
|
- uses: actions/checkout@v2
|
@@ -24,3 +24,5 @@ jobs:
|
|
24
24
|
run: bundle install --jobs 4
|
25
25
|
- name: Test with Rake
|
26
26
|
run: bundle exec rake
|
27
|
+
- uses: codecov/codecov-action@v3
|
28
|
+
if: matrix.ruby == '3.2'
|
data/.rubocop.yml
CHANGED
data/BENCHMARK.md
CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
|
|
3
3
|
```
|
4
4
|
Counting non-letters
|
5
5
|
|
6
|
-
CharacterSet#count_in:
|
7
|
-
String#count:
|
6
|
+
CharacterSet#count_in: 14627506.2 i/s
|
7
|
+
String#count: 3859777.0 i/s - 3.79x slower
|
8
8
|
```
|
9
9
|
```
|
10
10
|
Detecting non-whitespace
|
11
11
|
|
12
|
-
CharacterSet#cover?:
|
13
|
-
Regexp#match?:
|
12
|
+
CharacterSet#cover?: 17241902.8 i/s
|
13
|
+
Regexp#match?: 12971122.6 i/s - 1.33x slower
|
14
14
|
```
|
15
15
|
```
|
16
16
|
Detecting non-letters
|
17
17
|
|
18
|
-
CharacterSet#cover?:
|
19
|
-
Regexp#match?:
|
18
|
+
CharacterSet#cover?: 17243472.3 i/s
|
19
|
+
Regexp#match?: 7957626.9 i/s - 2.17x slower
|
20
20
|
```
|
21
21
|
```
|
22
22
|
Removing ASCII whitespace
|
23
23
|
|
24
|
-
CharacterSet#delete_in:
|
25
|
-
String#tr:
|
26
|
-
String#gsub:
|
24
|
+
CharacterSet#delete_in: 6190975.7 i/s
|
25
|
+
String#tr: 4722716.6 i/s - 1.31x slower
|
26
|
+
String#gsub: 214239.5 i/s - 28.90x slower
|
27
27
|
```
|
28
28
|
```
|
29
29
|
Removing whitespace, emoji and umlauts
|
30
30
|
|
31
|
-
CharacterSet#delete_in:
|
32
|
-
String#tr:
|
33
|
-
String#gsub:
|
31
|
+
CharacterSet#delete_in: 5890471.8 i/s
|
32
|
+
String#tr: 348506.8 i/s - 16.90x slower
|
33
|
+
String#gsub: 318268.3 i/s - 18.51x slower
|
34
34
|
```
|
35
35
|
```
|
36
36
|
Removing non-whitespace
|
37
37
|
|
38
|
-
CharacterSet#keep_in:
|
39
|
-
String#gsub:
|
40
|
-
String#tr:
|
38
|
+
CharacterSet#keep_in: 7396898.0 i/s
|
39
|
+
String#gsub: 208809.7 i/s - 35.42x slower
|
40
|
+
String#tr: 13.1 i/s - 564682.50x slower
|
41
41
|
```
|
42
42
|
```
|
43
43
|
Keeping only emoji
|
44
44
|
|
45
|
-
CharacterSet#keep_in:
|
46
|
-
String#gsub:
|
47
|
-
String#tr:
|
45
|
+
CharacterSet#keep_in: 7022741.1 i/s
|
46
|
+
String#gsub: 180939.6 i/s - 38.81x slower
|
47
|
+
String#tr: 13.1 i/s - 536724.50x slower
|
48
48
|
```
|
49
49
|
```
|
50
50
|
Extracting emoji to an Array
|
51
51
|
|
52
|
-
CharacterSet#scan:
|
53
|
-
String#scan:
|
52
|
+
CharacterSet#scan: 3023176.8 i/s
|
53
|
+
String#scan: 893225.8 i/s - 3.38x slower
|
54
54
|
```
|
55
55
|
```
|
56
56
|
Detecting whitespace
|
57
57
|
|
58
|
-
CharacterSet#used_by?:
|
59
|
-
Regexp#match?:
|
58
|
+
CharacterSet#used_by?: 17284025.9 i/s
|
59
|
+
Regexp#match?: 11847064.5 i/s - 1.46x slower
|
60
60
|
```
|
61
61
|
```
|
62
62
|
Detecting emoji in a large string
|
63
63
|
|
64
|
-
CharacterSet#used_by?:
|
65
|
-
Regexp#match?:
|
64
|
+
CharacterSet#used_by?: 341386.1 i/s
|
65
|
+
Regexp#match?: 183121.6 i/s - 1.86x slower
|
66
66
|
```
|
67
67
|
```
|
68
68
|
Adding entries
|
69
69
|
|
70
|
-
CharacterSet#add:
|
71
|
-
SortedSet#add:
|
70
|
+
CharacterSet#add: 4989762.3 i/s
|
71
|
+
SortedSet#add: 1157911.7 i/s - 4.31x slower
|
72
72
|
```
|
73
73
|
```
|
74
74
|
Removing entries
|
75
75
|
|
76
|
-
CharacterSet#delete:
|
77
|
-
SortedSet#delete:
|
76
|
+
CharacterSet#delete: 4996703.6 i/s
|
77
|
+
SortedSet#delete: 4177401.5 i/s - same-ish
|
78
78
|
```
|
79
79
|
```
|
80
80
|
Merging entries
|
81
81
|
|
82
|
-
CharacterSet#merge:
|
83
|
-
SortedSet#merge:
|
82
|
+
CharacterSet#merge: 666.7 i/s
|
83
|
+
SortedSet#merge: 4.0 i/s - 167.84x slower
|
84
84
|
```
|
85
85
|
```
|
86
86
|
Getting the min and max
|
87
87
|
|
88
|
-
CharacterSet#minmax:
|
89
|
-
SortedSet#minmax:
|
88
|
+
CharacterSet#minmax: 1596470.9 i/s
|
89
|
+
SortedSet#minmax: 866.4 i/s - 1842.74x slower
|
90
90
|
```
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,29 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [1.8.0] - 2024-01-07
|
10
|
+
|
11
|
+
### Added
|
12
|
+
|
13
|
+
- support for `#<=>` and `#join`, which were added to `set` in the meantime
|
14
|
+
- support for getting the (overall) character set of a Regexp with multiple expressions
|
15
|
+
- support for global and local case-insensitivity in Regexp inputs
|
16
|
+
- `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
|
17
|
+
|
18
|
+
## [1.7.0] - 2023-05-12
|
19
|
+
|
20
|
+
### Added
|
21
|
+
|
22
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
|
23
|
+
|
24
|
+
### Fixed
|
25
|
+
|
26
|
+
- fixed processing of Strings that are not ASCII- or UTF8-encoded
|
27
|
+
- removed dependency on `set` and `sorted_set`
|
28
|
+
- thanks to https://github.com/mikebaldry for reporting a related issue (#2)
|
29
|
+
|
7
30
|
## [1.6.0] - 2022-02-16
|
8
31
|
|
9
32
|
### Added
|
@@ -63,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
63
86
|
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
64
87
|
- before, every set instance required 136 KB for codepoints
|
65
88
|
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
66
|
-
- `#count_in` and `#
|
89
|
+
- `#count_in` and `#scan` methods for `String` interaction
|
67
90
|
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
68
91
|
- conversion methods `#assigned_part`, `#valid_part`
|
69
92
|
- sectioning methods `#ascii_part`, `#plane(n)`
|
data/Gemfile
CHANGED
@@ -7,14 +7,15 @@ gemspec
|
|
7
7
|
|
8
8
|
gem 'benchmark-ips', '~> 2.7'
|
9
9
|
gem 'get_process_mem', '~> 0.2.3'
|
10
|
-
gem 'rake', '~> 13.
|
10
|
+
gem 'rake', '~> 13.1'
|
11
11
|
gem 'rake-compiler', '~> 1.1'
|
12
12
|
gem 'range_compressor', '~> 1.0'
|
13
|
-
gem 'regexp_parser', '~> 2.
|
14
|
-
gem 'regexp_property_values', '~> 1.
|
13
|
+
gem 'regexp_parser', '~> 2.9'
|
14
|
+
gem 'regexp_property_values', '~> 1.5'
|
15
15
|
gem 'rspec', '~> 3.8'
|
16
|
-
|
17
|
-
|
16
|
+
gem 'warning', '~> 1.3'
|
17
|
+
if RUBY_VERSION.to_f >= 3.0
|
18
18
|
gem 'gouteur', '~> 1.0.0'
|
19
|
-
gem 'rubocop', '~> 1.
|
19
|
+
gem 'rubocop', '~> 1.59'
|
20
|
+
gem 'simplecov-cobertura', require: false
|
20
21
|
end
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
|
4
4
|
[![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
|
5
5
|
[![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
|
6
|
-
[![
|
6
|
+
[![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set)
|
7
7
|
|
8
8
|
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
9
9
|
|
@@ -43,7 +43,7 @@ CharacterSet.parse('[a-c]')
|
|
43
43
|
CharacterSet.parse('\U00000061-\U00000063')
|
44
44
|
```
|
45
45
|
|
46
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read.
|
46
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
|
47
47
|
|
48
48
|
```ruby
|
49
49
|
CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
|
@@ -96,7 +96,7 @@ string # => ''
|
|
96
96
|
|
97
97
|
```ruby
|
98
98
|
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
99
|
-
CharacterSet.non_ascii.
|
99
|
+
CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
|
100
100
|
```
|
101
101
|
|
102
102
|
There is also a core extension for String interaction.
|
data/Rakefile
CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
|
|
3
3
|
require 'rubygems/package_task'
|
4
4
|
require 'rake/extensiontask'
|
5
5
|
|
6
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
7
|
+
|
6
8
|
RSpec::Core::RakeTask.new(:spec)
|
7
9
|
|
8
10
|
task default: :spec
|
@@ -34,129 +36,6 @@ end
|
|
34
36
|
|
35
37
|
task package: 'java:gem'
|
36
38
|
|
37
|
-
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
|
38
|
-
task :sync_ruby_spec do
|
39
|
-
require 'fileutils'
|
40
|
-
|
41
|
-
variants = {
|
42
|
-
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
43
|
-
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
44
|
-
}
|
45
|
-
|
46
|
-
# download fresh specs from ruby/spec repository
|
47
|
-
variants.each do |_, dir|
|
48
|
-
FileUtils.rm_rf(dir) if File.exist?(dir)
|
49
|
-
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
50
|
-
end
|
51
|
-
|
52
|
-
# make copies for each CharacterSet variant
|
53
|
-
base = variants.first[1]
|
54
|
-
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
55
|
-
|
56
|
-
# adapt specs to work with CharacterSet
|
57
|
-
variants.each do |class_name, dir|
|
58
|
-
Dir["#{dir}/**/*.rb"].each do |spec|
|
59
|
-
# ignore some tests that do not apply or are covered otherwise
|
60
|
-
if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
|
61
|
-
File.delete(spec)
|
62
|
-
next
|
63
|
-
end
|
64
|
-
|
65
|
-
adapted_content =
|
66
|
-
File.read(spec).
|
67
|
-
# adapt class name
|
68
|
-
gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
|
69
|
-
gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
|
70
|
-
# get shared specs from a single shared dir at the parent level
|
71
|
-
gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
|
72
|
-
# make 'mspec' syntax rspec-compatible
|
73
|
-
gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
|
74
|
-
gsub(/be_(false|true)/, 'be \1').
|
75
|
-
gsub('stub!', 'stub').
|
76
|
-
gsub('mock', 'double').
|
77
|
-
gsub('@method', 'method').
|
78
|
-
# remove unneeded requires
|
79
|
-
gsub(/require 'set'\n/, '').
|
80
|
-
gsub(/require.*spec_helper.*\n/, '').
|
81
|
-
gsub(/\A\n+/, '').
|
82
|
-
# make examples use Integers/codepoints
|
83
|
-
gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
|
84
|
-
gsub('"one"', '1').
|
85
|
-
gsub('"two"', '2').
|
86
|
-
gsub('"three"', '3').
|
87
|
-
gsub('"four"', '4').
|
88
|
-
gsub('"five"', '5').
|
89
|
-
gsub(/x.(size|length) == 3/, 'x != 3').
|
90
|
-
gsub(/x.(size|length) != 3/, 'x == 3').
|
91
|
-
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
|
92
|
-
|
93
|
-
File.open(spec, 'w') { |f| f.puts adapted_content }
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# keep only one copy of the shared specs, at the parent level
|
98
|
-
FileUtils.rm_rf(base + '/../shared')
|
99
|
-
FileUtils.mv(base + '/shared', base + '/../')
|
100
|
-
variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
|
101
|
-
end
|
102
|
-
|
103
|
-
desc 'Download unicode casefold data and write new C header file'
|
104
|
-
task :sync_casefold_data do
|
105
|
-
src_path = './CaseFolding.txt'
|
106
|
-
dst_path = './ext/character_set/unicode_casefold_table.h'
|
107
|
-
|
108
|
-
`wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
|
109
|
-
|
110
|
-
mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
|
111
|
-
from, type, to = line.split(/\s*;\s*/).first(3)
|
112
|
-
# type 'C' stands for 'common', excludes mappings to multiple chars
|
113
|
-
hash[from] = to if type == 'C'
|
114
|
-
end.sort
|
115
|
-
|
116
|
-
content = File.read(dst_path + '.tmpl')
|
117
|
-
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
118
|
-
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
119
|
-
|
120
|
-
File.write(dst_path, content)
|
121
|
-
File.unlink(src_path)
|
122
|
-
end
|
123
|
-
|
124
|
-
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
125
|
-
task :sync_predefined_sets do
|
126
|
-
%w[assigned emoji whitespace].each do |prop|
|
127
|
-
require 'regexp_property_values'
|
128
|
-
ranges = RegexpPropertyValues[prop].matched_ranges
|
129
|
-
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
130
|
-
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
desc 'Run all IPS benchmarks'
|
135
|
-
task :benchmark do
|
136
|
-
Dir['./benchmarks/*.rb'].sort.each { |file| require file }
|
137
|
-
end
|
138
|
-
|
139
|
-
namespace :benchmark do
|
140
|
-
desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
|
141
|
-
task :write_to_file do
|
142
|
-
$store_comparison_results = {}
|
143
|
-
|
144
|
-
Rake.application[:benchmark].invoke
|
145
|
-
|
146
|
-
File.open('BENCHMARK.md', 'w') do |f|
|
147
|
-
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
148
|
-
|
149
|
-
$store_comparison_results.each do |caption, result|
|
150
|
-
f.puts '```',
|
151
|
-
caption,
|
152
|
-
'',
|
153
|
-
result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
|
154
|
-
'```'
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
39
|
unless RUBY_PLATFORM =~ /java/
|
161
40
|
# recompile before benchmarking or running specs
|
162
41
|
task(:benchmark).enhance([:compile])
|
data/character_set.gemspec
CHANGED
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
|
|
21
21
|
s.extensions = %w[ext/character_set/extconf.rb]
|
22
22
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
24
|
-
|
25
|
-
# SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
|
26
|
-
# This dependency is only used if the C extension is unavailable.
|
27
|
-
# JRuby has it in the stdlib.
|
28
|
-
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
29
|
-
s.add_dependency 'sorted_set', '~> 1.0'
|
30
|
-
end
|
31
24
|
end
|