character_set 1.6.0 → 1.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/gouteur.yml +1 -1
  3. data/.github/workflows/lint.yml +1 -1
  4. data/.github/workflows/tests.yml +3 -1
  5. data/.rubocop.yml +3 -0
  6. data/BENCHMARK.md +32 -32
  7. data/CHANGELOG.md +24 -1
  8. data/Gemfile +7 -6
  9. data/LICENSE.txt +1 -1
  10. data/README.md +3 -3
  11. data/Rakefile +2 -123
  12. data/character_set.gemspec +0 -7
  13. data/ext/character_set/character_set.c +77 -43
  14. data/lib/character_set/core_ext/regexp_ext.rb +8 -0
  15. data/lib/character_set/expression_converter.rb +37 -54
  16. data/lib/character_set/parser.rb +8 -4
  17. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  18. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  19. data/lib/character_set/ruby_fallback/character_set_methods.rb +14 -17
  20. data/lib/character_set/ruby_fallback/set_methods.rb +6 -21
  21. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
  22. data/lib/character_set/ruby_fallback.rb +18 -6
  23. data/lib/character_set/set_method_adapters.rb +1 -1
  24. data/lib/character_set/shared_methods.rb +6 -2
  25. data/lib/character_set/version.rb +1 -1
  26. data/tasks/benchmark.rake +20 -0
  27. data/tasks/benchmarks/shared.rb +28 -0
  28. data/tasks/sync_casefold_data.rake +20 -0
  29. data/tasks/sync_predefined_sets.rake +9 -0
  30. data/tasks/sync_ruby_spec.rake +65 -0
  31. metadata +20 -29
  32. data/benchmarks/shared.rb +0 -30
  33. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  34. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  35. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  36. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  37. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  38. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  39. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  40. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  41. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  42. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e216e6c199ac9443cda9180a9e35d5ed92b50b45c12e7f64f45d74ecd2cf08d6
4
- data.tar.gz: 5f3634d426dc33875d6c197ce75466544d97808b1e8b1858ac56d93422b226e8
3
+ metadata.gz: ebb6792f685df02534f1ef04a92d7f0c5fdcb482e5aaa4856d7a39726e17f007
4
+ data.tar.gz: c6630aab9b6506c46a970ba83c257cd753f8f76760b6ce8d2639f51efba83eeb
5
5
  SHA512:
6
- metadata.gz: d24cfaa40b6e4e472e1f76cc8b6f7f3f1282e6830c0cbf76c4810c0f6f365c7419a19816d0b741cee99eb428dae03fc1d60eecab7d1ba6d210015f0cf2d5ee14
7
- data.tar.gz: 2bd7ea63b286e106358293b1428a687374d0cd2cdc985b2da5b5cf1f45c6c541cb0ddde5b06477243cf4011065cfac7fa6bb8a521fb144a750c90039d268f03b
6
+ metadata.gz: 4c773a0546d05939d0b295e50355c6efe870a1ed74901d63c24097ff598d4a43bcd00ce2d03fb492a48fd9c03968a79ee78b789d92836843d6621dca3e8f313c
7
+ data.tar.gz: 560d3c3aa3f7e4daac3b6d2c89fb9dd6840777fa4d5896fb33564023ef745d81a7e4d0e51fe0ba42f6cd4504bc0b088657cd4ef1ab15d213aa1bb096ba404542
data/.github/workflows/gouteur.yml CHANGED
@@ -11,7 +11,7 @@ jobs:
11
11
  - name: Set up Ruby
12
12
  uses: ruby/setup-ruby@v1
13
13
  with:
14
- ruby-version: 2.7
14
+ ruby-version: 3.3
15
15
  - name: Prepare
16
16
  run: |
17
17
  bundle install --jobs 4
data/.github/workflows/lint.yml CHANGED
@@ -13,7 +13,7 @@ jobs:
13
13
  - name: Set up Ruby
14
14
  uses: ruby/setup-ruby@v1
15
15
  with:
16
- ruby-version: 2.7
16
+ ruby-version: 3.3
17
17
  - name: Cache gems
18
18
  uses: actions/cache@v1
19
19
  with:
data/.github/workflows/tests.yml CHANGED
@@ -12,7 +12,7 @@ jobs:
12
12
 
13
13
  strategy:
14
14
  matrix:
15
- ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
15
+ ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
16
16
 
17
17
  steps:
18
18
  - uses: actions/checkout@v2
@@ -24,3 +24,5 @@ jobs:
24
24
  run: bundle install --jobs 4
25
25
  - name: Test with Rake
26
26
  run: bundle exec rake
27
+ - uses: codecov/codecov-action@v3
28
+ if: matrix.ruby == '3.2'
data/.rubocop.yml CHANGED
@@ -15,3 +15,6 @@ Lint/AmbiguousOperatorPrecedence:
15
15
 
16
16
  Lint/AmbiguousRegexpLiteral:
17
17
  Enabled: false
18
+
19
+ Metrics:
20
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 14794607.9 i/s
7
- String#count: 3875939.3 i/s - 3.82x slower
6
+ CharacterSet#count_in: 14627506.2 i/s
7
+ String#count: 3859777.0 i/s - 3.79x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 17448329.0 i/s
13
- Regexp#match?: 13089358.1 i/s - 1.33x slower
12
+ CharacterSet#cover?: 17241902.8 i/s
13
+ Regexp#match?: 12971122.6 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 17565596.9 i/s
19
- Regexp#match?: 7951108.0 i/s - 2.21x slower
18
+ CharacterSet#cover?: 17243472.3 i/s
19
+ Regexp#match?: 7957626.9 i/s - 2.17x slower
20
20
  ```
21
21
  ```
22
22
  Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 6306078.2 i/s
25
- String#tr: 4734401.0 i/s - 1.33x slower
26
- String#gsub: 211631.8 i/s - 29.80x slower
24
+ CharacterSet#delete_in: 6190975.7 i/s
25
+ String#tr: 4722716.6 i/s - 1.31x slower
26
+ String#gsub: 214239.5 i/s - 28.90x slower
27
27
  ```
28
28
  ```
29
29
  Removing whitespace, emoji and umlauts
30
30
 
31
- CharacterSet#delete_in: 5984149.6 i/s
32
- String#tr: 363643.1 i/s - 16.46x slower
33
- String#gsub: 317201.7 i/s - 18.87x slower
31
+ CharacterSet#delete_in: 5890471.8 i/s
32
+ String#tr: 348506.8 i/s - 16.90x slower
33
+ String#gsub: 318268.3 i/s - 18.51x slower
34
34
  ```
35
35
  ```
36
36
  Removing non-whitespace
37
37
 
38
- CharacterSet#keep_in: 7650925.6 i/s
39
- String#gsub: 207374.6 i/s - 36.89x slower
40
- String#tr: 12.3 i/s - 619745.60x slower
38
+ CharacterSet#keep_in: 7396898.0 i/s
39
+ String#gsub: 208809.7 i/s - 35.42x slower
40
+ String#tr: 13.1 i/s - 564682.50x slower
41
41
  ```
42
42
  ```
43
43
  Keeping only emoji
44
44
 
45
- CharacterSet#keep_in: 7272940.1 i/s
46
- String#gsub: 177993.8 i/s - 40.86x slower
47
- String#tr: 12.3 i/s - 590222.71x slower
45
+ CharacterSet#keep_in: 7022741.1 i/s
46
+ String#gsub: 180939.6 i/s - 38.81x slower
47
+ String#tr: 13.1 i/s - 536724.50x slower
48
48
  ```
49
49
  ```
50
50
  Extracting emoji to an Array
51
51
 
52
- CharacterSet#scan: 2978285.0 i/s
53
- String#scan: 865793.8 i/s - 3.44x slower
52
+ CharacterSet#scan: 3023176.8 i/s
53
+ String#scan: 893225.8 i/s - 3.38x slower
54
54
  ```
55
55
  ```
56
56
  Detecting whitespace
57
57
 
58
- CharacterSet#used_by?: 17292338.4 i/s
59
- Regexp#match?: 11705563.9 i/s - 1.48x slower
58
+ CharacterSet#used_by?: 17284025.9 i/s
59
+ Regexp#match?: 11847064.5 i/s - 1.46x slower
60
60
  ```
61
61
  ```
62
62
  Detecting emoji in a large string
63
63
 
64
- CharacterSet#used_by?: 340444.1 i/s
65
- Regexp#match?: 180549.8 i/s - 1.89x slower
64
+ CharacterSet#used_by?: 341386.1 i/s
65
+ Regexp#match?: 183121.6 i/s - 1.86x slower
66
66
  ```
67
67
  ```
68
68
  Adding entries
69
69
 
70
- CharacterSet#add: 4951781.4 i/s
71
- SortedSet#add: 1019637.9 i/s - 4.86x slower
70
+ CharacterSet#add: 4989762.3 i/s
71
+ SortedSet#add: 1157911.7 i/s - 4.31x slower
72
72
  ```
73
73
  ```
74
74
  Removing entries
75
75
 
76
- CharacterSet#delete: 5006337.6 i/s
77
- SortedSet#delete: 3922752.2 i/s - same-ish
76
+ CharacterSet#delete: 4996703.6 i/s
77
+ SortedSet#delete: 4177401.5 i/s - same-ish
78
78
  ```
79
79
  ```
80
80
  Merging entries
81
81
 
82
- CharacterSet#merge: 661.8 i/s
83
- SortedSet#merge: 3.9 i/s - 167.82x slower
82
+ CharacterSet#merge: 666.7 i/s
83
+ SortedSet#merge: 4.0 i/s - 167.84x slower
84
84
  ```
85
85
  ```
86
86
  Getting the min and max
87
87
 
88
- CharacterSet#minmax: 1212462.2 i/s
89
- SortedSet#minmax: 844.4 i/s - 1435.93x slower
88
+ CharacterSet#minmax: 1596470.9 i/s
89
+ SortedSet#minmax: 866.4 i/s - 1842.74x slower
90
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,29 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [Unreleased]
8
+
9
+ ## [1.8.0] - 2024-01-07
10
+
11
+ ### Added
12
+
13
+ - support for `#<=>` and `#join`, which were added to `set` in the meantime
14
+ - support for getting the (overall) character set of a Regexp with multiple expressions
15
+ - support for global and local case-insensitivity in Regexp inputs
16
+ - `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
17
+
18
+ ## [1.7.0] - 2023-05-12
19
+
20
+ ### Added
21
+
22
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
23
+
24
+ ### Fixed
25
+
26
+ - fixed processing of Strings that are not ASCII- or UTF8-encoded
27
+ - removed dependency on `set` and `sorted_set`
28
+ - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
29
+
7
30
  ## [1.6.0] - 2022-02-16
8
31
 
9
32
  ### Added
@@ -63,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
63
86
  - reduced memory consumption by > 90% for most use cases via dynamic resizing
64
87
  - before, every set instance required 136 KB for codepoints
65
88
  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
66
- - `#count_in` and `#scan_in` methods for `String` interaction
89
+ - `#count_in` and `#scan` methods for `String` interaction
67
90
  - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
68
91
  - conversion methods `#assigned_part`, `#valid_part`
69
92
  - sectioning methods `#ascii_part`, `#plane(n)`
data/Gemfile CHANGED
@@ -7,14 +7,15 @@ gemspec
7
7
 
8
8
  gem 'benchmark-ips', '~> 2.7'
9
9
  gem 'get_process_mem', '~> 0.2.3'
10
- gem 'rake', '~> 13.0'
10
+ gem 'rake', '~> 13.1'
11
11
  gem 'rake-compiler', '~> 1.1'
12
12
  gem 'range_compressor', '~> 1.0'
13
- gem 'regexp_parser', '~> 2.1'
14
- gem 'regexp_property_values', '~> 1.0'
13
+ gem 'regexp_parser', '~> 2.9'
14
+ gem 'regexp_property_values', '~> 1.5'
15
15
  gem 'rspec', '~> 3.8'
16
- if RUBY_VERSION.to_f >= 2.7
17
- gem 'codecov', '~> 0.2.12'
16
+ gem 'warning', '~> 1.3'
17
+ if RUBY_VERSION.to_f >= 3.0
18
18
  gem 'gouteur', '~> 1.0.0'
19
- gem 'rubocop', '~> 1.8'
19
+ gem 'rubocop', '~> 1.59'
20
+ gem 'simplecov-cobertura', require: false
20
21
  end
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2018 Janosch Müller
3
+ Copyright (c) 2018-2023 Janosch Müller
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -3,7 +3,7 @@
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
4
  [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
5
  [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
6
- [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
6
+ [![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set)
7
7
 
8
8
  This is a C-extended Ruby gem to work with sets of Unicode codepoints.
9
9
 
@@ -43,7 +43,7 @@ CharacterSet.parse('[a-c]')
43
43
  CharacterSet.parse('\U00000061-\U00000063')
44
44
  ```
45
45
 
46
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
46
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
47
47
 
48
48
  ```ruby
49
49
  CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
@@ -96,7 +96,7 @@ string # => ''
96
96
 
97
97
  ```ruby
98
98
  CharacterSet.non_ascii.count_in('Tüür') # => 2
99
- CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
99
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
100
100
  ```
101
101
 
102
102
  There is also a core extension for String interaction.
data/Rakefile CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
3
3
  require 'rubygems/package_task'
4
4
  require 'rake/extensiontask'
5
5
 
6
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
7
+
6
8
  RSpec::Core::RakeTask.new(:spec)
7
9
 
8
10
  task default: :spec
@@ -34,129 +36,6 @@ end
34
36
 
35
37
  task package: 'java:gem'
36
38
 
37
- desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
38
- task :sync_ruby_spec do
39
- require 'fileutils'
40
-
41
- variants = {
42
- 'CharacterSet' => './spec/ruby-spec/library/character_set',
43
- 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
44
- }
45
-
46
- # download fresh specs from ruby/spec repository
47
- variants.each do |_, dir|
48
- FileUtils.rm_rf(dir) if File.exist?(dir)
49
- `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
50
- end
51
-
52
- # make copies for each CharacterSet variant
53
- base = variants.first[1]
54
- variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
55
-
56
- # adapt specs to work with CharacterSet
57
- variants.each do |class_name, dir|
58
- Dir["#{dir}/**/*.rb"].each do |spec|
59
- # ignore some tests that do not apply or are covered otherwise
60
- if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
61
- File.delete(spec)
62
- next
63
- end
64
-
65
- adapted_content =
66
- File.read(spec).
67
- # adapt class name
68
- gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
- gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
- # get shared specs from a single shared dir at the parent level
71
- gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
- # make 'mspec' syntax rspec-compatible
73
- gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
- gsub(/be_(false|true)/, 'be \1').
75
- gsub('stub!', 'stub').
76
- gsub('mock', 'double').
77
- gsub('@method', 'method').
78
- # remove unneeded requires
79
- gsub(/require 'set'\n/, '').
80
- gsub(/require.*spec_helper.*\n/, '').
81
- gsub(/\A\n+/, '').
82
- # make examples use Integers/codepoints
83
- gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
- gsub('"one"', '1').
85
- gsub('"two"', '2').
86
- gsub('"three"', '3').
87
- gsub('"four"', '4').
88
- gsub('"five"', '5').
89
- gsub(/x.(size|length) == 3/, 'x != 3').
90
- gsub(/x.(size|length) != 3/, 'x == 3').
91
- gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
92
-
93
- File.open(spec, 'w') { |f| f.puts adapted_content }
94
- end
95
- end
96
-
97
- # keep only one copy of the shared specs, at the parent level
98
- FileUtils.rm_rf(base + '/../shared')
99
- FileUtils.mv(base + '/shared', base + '/../')
100
- variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
101
- end
102
-
103
- desc 'Download unicode casefold data and write new C header file'
104
- task :sync_casefold_data do
105
- src_path = './CaseFolding.txt'
106
- dst_path = './ext/character_set/unicode_casefold_table.h'
107
-
108
- `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
109
-
110
- mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
111
- from, type, to = line.split(/\s*;\s*/).first(3)
112
- # type 'C' stands for 'common', excludes mappings to multiple chars
113
- hash[from] = to if type == 'C'
114
- end.sort
115
-
116
- content = File.read(dst_path + '.tmpl')
117
- .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
- .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
119
-
120
- File.write(dst_path, content)
121
- File.unlink(src_path)
122
- end
123
-
124
- desc 'Update codepoint data for predefined sets, based on Onigmo'
125
- task :sync_predefined_sets do
126
- %w[assigned emoji whitespace].each do |prop|
127
- require 'regexp_property_values'
128
- ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
- File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
- end
132
- end
133
-
134
- desc 'Run all IPS benchmarks'
135
- task :benchmark do
136
- Dir['./benchmarks/*.rb'].sort.each { |file| require file }
137
- end
138
-
139
- namespace :benchmark do
140
- desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
141
- task :write_to_file do
142
- $store_comparison_results = {}
143
-
144
- Rake.application[:benchmark].invoke
145
-
146
- File.open('BENCHMARK.md', 'w') do |f|
147
- f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
-
149
- $store_comparison_results.each do |caption, result|
150
- f.puts '```',
151
- caption,
152
- '',
153
- result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
154
- '```'
155
- end
156
- end
157
- end
158
- end
159
-
160
39
  unless RUBY_PLATFORM =~ /java/
161
40
  # recompile before benchmarking or running specs
162
41
  task(:benchmark).enhance([:compile])
data/character_set.gemspec CHANGED
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
21
21
  s.extensions = %w[ext/character_set/extconf.rb]
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
-
25
- # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
- # This dependency is only used if the C extension is unavailable.
27
- # JRuby has it in the stdlib.
28
- if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
- s.add_dependency 'sorted_set', '~> 1.0'
30
- end
31
24
  end