character_set 1.6.0-java → 1.8.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/gouteur.yml +1 -1
  3. data/.github/workflows/lint.yml +1 -1
  4. data/.github/workflows/tests.yml +3 -1
  5. data/.rubocop.yml +3 -0
  6. data/BENCHMARK.md +32 -32
  7. data/CHANGELOG.md +24 -1
  8. data/Gemfile +7 -6
  9. data/LICENSE.txt +1 -1
  10. data/README.md +3 -3
  11. data/Rakefile +2 -123
  12. data/character_set.gemspec +0 -7
  13. data/ext/character_set/character_set.c +77 -43
  14. data/lib/character_set/core_ext/regexp_ext.rb +8 -0
  15. data/lib/character_set/expression_converter.rb +37 -54
  16. data/lib/character_set/parser.rb +8 -4
  17. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  18. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  19. data/lib/character_set/ruby_fallback/character_set_methods.rb +14 -17
  20. data/lib/character_set/ruby_fallback/set_methods.rb +6 -21
  21. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
  22. data/lib/character_set/ruby_fallback.rb +18 -6
  23. data/lib/character_set/set_method_adapters.rb +1 -1
  24. data/lib/character_set/shared_methods.rb +6 -2
  25. data/lib/character_set/version.rb +1 -1
  26. data/tasks/benchmark.rake +20 -0
  27. data/tasks/benchmarks/shared.rb +28 -0
  28. data/tasks/sync_casefold_data.rake +20 -0
  29. data/tasks/sync_predefined_sets.rake +9 -0
  30. data/tasks/sync_ruby_spec.rake +65 -0
  31. metadata +19 -28
  32. data/benchmarks/shared.rb +0 -30
  33. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  34. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  35. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  36. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  37. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  38. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  39. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  40. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  41. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  42. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ccdb4b8c43fdf0e3ed297415810a92805ee61324d91c464ccc7bee2575cb14b
4
- data.tar.gz: cc36d5c8d65c036981bd368577bf49eb59d2d91f5fb779cae2e42aa991c94610
3
+ metadata.gz: 129c77fada8076c0c1fe5a69e46e14c6987ace8c165befae162d6b60f3fde7af
4
+ data.tar.gz: 0e29cd75a1e4d8b2951a5d47bfabe6d1f11e9cdc2f9c121d8697468345881b0b
5
5
  SHA512:
6
- metadata.gz: 35db3bcae78967f9d74beebccb3412a08ff505ead29fb3ef2c1bef05a5ebc5815c16ea948a3c831bb92041493f12502e86c3e01b64d19bfe8baba6dfb89c88e4
7
- data.tar.gz: de08614dabac51cc9c71b5ed8a96d2865cab747fcb27ca3b00b0d75efcedf7f7f0f81bdd0472c275901d09258af137ddaf5150d9fea067a79f5fd6b526aa2ac5
6
+ metadata.gz: 1ca88a71a54608e1b9dc62fe42719ad1892bd9a29365a3c3ac5dd0002d04efc1544172aa1a191a2dc0367bd3d9f0e707d3a4c1500d0851ff3e9d4b4e7d27cd04
7
+ data.tar.gz: 7f8787943174d09d691e484d7f135254cb24f93a94c33d0cc5e21989109600c5e1030a253d3734bea8a461d4691455a919a50bb9c511254dc64fb0d65081c43b
data/.github/workflows/gouteur.yml CHANGED
@@ -11,7 +11,7 @@ jobs:
11
11
  - name: Set up Ruby
12
12
  uses: ruby/setup-ruby@v1
13
13
  with:
14
- ruby-version: 2.7
14
+ ruby-version: 3.3
15
15
  - name: Prepare
16
16
  run: |
17
17
  bundle install --jobs 4
data/.github/workflows/lint.yml CHANGED
@@ -13,7 +13,7 @@ jobs:
13
13
  - name: Set up Ruby
14
14
  uses: ruby/setup-ruby@v1
15
15
  with:
16
- ruby-version: 2.7
16
+ ruby-version: 3.3
17
17
  - name: Cache gems
18
18
  uses: actions/cache@v1
19
19
  with:
data/.github/workflows/tests.yml CHANGED
@@ -12,7 +12,7 @@ jobs:
12
12
 
13
13
  strategy:
14
14
  matrix:
15
- ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
15
+ ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
16
16
 
17
17
  steps:
18
18
  - uses: actions/checkout@v2
@@ -24,3 +24,5 @@ jobs:
24
24
  run: bundle install --jobs 4
25
25
  - name: Test with Rake
26
26
  run: bundle exec rake
27
+ - uses: codecov/codecov-action@v3
28
+ if: matrix.ruby == '3.2'
data/.rubocop.yml CHANGED
@@ -15,3 +15,6 @@ Lint/AmbiguousOperatorPrecedence:
15
15
 
16
16
  Lint/AmbiguousRegexpLiteral:
17
17
  Enabled: false
18
+
19
+ Metrics:
20
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 14794607.9 i/s
7
- String#count: 3875939.3 i/s - 3.82x slower
6
+ CharacterSet#count_in: 14627506.2 i/s
7
+ String#count: 3859777.0 i/s - 3.79x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 17448329.0 i/s
13
- Regexp#match?: 13089358.1 i/s - 1.33x slower
12
+ CharacterSet#cover?: 17241902.8 i/s
13
+ Regexp#match?: 12971122.6 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 17565596.9 i/s
19
- Regexp#match?: 7951108.0 i/s - 2.21x slower
18
+ CharacterSet#cover?: 17243472.3 i/s
19
+ Regexp#match?: 7957626.9 i/s - 2.17x slower
20
20
  ```
21
21
  ```
22
22
  Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 6306078.2 i/s
25
- String#tr: 4734401.0 i/s - 1.33x slower
26
- String#gsub: 211631.8 i/s - 29.80x slower
24
+ CharacterSet#delete_in: 6190975.7 i/s
25
+ String#tr: 4722716.6 i/s - 1.31x slower
26
+ String#gsub: 214239.5 i/s - 28.90x slower
27
27
  ```
28
28
  ```
29
29
  Removing whitespace, emoji and umlauts
30
30
 
31
- CharacterSet#delete_in: 5984149.6 i/s
32
- String#tr: 363643.1 i/s - 16.46x slower
33
- String#gsub: 317201.7 i/s - 18.87x slower
31
+ CharacterSet#delete_in: 5890471.8 i/s
32
+ String#tr: 348506.8 i/s - 16.90x slower
33
+ String#gsub: 318268.3 i/s - 18.51x slower
34
34
  ```
35
35
  ```
36
36
  Removing non-whitespace
37
37
 
38
- CharacterSet#keep_in: 7650925.6 i/s
39
- String#gsub: 207374.6 i/s - 36.89x slower
40
- String#tr: 12.3 i/s - 619745.60x slower
38
+ CharacterSet#keep_in: 7396898.0 i/s
39
+ String#gsub: 208809.7 i/s - 35.42x slower
40
+ String#tr: 13.1 i/s - 564682.50x slower
41
41
  ```
42
42
  ```
43
43
  Keeping only emoji
44
44
 
45
- CharacterSet#keep_in: 7272940.1 i/s
46
- String#gsub: 177993.8 i/s - 40.86x slower
47
- String#tr: 12.3 i/s - 590222.71x slower
45
+ CharacterSet#keep_in: 7022741.1 i/s
46
+ String#gsub: 180939.6 i/s - 38.81x slower
47
+ String#tr: 13.1 i/s - 536724.50x slower
48
48
  ```
49
49
  ```
50
50
  Extracting emoji to an Array
51
51
 
52
- CharacterSet#scan: 2978285.0 i/s
53
- String#scan: 865793.8 i/s - 3.44x slower
52
+ CharacterSet#scan: 3023176.8 i/s
53
+ String#scan: 893225.8 i/s - 3.38x slower
54
54
  ```
55
55
  ```
56
56
  Detecting whitespace
57
57
 
58
- CharacterSet#used_by?: 17292338.4 i/s
59
- Regexp#match?: 11705563.9 i/s - 1.48x slower
58
+ CharacterSet#used_by?: 17284025.9 i/s
59
+ Regexp#match?: 11847064.5 i/s - 1.46x slower
60
60
  ```
61
61
  ```
62
62
  Detecting emoji in a large string
63
63
 
64
- CharacterSet#used_by?: 340444.1 i/s
65
- Regexp#match?: 180549.8 i/s - 1.89x slower
64
+ CharacterSet#used_by?: 341386.1 i/s
65
+ Regexp#match?: 183121.6 i/s - 1.86x slower
66
66
  ```
67
67
  ```
68
68
  Adding entries
69
69
 
70
- CharacterSet#add: 4951781.4 i/s
71
- SortedSet#add: 1019637.9 i/s - 4.86x slower
70
+ CharacterSet#add: 4989762.3 i/s
71
+ SortedSet#add: 1157911.7 i/s - 4.31x slower
72
72
  ```
73
73
  ```
74
74
  Removing entries
75
75
 
76
- CharacterSet#delete: 5006337.6 i/s
77
- SortedSet#delete: 3922752.2 i/s - same-ish
76
+ CharacterSet#delete: 4996703.6 i/s
77
+ SortedSet#delete: 4177401.5 i/s - same-ish
78
78
  ```
79
79
  ```
80
80
  Merging entries
81
81
 
82
- CharacterSet#merge: 661.8 i/s
83
- SortedSet#merge: 3.9 i/s - 167.82x slower
82
+ CharacterSet#merge: 666.7 i/s
83
+ SortedSet#merge: 4.0 i/s - 167.84x slower
84
84
  ```
85
85
  ```
86
86
  Getting the min and max
87
87
 
88
- CharacterSet#minmax: 1212462.2 i/s
89
- SortedSet#minmax: 844.4 i/s - 1435.93x slower
88
+ CharacterSet#minmax: 1596470.9 i/s
89
+ SortedSet#minmax: 866.4 i/s - 1842.74x slower
90
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,29 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [Unreleased]
8
+
9
+ ## [1.8.0] - 2024-01-07
10
+
11
+ ### Added
12
+
13
+ - support for `#<=>` and `#join`, which were added to `set` in the meantime
14
+ - support for getting the (overall) character set of a Regexp with multiple expressions
15
+ - support for global and local case-insensitivity in Regexp inputs
16
+ - `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
17
+
18
+ ## [1.7.0] - 2023-05-12
19
+
20
+ ### Added
21
+
22
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
23
+
24
+ ### Fixed
25
+
26
+ - fixed processing of Strings that are not ASCII- or UTF8-encoded
27
+ - removed dependency on `set` and `sorted_set`
28
+ - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
29
+
7
30
  ## [1.6.0] - 2022-02-16
8
31
 
9
32
  ### Added
@@ -63,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
63
86
  - reduced memory consumption by > 90% for most use cases via dynamic resizing
64
87
  - before, every set instance required 136 KB for codepoints
65
88
  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
66
- - `#count_in` and `#scan_in` methods for `String` interaction
89
+ - `#count_in` and `#scan` methods for `String` interaction
67
90
  - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
68
91
  - conversion methods `#assigned_part`, `#valid_part`
69
92
  - sectioning methods `#ascii_part`, `#plane(n)`
data/Gemfile CHANGED
@@ -7,14 +7,15 @@ gemspec
7
7
 
8
8
  gem 'benchmark-ips', '~> 2.7'
9
9
  gem 'get_process_mem', '~> 0.2.3'
10
- gem 'rake', '~> 13.0'
10
+ gem 'rake', '~> 13.1'
11
11
  gem 'rake-compiler', '~> 1.1'
12
12
  gem 'range_compressor', '~> 1.0'
13
- gem 'regexp_parser', '~> 2.1'
14
- gem 'regexp_property_values', '~> 1.0'
13
+ gem 'regexp_parser', '~> 2.9'
14
+ gem 'regexp_property_values', '~> 1.5'
15
15
  gem 'rspec', '~> 3.8'
16
- if RUBY_VERSION.to_f >= 2.7
17
- gem 'codecov', '~> 0.2.12'
16
+ gem 'warning', '~> 1.3'
17
+ if RUBY_VERSION.to_f >= 3.0
18
18
  gem 'gouteur', '~> 1.0.0'
19
- gem 'rubocop', '~> 1.8'
19
+ gem 'rubocop', '~> 1.59'
20
+ gem 'simplecov-cobertura', require: false
20
21
  end
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2018 Janosch Müller
3
+ Copyright (c) 2018-2023 Janosch Müller
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -3,7 +3,7 @@
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
4
  [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
5
  [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
6
- [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
6
+ [![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set)
7
7
 
8
8
  This is a C-extended Ruby gem to work with sets of Unicode codepoints.
9
9
 
@@ -43,7 +43,7 @@ CharacterSet.parse('[a-c]')
43
43
  CharacterSet.parse('\U00000061-\U00000063')
44
44
  ```
45
45
 
46
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
46
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
47
47
 
48
48
  ```ruby
49
49
  CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
@@ -96,7 +96,7 @@ string # => ''
96
96
 
97
97
  ```ruby
98
98
  CharacterSet.non_ascii.count_in('Tüür') # => 2
99
- CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
99
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
100
100
  ```
101
101
 
102
102
  There is also a core extension for String interaction.
data/Rakefile CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
3
3
  require 'rubygems/package_task'
4
4
  require 'rake/extensiontask'
5
5
 
6
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
7
+
6
8
  RSpec::Core::RakeTask.new(:spec)
7
9
 
8
10
  task default: :spec
@@ -34,129 +36,6 @@ end
34
36
 
35
37
  task package: 'java:gem'
36
38
 
37
- desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
38
- task :sync_ruby_spec do
39
- require 'fileutils'
40
-
41
- variants = {
42
- 'CharacterSet' => './spec/ruby-spec/library/character_set',
43
- 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
44
- }
45
-
46
- # download fresh specs from ruby/spec repository
47
- variants.each do |_, dir|
48
- FileUtils.rm_rf(dir) if File.exist?(dir)
49
- `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
50
- end
51
-
52
- # make copies for each CharacterSet variant
53
- base = variants.first[1]
54
- variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
55
-
56
- # adapt specs to work with CharacterSet
57
- variants.each do |class_name, dir|
58
- Dir["#{dir}/**/*.rb"].each do |spec|
59
- # ignore some tests that do not apply or are covered otherwise
60
- if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
61
- File.delete(spec)
62
- next
63
- end
64
-
65
- adapted_content =
66
- File.read(spec).
67
- # adapt class name
68
- gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
- gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
- # get shared specs from a single shared dir at the parent level
71
- gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
- # make 'mspec' syntax rspec-compatible
73
- gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
- gsub(/be_(false|true)/, 'be \1').
75
- gsub('stub!', 'stub').
76
- gsub('mock', 'double').
77
- gsub('@method', 'method').
78
- # remove unneeded requires
79
- gsub(/require 'set'\n/, '').
80
- gsub(/require.*spec_helper.*\n/, '').
81
- gsub(/\A\n+/, '').
82
- # make examples use Integers/codepoints
83
- gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
- gsub('"one"', '1').
85
- gsub('"two"', '2').
86
- gsub('"three"', '3').
87
- gsub('"four"', '4').
88
- gsub('"five"', '5').
89
- gsub(/x.(size|length) == 3/, 'x != 3').
90
- gsub(/x.(size|length) != 3/, 'x == 3').
91
- gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
92
-
93
- File.open(spec, 'w') { |f| f.puts adapted_content }
94
- end
95
- end
96
-
97
- # keep only one copy of the shared specs, at the parent level
98
- FileUtils.rm_rf(base + '/../shared')
99
- FileUtils.mv(base + '/shared', base + '/../')
100
- variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
101
- end
102
-
103
- desc 'Download unicode casefold data and write new C header file'
104
- task :sync_casefold_data do
105
- src_path = './CaseFolding.txt'
106
- dst_path = './ext/character_set/unicode_casefold_table.h'
107
-
108
- `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
109
-
110
- mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
111
- from, type, to = line.split(/\s*;\s*/).first(3)
112
- # type 'C' stands for 'common', excludes mappings to multiple chars
113
- hash[from] = to if type == 'C'
114
- end.sort
115
-
116
- content = File.read(dst_path + '.tmpl')
117
- .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
- .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
119
-
120
- File.write(dst_path, content)
121
- File.unlink(src_path)
122
- end
123
-
124
- desc 'Update codepoint data for predefined sets, based on Onigmo'
125
- task :sync_predefined_sets do
126
- %w[assigned emoji whitespace].each do |prop|
127
- require 'regexp_property_values'
128
- ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
- File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
- end
132
- end
133
-
134
- desc 'Run all IPS benchmarks'
135
- task :benchmark do
136
- Dir['./benchmarks/*.rb'].sort.each { |file| require file }
137
- end
138
-
139
- namespace :benchmark do
140
- desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
141
- task :write_to_file do
142
- $store_comparison_results = {}
143
-
144
- Rake.application[:benchmark].invoke
145
-
146
- File.open('BENCHMARK.md', 'w') do |f|
147
- f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
-
149
- $store_comparison_results.each do |caption, result|
150
- f.puts '```',
151
- caption,
152
- '',
153
- result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
154
- '```'
155
- end
156
- end
157
- end
158
- end
159
-
160
39
  unless RUBY_PLATFORM =~ /java/
161
40
  # recompile before benchmarking or running specs
162
41
  task(:benchmark).enhance([:compile])
data/character_set.gemspec CHANGED
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
21
21
  s.extensions = %w[ext/character_set/extconf.rb]
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
-
25
- # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
- # This dependency is only used if the C extension is unavailable.
27
- # JRuby has it in the stdlib.
28
- if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
- s.add_dependency 'sorted_set', '~> 1.0'
30
- end
31
24
  end