character_set 1.2.0-java → 1.5.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +22 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +17 -0
  9. data/BENCHMARK.md +53 -17
  10. data/CHANGELOG.md +54 -0
  11. data/README.md +51 -12
  12. data/Rakefile +20 -18
  13. data/benchmarks/count_in.rb +13 -0
  14. data/benchmarks/delete_in.rb +1 -1
  15. data/benchmarks/scan.rb +13 -0
  16. data/benchmarks/shared.rb +5 -0
  17. data/benchmarks/z_add.rb +12 -0
  18. data/benchmarks/z_delete.rb +12 -0
  19. data/benchmarks/z_merge.rb +15 -0
  20. data/benchmarks/z_minmax.rb +12 -0
  21. data/bin/console +2 -0
  22. data/character_set.gemspec +17 -4
  23. data/ext/character_set/character_set.c +969 -415
  24. data/ext/character_set/unicode_casefold_table.h +44 -1
  25. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  26. data/lib/character_set/character.rb +1 -1
  27. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  28. data/lib/character_set/core_ext/string_ext.rb +3 -1
  29. data/lib/character_set/expression_converter.rb +41 -43
  30. data/lib/character_set/parser.rb +1 -1
  31. data/lib/character_set/predefined_sets/any.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  33. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  34. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  35. data/lib/character_set/predefined_sets/assigned.cps +677 -0
  36. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  37. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  38. data/lib/character_set/predefined_sets/emoji.cps +152 -0
  39. data/lib/character_set/predefined_sets/newline.cps +3 -0
  40. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  41. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  42. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  43. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  44. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  45. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  46. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  47. data/lib/character_set/predefined_sets.rb +25 -260
  48. data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
  49. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  50. data/lib/character_set/ruby_fallback.rb +5 -3
  51. data/lib/character_set/set_method_adapters.rb +4 -3
  52. data/lib/character_set/shared_methods.rb +69 -50
  53. data/lib/character_set/version.rb +1 -1
  54. data/lib/character_set/writer.rb +98 -27
  55. metadata +114 -17
  56. data/.travis.yml +0 -8
  57. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 438cb1162ff31a5413f0441ac9c976c1697b9bc8a817e9a1ed78ce08d8f63d56
4
- data.tar.gz: 55988d116eaa4b2e5db705703d918409b65971bc193f7d11ef2d9a0a8a93e649
3
+ metadata.gz: 52823f4f35cdec44378c3828b4b38eba1f9f5bce402a70962eae1fb786132d8c
4
+ data.tar.gz: b1d6419575a3614675c194cbfde8530be02195cc73365a8c6ca446dd6ea909e6
5
5
  SHA512:
6
- metadata.gz: eeabb985966b1b60a5264b1cc8692f496472cb96c29649dc40b60321727bb1c98921c87ad0b229e711e9ae49850840a52f7ee9a317cd54262531569d0267b320
7
- data.tar.gz: 9c02685dd05d2e3563f83c04879f8e6ed0f0c4cbfeef6259135d313f847d76ceda53a1b64ba34cc121a34ab12231303dc539322d49f68c62bfc657161a1cd0ad
6
+ metadata.gz: c021975f912100174a5274454cfc6099a0955262e7e5fac619989a0a2aa5d624e048fe8b5f68b167157aca425c771df1bf137be12924b609b8d30dff1608142e
7
+ data.tar.gz: 1b702ea538bc5a5209c3544c88c9b38d328080db52640bb4a5780454d296970d8c2557ebe9c3cdd014e3a20af254c77fe694a9f56f09f3f29c039aef81dc381f
data/.gitattributes ADDED
@@ -0,0 +1,3 @@
1
+ *.cps linguist-detectable=false
2
+ benchmarks/* linguist-detectable=false
3
+ spec/ruby-spec/* linguist-vendored
data/.github/workflows/gouteur.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: gouteur
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - name: Set up Ruby
12
+ uses: ruby/setup-ruby@v1
13
+ with:
14
+ ruby-version: 2.7
15
+ - name: Prepare
16
+ run: |
17
+ bundle install --jobs 4
18
+ bundle exec rake compile
19
+ - name: Test
20
+ run: bundle exec gouteur
data/.github/workflows/lint.yml ADDED
@@ -0,0 +1,29 @@
1
+ # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2
+
3
+ name: rubocop linting
4
+
5
+ on: [push, pull_request]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby
14
+ uses: ruby/setup-ruby@v1
15
+ with:
16
+ ruby-version: 2.7
17
+ - name: Cache gems
18
+ uses: actions/cache@v1
19
+ with:
20
+ path: vendor/bundle
21
+ key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
22
+ restore-keys: |
23
+ ${{ runner.os }}-rubocop-
24
+ - name: Install gems
25
+ run: |
26
+ bundle config path vendor/bundle
27
+ bundle install --jobs 4 --retry 3
28
+ - name: Run rubocop
29
+ run: bundle exec rubocop --lint
data/.github/workflows/tests.yml ADDED
@@ -0,0 +1,22 @@
1
+ name: tests
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ strategy:
10
+ matrix:
11
+ ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
12
+
13
+ steps:
14
+ - uses: actions/checkout@v2
15
+ - name: Set up Ruby ${{ matrix.ruby }}
16
+ uses: ruby/setup-ruby@v1
17
+ with:
18
+ ruby-version: ${{ matrix.ruby }}
19
+ - name: Install dependencies
20
+ run: bundle install --jobs 4
21
+ - name: Test with Rake
22
+ run: bundle exec rake
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  .vscode
19
20
  bbin/
20
21
  binstubs/*
data/.gouteur.yml ADDED
@@ -0,0 +1,2 @@
1
+ repos:
2
+ - uri: https://github.com/jaynetics/js_regex
data/.rubocop.yml ADDED
@@ -0,0 +1,17 @@
1
+ AllCops:
2
+ Exclude:
3
+ - '**/doc/*'
4
+ - '**/pkg/*'
5
+ - '**/spec/ruby-spec/**/*'
6
+ - '**/vendor/**/*' # vendored dependencies
7
+ NewCops: enable
8
+ RubyInterpreters:
9
+ - ruby
10
+ - rake
11
+ TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
12
+
13
+ Lint/AmbiguousOperatorPrecedence:
14
+ Enabled: false
15
+
16
+ Lint/AmbiguousRegexpLiteral:
17
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -1,50 +1,86 @@
1
- Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
1
+ Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
2
2
 
3
+ ```
4
+ Counting non-letters
5
+
6
+ CharacterSet#count_in: 9472902.2 i/s
7
+ String#count: 2221799.9 i/s - 4.26x slower
8
+ ```
3
9
  ```
4
10
  Detecting non-whitespace
5
11
 
6
- CharacterSet#cover?: 13244577.7 i/s
7
- Regexp#match?: 8027017.5 i/s - 1.65x slower
12
+ CharacterSet#cover?: 12388427.2 i/s
13
+ Regexp#match?: 7901676.8 i/s - 1.57x slower
8
14
  ```
9
15
  ```
10
16
  Detecting non-letters
11
17
 
12
- CharacterSet#cover?: 13082940.8 i/s
13
- Regexp#match?: 5372589.2 i/s - 2.44x slower
18
+ CharacterSet#cover?: 12263689.1 i/s
19
+ Regexp#match?: 4940889.9 i/s - 2.48x slower
14
20
  ```
15
21
  ```
16
22
  Removing whitespace
17
23
 
18
- CharacterSet#delete_in: 389315.6 i/s
19
- String#gsub: 223773.5 i/s - 1.74x slower
24
+ CharacterSet#delete_in: 2406722.6 i/s
25
+ String#gsub: 235760.3 i/s - 10.21x slower
20
26
  ```
21
27
  ```
22
28
  Removing whitespace, emoji and umlauts
23
29
 
24
- CharacterSet#delete_in: 470239.3 i/s
25
- String#gsub: 278679.4 i/s - 1.69x slower
30
+ CharacterSet#delete_in: 1653607.6 i/s
31
+ String#gsub: 272782.9 i/s - 6.06x slower
26
32
  ```
27
33
  ```
28
34
  Removing non-whitespace
29
35
 
30
- CharacterSet#keep_in: 1138461.0 i/s
31
- String#gsub: 235287.4 i/s - 4.84x slower
36
+ CharacterSet#keep_in: 2671038.2 i/s
37
+ String#gsub: 242551.0 i/s - 11.01x slower
32
38
  ```
33
39
  ```
34
40
  Extracting emoji
35
41
 
36
- CharacterSet#keep_in: 1474472.0 i/s
37
- String#gsub: 212269.6 i/s - 6.95x slower
42
+ CharacterSet#keep_in: 1726496.5 i/s
43
+ String#gsub: 215609.2 i/s - 8.01x slower
44
+ ```
45
+ ```
46
+ Extracting emoji to an Array
47
+
48
+ CharacterSet#scan: 2373856.1 i/s
49
+ String#scan: 480000.5 i/s - 4.95x slower
38
50
  ```
39
51
  ```
40
52
  Detecting whitespace
41
53
 
42
- CharacterSet#used_by?: 13063108.7 i/s
43
- Regexp#match?: 7215075.0 i/s - 1.81x slower
54
+ CharacterSet#used_by?: 11988328.7 i/s
55
+ Regexp#match?: 6758146.8 i/s - 1.77x slower
44
56
  ```
45
57
  ```
46
58
  Detecting emoji in a large string
47
59
 
48
- CharacterSet#used_by?: 246527.7 i/s
49
- Regexp#match?: 92956.5 i/s - 2.65x slower
60
+ CharacterSet#used_by?: 288223.3 i/s
61
+ Regexp#match?: 102384.2 i/s - 2.82x slower
62
+ ```
63
+ ```
64
+ Adding entries
65
+
66
+ CharacterSet#add: 2538251.2 i/s
67
+ SortedSet#add: 443925.9 i/s - 5.72x slower
68
+ ```
69
+ ```
70
+ Removing entries
71
+
72
+ CharacterSet#delete: 2487620.8 i/s
73
+ SortedSet#delete: 628816.1 i/s - 3.96x slower
74
+ ```
75
+ ```
76
+ Merging entries
77
+
78
+ CharacterSet#merge: 551.6 i/s
79
+ SortedSet#merge: 1.4 i/s - 393.59x slower
80
+ ```
81
+ ```
82
+ Getting the min and max
83
+
84
+ CharacterSet#minmax: 636890.7 i/s
85
+ SortedSet#minmax: 254.1 i/s - 2506.20x slower
50
86
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,60 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [1.5.0] - 2021-12-05
8
+
9
+ ### Added
10
+
11
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
12
+ - latest unicode case-folding data (for `#case_insensitive`)
13
+ - support for passing any Enumerable to `#disjoint?`, `#intersect?`
14
+ - this matches recent broadening of these methods in `ruby/set`
15
+ - new instance method `#secure_token` (see README)
16
+ - class method `::of` now accepts more than one `String`
17
+ - `CharacterSet::ExpressionConverter` can now build output of any Set-like class
18
+
19
+ ### Fixed
20
+
21
+ - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
22
+ - it used to return a regular `CharacterSet`
23
+
24
+ ## [1.4.1] - 2021-01-10
25
+
26
+ ### Fixed
27
+ - multiple fixes for Ruby 3
28
+ - fixed segfault for some `String` manipulation cases
29
+ - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
30
+ - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
31
+
32
+ ## [1.4.0] - 2019-06-07
33
+
34
+ ### Added
35
+ - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
36
+ - allows for much shorter astral plane representations e.g. in JavaScript
37
+ - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
38
+ - improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
39
+
40
+ ### Fixed
41
+ - '/' is now escaped by default when stringifying so as to work with //-regexp syntax
42
+
43
+ ## [1.3.0] - 2019-04-26
44
+
45
+ ### Added
46
+ - improved `String` manipulation speed
47
+ - improved initialization and `#merge` speed when passing a large `Range`
48
+ - reduced memory consumption by > 90% for most use cases via dynamic resizing
49
+ - before, every set instance required 136 KB for codepoints
50
+ - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
51
+ - `#count_in` and `#scan_in` methods for `String` interaction
52
+ - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
53
+ - conversion methods `#assigned_part`, `#valid_part`
54
+ - sectioning methods `#ascii_part`, `#plane(n)`
55
+ - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
56
+
57
+ ### Fixed
58
+ - `#count` now supports passing an argument or block as usual
59
+ - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
60
+
7
61
  ## [1.2.0] - 2019-04-02
8
62
 
9
63
  ### Added
data/README.md CHANGED
@@ -1,12 +1,17 @@
1
1
  # CharacterSet
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
- [![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
4
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
6
+ [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
5
7
 
6
- A gem to build, read, write and compare sets of Unicode codepoints.
8
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
9
+
10
+ It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
7
11
 
8
12
  Many parts can be used independently, e.g.:
9
13
  - `CharacterSet::Character`
14
+ - `CharacterSet::ExpressionConverter`
10
15
  - `CharacterSet::Parser`
11
16
  - `CharacterSet::Writer`
12
17
  - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
@@ -37,7 +42,7 @@ CharacterSet.parse('[a-c]')
37
42
  CharacterSet.parse('\U00000061-\U00000063')
38
43
  ```
39
44
 
40
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
45
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
41
46
 
42
47
  ```ruby
43
48
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -49,7 +54,7 @@ require 'character_set/core_ext/regexp_ext'
49
54
 
50
55
  ### Predefined utility sets
51
56
 
52
- `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
57
+ `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
53
58
 
54
59
  ```ruby
55
60
  CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +65,7 @@ CharacterSet.non_ascii
60
65
 
61
66
  ### Interact with Strings
62
67
 
63
- CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
68
+ `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
64
69
 
65
70
  `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
66
71
 
@@ -71,6 +76,7 @@ CharacterSet.ascii.cover?('Tr') # => true
71
76
  ```
72
77
 
73
78
  `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
79
+
74
80
  ```ruby
75
81
  string = 'Tüür'
76
82
 
@@ -84,6 +90,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
84
90
  string # => ''
85
91
  ```
86
92
 
93
+ `#count_in` and `#scan` can replace `String#count` and `String#scan`:
94
+
95
+ ```ruby
96
+ CharacterSet.non_ascii.count_in('Tüür') # => 2
97
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
98
+ ```
99
+
87
100
  There is also a core extension for String interaction.
88
101
  ```ruby
89
102
  require 'character_set/core_ext/string_ext'
@@ -100,7 +113,7 @@ require 'character_set/core_ext/string_ext'
100
113
 
101
114
  ### Manipulate
102
115
 
103
- Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
116
+ Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
104
117
 
105
118
  Where appropriate, methods take both chars and codepoints, e.g.:
106
119
 
@@ -122,13 +135,13 @@ non_a.include?('ü') # => true
122
135
 
123
136
  # surrogate pair halves are not included by default
124
137
  CharacterSet['a'].inversion(include_surrogates: true)
125
- # => #<CharacterSet (size: 1114111)>
138
+ # => #<CharacterSet (size: 1114112)>
126
139
  ```
127
140
 
128
141
  `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
129
142
 
130
143
  ```ruby
131
- CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
144
+ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
132
145
  ```
133
146
 
134
147
  ### Write
@@ -156,18 +169,44 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
156
169
  # disable abbreviation (grouping of codepoints in ranges)
157
170
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
158
171
 
159
- # for full js regex compatibility in case of astral members:
160
- set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
172
+ # astral members require some trickery if we want to target environments
173
+ # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
174
+ set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
175
+
176
+ # Use #to_s_with_surrogate_ranges e.g. for JavaScript:
177
+ set.to_s_with_surrogate_ranges
178
+ # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
179
+
180
+ # Or use #to_s_with_surrogate_alternation if such surrogate set pairs
181
+ # don't work in your target environment:
182
+ set.to_s_with_surrogate_alternation
183
+ # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
184
+ ```
185
+
186
+ ### Other features
187
+
188
+ #### Secure tokens
189
+
190
+ Generate secure random strings of characters from a set:
191
+
192
+ ```ruby
193
+ CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
194
+ CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
161
195
  ```
162
196
 
163
- ### Unicode plane methods
197
+ #### Unicode planes
164
198
 
165
- There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
199
+ There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
166
200
  ```Ruby
201
+ CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
202
+ CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
203
+ CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
204
+ CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
167
205
  CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
168
206
  CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
169
207
  CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
170
208
  CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
209
+ CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
171
210
  CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
172
211
  CharacterSet::Character.new('a').plane # => 0
173
212
  ```
data/Rakefile CHANGED
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
7
7
 
8
8
  task default: :spec
9
9
 
10
+ namespace :spec do
11
+ task :quick do
12
+ ENV['SKIP_MEMSAFETY_SPECS'] = '1'
13
+ Rake::Task[:spec].invoke
14
+ end
15
+ end
16
+
10
17
  Rake::ExtensionTask.new('character_set') do |ext|
11
18
  ext.lib_dir = 'lib/character_set'
12
19
  end
@@ -106,27 +113,22 @@ task :sync_casefold_data do
106
113
  hash[from] = to if type == 'C'
107
114
  end.sort
108
115
 
109
- File.open(dst_path, 'w') do |f|
110
- f.puts <<-C
111
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
112
- // -*-C-*-
113
-
114
- typedef struct casefold_mapping {
115
- unsigned long from;
116
- unsigned long to;
117
- } casefold_mapping;
118
-
119
- #define CASEFOLD_COUNT #{mapping.size}
116
+ content = File.read(dst_path + '.tmpl')
117
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
120
119
 
121
- static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
122
- C
123
-
124
- mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
120
+ File.write(dst_path, content)
121
+ File.unlink(src_path)
122
+ end
125
123
 
126
- f.puts '};'
124
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
125
+ task :sync_predefined_sets do
126
+ %w[assigned emoji whitespace].each do |prop|
127
+ require 'regexp_property_values'
128
+ ranges = RegexpPropertyValues[prop].matched_ranges
129
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
+ File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
127
131
  end
128
-
129
- File.unlink(src_path)
130
132
  end
131
133
 
132
134
  desc 'Run all IPS benchmarks'
data/benchmarks/count_in.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ tr = '^A-Za-z'
5
+ cs = CharacterSet.non_ascii_letter
6
+
7
+ benchmark(
8
+ caption: 'Counting non-letters',
9
+ cases: {
10
+ 'String#count' => -> { str.count(tr) },
11
+ 'CharacterSet#count_in' => -> { cs.count_in(str) },
12
+ }
13
+ )
data/benchmarks/delete_in.rb CHANGED
@@ -14,7 +14,7 @@ benchmark(
14
14
 
15
15
  str = 'Lörem ipsüm ⛷ et dölörem'
16
16
  rx = /[\s\p{emoji}äüö]/
17
- cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
18
 
19
19
  benchmark(
20
20
  caption: 'Removing whitespace, emoji and umlauts',
data/benchmarks/scan.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum ⛷ et dolorem'
4
+ rx = /\p{emoji}/
5
+ cs = CharacterSet.emoji
6
+
7
+ benchmark(
8
+ caption: 'Extracting emoji to an Array',
9
+ cases: {
10
+ 'String#scan' => -> { str.scan(rx) },
11
+ 'CharacterSet#scan' => -> { cs.scan(str) },
12
+ }
13
+ )
data/benchmarks/shared.rb CHANGED
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  require 'benchmark/ips'
5
5
  require 'character_set'
6
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
7
+ require 'sorted_set'
8
+ else
9
+ require 'set'
10
+ end
6
11
 
7
12
  def benchmark(caption: nil, cases: {})
8
13
  puts caption
data/benchmarks/z_add.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet[]
4
+ ss = SortedSet[]
5
+
6
+ benchmark(
7
+ caption: 'Adding entries',
8
+ cases: {
9
+ 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
10
+ 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) },
11
+ }
12
+ )
data/benchmarks/z_delete.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0x10FFFF)
4
+ ss = SortedSet.new(0..0x10FFFF)
5
+
6
+ benchmark(
7
+ caption: 'Removing entries',
8
+ cases: {
9
+ 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
10
+ 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
11
+ }
12
+ )
data/benchmarks/z_merge.rb ADDED
@@ -0,0 +1,15 @@
1
+ require_relative './shared'
2
+
3
+ cs1 = CharacterSet.new(0...0x88000)
4
+ cs2 = CharacterSet.new(0x88000..0x10FFFF)
5
+
6
+ ss1 = SortedSet.new(0...0x88000)
7
+ ss2 = SortedSet.new(0x88000..0x10FFFF)
8
+
9
+ benchmark(
10
+ caption: 'Merging entries',
11
+ cases: {
12
+ 'CharacterSet#merge' => -> { cs1.merge(cs2) },
13
+ 'SortedSet#merge' => -> { ss1.merge(ss2) },
14
+ }
15
+ )
data/benchmarks/z_minmax.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0xFFFF)
4
+ ss = SortedSet.new(0..0xFFFF)
5
+
6
+ benchmark(
7
+ caption: 'Getting the min and max',
8
+ cases: {
9
+ 'CharacterSet#minmax' => -> { cs.minmax },
10
+ 'SortedSet#minmax' => -> { ss.minmax },
11
+ }
12
+ )
data/bin/console CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'bundler/setup'
4
4
 
5
+ `bundle exec rake compile`
6
+
5
7
  require 'character_set'
6
8
  require 'character_set/core_ext'
7
9
  require 'character_set/pure'
data/character_set.gemspec CHANGED
@@ -22,11 +22,24 @@ Gem::Specification.new do |s|
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
24
 
25
+ # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
+ # This dependency is only used if the C extension is unavailable.
27
+ # JRuby has it in the stdlib.
28
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
+ s.add_dependency 'sorted_set', '~> 1.0'
30
+ end
31
+
25
32
  s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
- s.add_development_dependency 'rake', '~> 12.0'
27
- s.add_development_dependency 'rake-compiler', '~> 1.0'
33
+ s.add_development_dependency 'get_process_mem', '~> 0.2.3'
34
+ s.add_development_dependency 'rake', '~> 13.0'
35
+ s.add_development_dependency 'rake-compiler', '~> 1.1'
28
36
  s.add_development_dependency 'range_compressor', '~> 1.0'
29
- s.add_development_dependency 'regexp_parser', '~> 1.3'
30
- s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
37
+ s.add_development_dependency 'regexp_parser', '~> 2.1'
38
+ s.add_development_dependency 'regexp_property_values', '~> 1.0'
31
39
  s.add_development_dependency 'rspec', '~> 3.8'
40
+ if RUBY_VERSION.to_f >= 2.7
41
+ s.add_development_dependency 'codecov', '~> 0.2.12'
42
+ s.add_development_dependency 'gouteur', '~> 1.0.0'
43
+ s.add_development_dependency 'rubocop', '~> 1.8'
44
+ end
32
45
  end