character_set 1.2.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +22 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +17 -0
  9. data/BENCHMARK.md +53 -17
  10. data/CHANGELOG.md +54 -0
  11. data/README.md +51 -12
  12. data/Rakefile +20 -18
  13. data/benchmarks/count_in.rb +13 -0
  14. data/benchmarks/delete_in.rb +1 -1
  15. data/benchmarks/scan.rb +13 -0
  16. data/benchmarks/shared.rb +5 -0
  17. data/benchmarks/z_add.rb +12 -0
  18. data/benchmarks/z_delete.rb +12 -0
  19. data/benchmarks/z_merge.rb +15 -0
  20. data/benchmarks/z_minmax.rb +12 -0
  21. data/bin/console +2 -0
  22. data/character_set.gemspec +17 -4
  23. data/ext/character_set/character_set.c +969 -415
  24. data/ext/character_set/unicode_casefold_table.h +44 -1
  25. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  26. data/lib/character_set/character.rb +1 -1
  27. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  28. data/lib/character_set/core_ext/string_ext.rb +3 -1
  29. data/lib/character_set/expression_converter.rb +41 -43
  30. data/lib/character_set/parser.rb +1 -1
  31. data/lib/character_set/predefined_sets/any.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  33. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  34. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  35. data/lib/character_set/predefined_sets/assigned.cps +677 -0
  36. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  37. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  38. data/lib/character_set/predefined_sets/emoji.cps +152 -0
  39. data/lib/character_set/predefined_sets/newline.cps +3 -0
  40. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  41. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  42. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  43. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  44. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  45. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  46. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  47. data/lib/character_set/predefined_sets.rb +25 -260
  48. data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
  49. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  50. data/lib/character_set/ruby_fallback.rb +5 -3
  51. data/lib/character_set/set_method_adapters.rb +4 -3
  52. data/lib/character_set/shared_methods.rb +69 -50
  53. data/lib/character_set/version.rb +1 -1
  54. data/lib/character_set/writer.rb +98 -27
  55. metadata +114 -17
  56. data/.travis.yml +0 -8
  57. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 434323b3b99246a17ea5e062afd87d3edc3c09927b2231b4409b295ff63c7d6c
4
- data.tar.gz: 174c6dc751b03e49cf87045fad9a48100460244b7d7e25deef27066bd4aef92c
3
+ metadata.gz: 9622bc20bbdb48f8deff84dbed9e800e6bc500a6a08a27e7b3aea2ea651cd278
4
+ data.tar.gz: 5853e8d5be7e9a1963419aa4f9fbc631148fe5bef45aa185b9117d32b44aa959
5
5
  SHA512:
6
- metadata.gz: d9fa059ea3171209af537f0bd7636e3a65b962f30029ca399fe2fa0bd6168dd692b7bc5fb1014590a830b2e9aede9c26ae00ae8fe4a2eae4a86cf95e208b507d
7
- data.tar.gz: 692f4596b6adc9b44879b69fb82e55dc90d107156ecabb96c14ea91b4dc0c7dc706724b42093d0ef762cdac697f05ef855c5f462451015e1d06022ab06bc1c8d
6
+ metadata.gz: 2cc2a60b9388a2e3beef66da20aa8205cc501980a7dc66f2716c66f7e999a083927b27a761e6b932b6d5c16b8e5968f8e04370ecf3c999326f378f60bfa3cedc
7
+ data.tar.gz: a2a8d1f9ac6cdf6302af98662fc3efda4b8c6fe003c7cdc853a61a64f9c7a596b1bbd7a79dca19081b8ce2576f9c3d848869141b164c145e22befaaffec8b265
data/.gitattributes ADDED
@@ -0,0 +1,3 @@
1
+ *.cps linguist-detectable=false
2
+ benchmarks/* linguist-detectable=false
3
+ spec/ruby-spec/* linguist-vendored
@@ -0,0 +1,20 @@
1
+ name: gouteur
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - name: Set up Ruby
12
+ uses: ruby/setup-ruby@v1
13
+ with:
14
+ ruby-version: 2.7
15
+ - name: Prepare
16
+ run: |
17
+ bundle install --jobs 4
18
+ bundle exec rake compile
19
+ - name: Test
20
+ run: bundle exec gouteur
@@ -0,0 +1,29 @@
1
+ # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2
+
3
+ name: rubocop linting
4
+
5
+ on: [push, pull_request]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby
14
+ uses: ruby/setup-ruby@v1
15
+ with:
16
+ ruby-version: 2.7
17
+ - name: Cache gems
18
+ uses: actions/cache@v1
19
+ with:
20
+ path: vendor/bundle
21
+ key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
22
+ restore-keys: |
23
+ ${{ runner.os }}-rubocop-
24
+ - name: Install gems
25
+ run: |
26
+ bundle config path vendor/bundle
27
+ bundle install --jobs 4 --retry 3
28
+ - name: Run rubocop
29
+ run: bundle exec rubocop --lint
@@ -0,0 +1,22 @@
1
+ name: tests
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ strategy:
10
+ matrix:
11
+ ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
12
+
13
+ steps:
14
+ - uses: actions/checkout@v2
15
+ - name: Set up Ruby ${{ matrix.ruby }}
16
+ uses: ruby/setup-ruby@v1
17
+ with:
18
+ ruby-version: ${{ matrix.ruby }}
19
+ - name: Install dependencies
20
+ run: bundle install --jobs 4
21
+ - name: Test with Rake
22
+ run: bundle exec rake
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  .vscode
19
20
  bbin/
20
21
  binstubs/*
data/.gouteur.yml ADDED
@@ -0,0 +1,2 @@
1
+ repos:
2
+ - uri: https://github.com/jaynetics/js_regex
data/.rubocop.yml ADDED
@@ -0,0 +1,17 @@
1
+ AllCops:
2
+ Exclude:
3
+ - '**/doc/*'
4
+ - '**/pkg/*'
5
+ - '**/spec/ruby-spec/**/*'
6
+ - '**/vendor/**/*' # vendored dependencies
7
+ NewCops: enable
8
+ RubyInterpreters:
9
+ - ruby
10
+ - rake
11
+ TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
12
+
13
+ Lint/AmbiguousOperatorPrecedence:
14
+ Enabled: false
15
+
16
+ Lint/AmbiguousRegexpLiteral:
17
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -1,50 +1,86 @@
1
- Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
1
+ Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
2
2
 
3
+ ```
4
+ Counting non-letters
5
+
6
+ CharacterSet#count_in: 9472902.2 i/s
7
+ String#count: 2221799.9 i/s - 4.26x slower
8
+ ```
3
9
  ```
4
10
  Detecting non-whitespace
5
11
 
6
- CharacterSet#cover?: 13244577.7 i/s
7
- Regexp#match?: 8027017.5 i/s - 1.65x slower
12
+ CharacterSet#cover?: 12388427.2 i/s
13
+ Regexp#match?: 7901676.8 i/s - 1.57x slower
8
14
  ```
9
15
  ```
10
16
  Detecting non-letters
11
17
 
12
- CharacterSet#cover?: 13082940.8 i/s
13
- Regexp#match?: 5372589.2 i/s - 2.44x slower
18
+ CharacterSet#cover?: 12263689.1 i/s
19
+ Regexp#match?: 4940889.9 i/s - 2.48x slower
14
20
  ```
15
21
  ```
16
22
  Removing whitespace
17
23
 
18
- CharacterSet#delete_in: 389315.6 i/s
19
- String#gsub: 223773.5 i/s - 1.74x slower
24
+ CharacterSet#delete_in: 2406722.6 i/s
25
+ String#gsub: 235760.3 i/s - 10.21x slower
20
26
  ```
21
27
  ```
22
28
  Removing whitespace, emoji and umlauts
23
29
 
24
- CharacterSet#delete_in: 470239.3 i/s
25
- String#gsub: 278679.4 i/s - 1.69x slower
30
+ CharacterSet#delete_in: 1653607.6 i/s
31
+ String#gsub: 272782.9 i/s - 6.06x slower
26
32
  ```
27
33
  ```
28
34
  Removing non-whitespace
29
35
 
30
- CharacterSet#keep_in: 1138461.0 i/s
31
- String#gsub: 235287.4 i/s - 4.84x slower
36
+ CharacterSet#keep_in: 2671038.2 i/s
37
+ String#gsub: 242551.0 i/s - 11.01x slower
32
38
  ```
33
39
  ```
34
40
  Extracting emoji
35
41
 
36
- CharacterSet#keep_in: 1474472.0 i/s
37
- String#gsub: 212269.6 i/s - 6.95x slower
42
+ CharacterSet#keep_in: 1726496.5 i/s
43
+ String#gsub: 215609.2 i/s - 8.01x slower
44
+ ```
45
+ ```
46
+ Extracting emoji to an Array
47
+
48
+ CharacterSet#scan: 2373856.1 i/s
49
+ String#scan: 480000.5 i/s - 4.95x slower
38
50
  ```
39
51
  ```
40
52
  Detecting whitespace
41
53
 
42
- CharacterSet#used_by?: 13063108.7 i/s
43
- Regexp#match?: 7215075.0 i/s - 1.81x slower
54
+ CharacterSet#used_by?: 11988328.7 i/s
55
+ Regexp#match?: 6758146.8 i/s - 1.77x slower
44
56
  ```
45
57
  ```
46
58
  Detecting emoji in a large string
47
59
 
48
- CharacterSet#used_by?: 246527.7 i/s
49
- Regexp#match?: 92956.5 i/s - 2.65x slower
60
+ CharacterSet#used_by?: 288223.3 i/s
61
+ Regexp#match?: 102384.2 i/s - 2.82x slower
62
+ ```
63
+ ```
64
+ Adding entries
65
+
66
+ CharacterSet#add: 2538251.2 i/s
67
+ SortedSet#add: 443925.9 i/s - 5.72x slower
68
+ ```
69
+ ```
70
+ Removing entries
71
+
72
+ CharacterSet#delete: 2487620.8 i/s
73
+ SortedSet#delete: 628816.1 i/s - 3.96x slower
74
+ ```
75
+ ```
76
+ Merging entries
77
+
78
+ CharacterSet#merge: 551.6 i/s
79
+ SortedSet#merge: 1.4 i/s - 393.59x slower
80
+ ```
81
+ ```
82
+ Getting the min and max
83
+
84
+ CharacterSet#minmax: 636890.7 i/s
85
+ SortedSet#minmax: 254.1 i/s - 2506.20x slower
50
86
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,60 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [1.5.0] - 2021-12-05
8
+
9
+ ### Added
10
+
11
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
12
+ - latest unicode case-folding data (for `#case_insensitive`)
13
+ - support for passing any Enumerable to `#disjoint?`, `#intersect?`
14
+ - this matches recent broadening of these methods in `ruby/set`
15
+ - new instance method `#secure_token` (see README)
16
+ - class method `::of` now accepts more than one `String`
17
+ - `CharacterSet::ExpressionConverter` can now build output of any Set-like class
18
+
19
+ ### Fixed
20
+
21
+ - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
22
+ - it used to return a regular `CharacterSet`
23
+
24
+ ## [1.4.1] - 2021-01-10
25
+
26
+ ### Fixed
27
+ - multiple fixes for Ruby 3
28
+ - fixed segfault for some `String` manipulation cases
29
+ - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
30
+ - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
31
+
32
+ ## [1.4.0] - 2019-06-07
33
+
34
+ ### Added
35
+ - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
36
+ - allows for much shorter astral plane representations e.g. in JavaScript
37
+ - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
38
+ - improved performance for `#to_s` / `Writer` by avoiding the buggy `Range#minmax`
39
+
40
+ ### Fixed
41
+ - '/' is now escaped by default when stringifying so as to work with //-regexp syntax
42
+
43
+ ## [1.3.0] - 2019-04-26
44
+
45
+ ### Added
46
+ - improved `String` manipulation speed
47
+ - improved initialization and `#merge` speed when passing a large `Range`
48
+ - reduced memory consumption by > 90% for most use cases via dynamic resizing
49
+ - before, every set instance required 136 KB for codepoints
50
+ - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
51
+ - `#count_in` and `#scan_in` methods for `String` interaction
52
+ - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
53
+ - conversion methods `#assigned_part`, `#valid_part`
54
+ - sectioning methods `#ascii_part`, `#plane(n)`
55
+ - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
56
+
57
+ ### Fixed
58
+ - `#count` now supports passing an argument or block as usual
59
+ - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
60
+
7
61
  ## [1.2.0] - 2019-04-02
8
62
 
9
63
  ### Added
data/README.md CHANGED
@@ -1,12 +1,17 @@
1
1
  # CharacterSet
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
- [![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
4
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
6
+ [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
5
7
 
6
- A gem to build, read, write and compare sets of Unicode codepoints.
8
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
9
+
10
+ It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
7
11
 
8
12
  Many parts can be used independently, e.g.:
9
13
  - `CharacterSet::Character`
14
+ - `CharacterSet::ExpressionConverter`
10
15
  - `CharacterSet::Parser`
11
16
  - `CharacterSet::Writer`
12
17
  - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
@@ -37,7 +42,7 @@ CharacterSet.parse('[a-c]')
37
42
  CharacterSet.parse('\U00000061-\U00000063')
38
43
  ```
39
44
 
40
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
45
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
41
46
 
42
47
  ```ruby
43
48
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -49,7 +54,7 @@ require 'character_set/core_ext/regexp_ext'
49
54
 
50
55
  ### Predefined utility sets
51
56
 
52
- `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
57
+ `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
53
58
 
54
59
  ```ruby
55
60
  CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +65,7 @@ CharacterSet.non_ascii
60
65
 
61
66
  ### Interact with Strings
62
67
 
63
- CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
68
+ `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
64
69
 
65
70
  `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
66
71
 
@@ -71,6 +76,7 @@ CharacterSet.ascii.cover?('Tr') # => true
71
76
  ```
72
77
 
73
78
  `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
79
+
74
80
  ```ruby
75
81
  string = 'Tüür'
76
82
 
@@ -84,6 +90,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
84
90
  string # => ''
85
91
  ```
86
92
 
93
+ `#count_in` and `#scan` can replace `String#count` and `String#scan`:
94
+
95
+ ```ruby
96
+ CharacterSet.non_ascii.count_in('Tüür') # => 2
97
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
98
+ ```
99
+
87
100
  There is also a core extension for String interaction.
88
101
  ```ruby
89
102
  require 'character_set/core_ext/string_ext'
@@ -100,7 +113,7 @@ require 'character_set/core_ext/string_ext'
100
113
 
101
114
  ### Manipulate
102
115
 
103
- Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
116
+ Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
104
117
 
105
118
  Where appropriate, methods take both chars and codepoints, e.g.:
106
119
 
@@ -122,13 +135,13 @@ non_a.include?('ü') # => true
122
135
 
123
136
  # surrogate pair halves are not included by default
124
137
  CharacterSet['a'].inversion(include_surrogates: true)
125
- # => #<CharacterSet (size: 1114111)>
138
+ # => #<CharacterSet (size: 1114112)>
126
139
  ```
127
140
 
128
141
  `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
129
142
 
130
143
  ```ruby
131
- CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
144
+ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
132
145
  ```
133
146
 
134
147
  ### Write
@@ -156,18 +169,44 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
156
169
  # disable abbreviation (grouping of codepoints in ranges)
157
170
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
158
171
 
159
- # for full js regex compatibility in case of astral members:
160
- set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
172
+ # astral members require some trickery if we want to target environments
173
+ # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
174
+ set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
175
+
176
+ # Use #to_s_with_surrogate_ranges e.g. for JavaScript:
177
+ set.to_s_with_surrogate_ranges
178
+ # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
179
+
180
+ # Or use #to_s_with_surrogate_alternation if such surrogate set pairs
181
+ # don't work in your target environment:
182
+ set.to_s_with_surrogate_alternation
183
+ # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
184
+ ```
185
+
186
+ ### Other features
187
+
188
+ #### Secure tokens
189
+
190
+ Generate secure random strings of characters from a set:
191
+
192
+ ```ruby
193
+ CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
194
+ CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
161
195
  ```
162
196
 
163
- ### Unicode plane methods
197
+ #### Unicode planes
164
198
 
165
- There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
199
+ There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
166
200
  ```Ruby
201
+ CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
202
+ CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
203
+ CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
204
+ CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
167
205
  CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
168
206
  CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
169
207
  CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
170
208
  CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
209
+ CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
171
210
  CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
172
211
  CharacterSet::Character.new('a').plane # => 0
173
212
  ```
data/Rakefile CHANGED
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
7
7
 
8
8
  task default: :spec
9
9
 
10
+ namespace :spec do
11
+ task :quick do
12
+ ENV['SKIP_MEMSAFETY_SPECS'] = '1'
13
+ Rake::Task[:spec].invoke
14
+ end
15
+ end
16
+
10
17
  Rake::ExtensionTask.new('character_set') do |ext|
11
18
  ext.lib_dir = 'lib/character_set'
12
19
  end
@@ -106,27 +113,22 @@ task :sync_casefold_data do
106
113
  hash[from] = to if type == 'C'
107
114
  end.sort
108
115
 
109
- File.open(dst_path, 'w') do |f|
110
- f.puts <<-C
111
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
112
- // -*-C-*-
113
-
114
- typedef struct casefold_mapping {
115
- unsigned long from;
116
- unsigned long to;
117
- } casefold_mapping;
118
-
119
- #define CASEFOLD_COUNT #{mapping.size}
116
+ content = File.read(dst_path + '.tmpl')
117
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
120
119
 
121
- static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
122
- C
123
-
124
- mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
120
+ File.write(dst_path, content)
121
+ File.unlink(src_path)
122
+ end
125
123
 
126
- f.puts '};'
124
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
125
+ task :sync_predefined_sets do
126
+ %w[assigned emoji whitespace].each do |prop|
127
+ require 'regexp_property_values'
128
+ ranges = RegexpPropertyValues[prop].matched_ranges
129
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
+ File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
127
131
  end
128
-
129
- File.unlink(src_path)
130
132
  end
131
133
 
132
134
  desc 'Run all IPS benchmarks'
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ tr = '^A-Za-z'
5
+ cs = CharacterSet.non_ascii_letter
6
+
7
+ benchmark(
8
+ caption: 'Counting non-letters',
9
+ cases: {
10
+ 'String#count' => -> { str.count(tr) },
11
+ 'CharacterSet#count_in' => -> { cs.count_in(str) },
12
+ }
13
+ )
@@ -14,7 +14,7 @@ benchmark(
14
14
 
15
15
  str = 'Lörem ipsüm ⛷ et dölörem'
16
16
  rx = /[\s\p{emoji}äüö]/
17
- cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
18
 
19
19
  benchmark(
20
20
  caption: 'Removing whitespace, emoji and umlauts',
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum ⛷ et dolorem'
4
+ rx = /\p{emoji}/
5
+ cs = CharacterSet.emoji
6
+
7
+ benchmark(
8
+ caption: 'Extracting emoji to an Array',
9
+ cases: {
10
+ 'String#scan' => -> { str.scan(rx) },
11
+ 'CharacterSet#scan' => -> { cs.scan(str) },
12
+ }
13
+ )
data/benchmarks/shared.rb CHANGED
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  require 'benchmark/ips'
5
5
  require 'character_set'
6
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
7
+ require 'sorted_set'
8
+ else
9
+ require 'set'
10
+ end
6
11
 
7
12
  def benchmark(caption: nil, cases: {})
8
13
  puts caption
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet[]
4
+ ss = SortedSet[]
5
+
6
+ benchmark(
7
+ caption: 'Adding entries',
8
+ cases: {
9
+ 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
10
+ 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0x10FFFF)
4
+ ss = SortedSet.new(0..0x10FFFF)
5
+
6
+ benchmark(
7
+ caption: 'Removing entries',
8
+ cases: {
9
+ 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
10
+ 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,15 @@
1
+ require_relative './shared'
2
+
3
+ cs1 = CharacterSet.new(0...0x88000)
4
+ cs2 = CharacterSet.new(0x88000..0x10FFFF)
5
+
6
+ ss1 = SortedSet.new(0...0x88000)
7
+ ss2 = SortedSet.new(0x88000..0x10FFFF)
8
+
9
+ benchmark(
10
+ caption: 'Merging entries',
11
+ cases: {
12
+ 'CharacterSet#merge' => -> { cs1.merge(cs2) },
13
+ 'SortedSet#merge' => -> { ss1.merge(ss2) },
14
+ }
15
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0xFFFF)
4
+ ss = SortedSet.new(0..0xFFFF)
5
+
6
+ benchmark(
7
+ caption: 'Getting the min and max',
8
+ cases: {
9
+ 'CharacterSet#minmax' => -> { cs.minmax },
10
+ 'SortedSet#minmax' => -> { ss.minmax },
11
+ }
12
+ )
data/bin/console CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'bundler/setup'
4
4
 
5
+ `bundle exec rake compile`
6
+
5
7
  require 'character_set'
6
8
  require 'character_set/core_ext'
7
9
  require 'character_set/pure'
@@ -22,11 +22,24 @@ Gem::Specification.new do |s|
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
24
 
25
+ # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
+ # This dependency is only used if the C extension is unavailable.
27
+ # JRuby has it in the stdlib.
28
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
+ s.add_dependency 'sorted_set', '~> 1.0'
30
+ end
31
+
25
32
  s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
- s.add_development_dependency 'rake', '~> 12.0'
27
- s.add_development_dependency 'rake-compiler', '~> 1.0'
33
+ s.add_development_dependency 'get_process_mem', '~> 0.2.3'
34
+ s.add_development_dependency 'rake', '~> 13.0'
35
+ s.add_development_dependency 'rake-compiler', '~> 1.1'
28
36
  s.add_development_dependency 'range_compressor', '~> 1.0'
29
- s.add_development_dependency 'regexp_parser', '~> 1.3'
30
- s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
37
+ s.add_development_dependency 'regexp_parser', '~> 2.1'
38
+ s.add_development_dependency 'regexp_property_values', '~> 1.0'
31
39
  s.add_development_dependency 'rspec', '~> 3.8'
40
+ if RUBY_VERSION.to_f >= 2.7
41
+ s.add_development_dependency 'codecov', '~> 0.2.12'
42
+ s.add_development_dependency 'gouteur', '~> 1.0.0'
43
+ s.add_development_dependency 'rubocop', '~> 1.8'
44
+ end
32
45
  end