character_set 1.4.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +1 -1
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +28 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +20 -0
  9. data/BENCHMARK.md +35 -31
  10. data/CHANGELOG.md +64 -1
  11. data/Gemfile +15 -0
  12. data/LICENSE.txt +1 -1
  13. data/README.md +25 -9
  14. data/Rakefile +2 -120
  15. data/character_set.gemspec +0 -10
  16. data/ext/character_set/character_set.c +123 -121
  17. data/ext/character_set/unicode_casefold_table.h +44 -1
  18. data/lib/character_set/core_ext/regexp_ext.rb +9 -1
  19. data/lib/character_set/core_ext/string_ext.rb +2 -2
  20. data/lib/character_set/expression_converter.rb +40 -56
  21. data/lib/character_set/parser.rb +8 -4
  22. data/lib/character_set/predefined_sets/assigned.cps +110 -78
  23. data/lib/character_set/predefined_sets/emoji.cps +16 -14
  24. data/lib/character_set/predefined_sets.rb +11 -0
  25. data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
  26. data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
  27. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
  28. data/lib/character_set/ruby_fallback.rb +18 -2
  29. data/lib/character_set/set_method_adapters.rb +4 -3
  30. data/lib/character_set/shared_methods.rb +25 -11
  31. data/lib/character_set/version.rb +1 -1
  32. data/tasks/benchmark.rake +20 -0
  33. data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
  34. data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
  35. data/tasks/benchmarks/shared.rb +28 -0
  36. data/tasks/sync_casefold_data.rake +20 -0
  37. data/tasks/sync_predefined_sets.rake +9 -0
  38. data/tasks/sync_ruby_spec.rake +65 -0
  39. metadata +29 -146
  40. data/.travis.yml +0 -9
  41. data/benchmarks/shared.rb +0 -26
  42. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  43. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  44. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  45. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  46. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  47. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  48. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  49. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2b7f1c6191498176e1a407177b58b54483f5da42070a455bc79cdf93c5aee39c
4
- data.tar.gz: f7852ffc6bf8198d8806f737a3bfab7471543b741b2e73b0d2b6a0184e2023c2
3
+ metadata.gz: ebb6792f685df02534f1ef04a92d7f0c5fdcb482e5aaa4856d7a39726e17f007
4
+ data.tar.gz: c6630aab9b6506c46a970ba83c257cd753f8f76760b6ce8d2639f51efba83eeb
5
5
  SHA512:
6
- metadata.gz: 189024c25b3297b7beee9c2d32c6275c5d8e14f62fe828c1255e5c070a34c390d5a644f98607dac913120bb4d46abe143b98a2a7f2e422b876aaffa69958993e
7
- data.tar.gz: 2d552dfa89213e2ec7479128480cc9fdc5210116fa5fed9a42cf9d020a9bea774f2530f3bd64021615280fbfa16b18c08aab87a3e28a6328a7437cfa1e30b731
6
+ metadata.gz: 4c773a0546d05939d0b295e50355c6efe870a1ed74901d63c24097ff598d4a43bcd00ce2d03fb492a48fd9c03968a79ee78b789d92836843d6621dca3e8f313c
7
+ data.tar.gz: 560d3c3aa3f7e4daac3b6d2c89fb9dd6840777fa4d5896fb33564023ef745d81a7e4d0e51fe0ba42f6cd4504bc0b088657cd4ef1ab15d213aa1bb096ba404542
data/.gitattributes CHANGED
@@ -1,3 +1,3 @@
1
1
  *.cps linguist-detectable=false
2
2
  benchmarks/* linguist-detectable=false
3
- spec/ruby-spec/* linguist-vendored
3
+ spec/* linguist-detectable=false
data/.github/workflows/gouteur.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: gouteur
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - name: Set up Ruby
12
+ uses: ruby/setup-ruby@v1
13
+ with:
14
+ ruby-version: 3.3
15
+ - name: Prepare
16
+ run: |
17
+ bundle install --jobs 4
18
+ bundle exec rake compile
19
+ - name: Test
20
+ run: bundle exec gouteur
data/.github/workflows/lint.yml ADDED
@@ -0,0 +1,29 @@
1
+ # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2
+
3
+ name: rubocop linting
4
+
5
+ on: [push, pull_request]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby
14
+ uses: ruby/setup-ruby@v1
15
+ with:
16
+ ruby-version: 3.3
17
+ - name: Cache gems
18
+ uses: actions/cache@v1
19
+ with:
20
+ path: vendor/bundle
21
+ key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
22
+ restore-keys: |
23
+ ${{ runner.os }}-rubocop-
24
+ - name: Install gems
25
+ run: |
26
+ bundle config path vendor/bundle
27
+ bundle install --jobs 4 --retry 3
28
+ - name: Run rubocop
29
+ run: bundle exec rubocop --lint
data/.github/workflows/tests.yml ADDED
@@ -0,0 +1,28 @@
1
+ name: tests
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+ schedule:
7
+ - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
8
+
9
+ jobs:
10
+ build:
11
+ runs-on: ubuntu-latest
12
+
13
+ strategy:
14
+ matrix:
15
+ ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v2
19
+ - name: Set up Ruby ${{ matrix.ruby }}
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: ${{ matrix.ruby }}
23
+ - name: Install dependencies
24
+ run: bundle install --jobs 4
25
+ - name: Test with Rake
26
+ run: bundle exec rake
27
+ - uses: codecov/codecov-action@v3
28
+ if: matrix.ruby == '3.2'
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  .vscode
19
20
  bbin/
20
21
  binstubs/*
data/.gouteur.yml ADDED
@@ -0,0 +1,2 @@
1
+ repos:
2
+ - uri: https://github.com/jaynetics/js_regex
data/.rubocop.yml ADDED
@@ -0,0 +1,20 @@
1
+ AllCops:
2
+ Exclude:
3
+ - '**/doc/*'
4
+ - '**/pkg/*'
5
+ - '**/spec/ruby-spec/**/*'
6
+ - '**/vendor/**/*' # vendored dependencies
7
+ NewCops: enable
8
+ RubyInterpreters:
9
+ - ruby
10
+ - rake
11
+ TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
12
+
13
+ Lint/AmbiguousOperatorPrecedence:
14
+ Enabled: false
15
+
16
+ Lint/AmbiguousRegexpLiteral:
17
+ Enabled: false
18
+
19
+ Metrics:
20
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -1,86 +1,90 @@
1
- Results of `rake:benchmark` on ruby 2.6.2p47 (2019-03-13 revision 67232) [x86_64-darwin18]
1
+ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
2
2
 
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 12253693.8 i/s
7
- String#count: 1737741.7 i/s - 7.05x slower
6
+ CharacterSet#count_in: 14627506.2 i/s
7
+ String#count: 3859777.0 i/s - 3.79x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 14058351.9 i/s
13
- Regexp#match?: 7907608.1 i/s - 1.78x slower
12
+ CharacterSet#cover?: 17241902.8 i/s
13
+ Regexp#match?: 12971122.6 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 13341301.6 i/s
19
- Regexp#match?: 5187453.3 i/s - 2.57x slower
18
+ CharacterSet#cover?: 17243472.3 i/s
19
+ Regexp#match?: 7957626.9 i/s - 2.17x slower
20
20
  ```
21
21
  ```
22
- Removing whitespace
22
+ Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 2523184.0 i/s
25
- String#gsub: 225804.7 i/s - 11.17x slower
24
+ CharacterSet#delete_in: 6190975.7 i/s
25
+ String#tr: 4722716.6 i/s - 1.31x slower
26
+ String#gsub: 214239.5 i/s - 28.90x slower
26
27
  ```
27
28
  ```
28
29
  Removing whitespace, emoji and umlauts
29
30
 
30
- CharacterSet#delete_in: 1712208.6 i/s
31
- String#gsub: 278508.8 i/s - 6.15x slower
31
+ CharacterSet#delete_in: 5890471.8 i/s
32
+ String#tr: 348506.8 i/s - 16.90x slower
33
+ String#gsub: 318268.3 i/s - 18.51x slower
32
34
  ```
33
35
  ```
34
36
  Removing non-whitespace
35
37
 
36
- CharacterSet#keep_in: 2760158.1 i/s
37
- String#gsub: 232797.7 i/s - 11.86x slower
38
+ CharacterSet#keep_in: 7396898.0 i/s
39
+ String#gsub: 208809.7 i/s - 35.42x slower
40
+ String#tr: 13.1 i/s - 564682.50x slower
38
41
  ```
39
42
  ```
40
- Extracting emoji
43
+ Keeping only emoji
41
44
 
42
- CharacterSet#keep_in: 1775758.8 i/s
43
- String#gsub: 217649.9 i/s - 8.16x slower
45
+ CharacterSet#keep_in: 7022741.1 i/s
46
+ String#gsub: 180939.6 i/s - 38.81x slower
47
+ String#tr: 13.1 i/s - 536724.50x slower
44
48
  ```
45
49
  ```
46
50
  Extracting emoji to an Array
47
51
 
48
- CharacterSet#scan: 2579030.8 i/s
49
- String#scan: 545107.0 i/s - 4.73x slower
52
+ CharacterSet#scan: 3023176.8 i/s
53
+ String#scan: 893225.8 i/s - 3.38x slower
50
54
  ```
51
55
  ```
52
56
  Detecting whitespace
53
57
 
54
- CharacterSet#used_by?: 13847689.0 i/s
55
- Regexp#match?: 7533275.2 i/s - 1.84x slower
58
+ CharacterSet#used_by?: 17284025.9 i/s
59
+ Regexp#match?: 11847064.5 i/s - 1.46x slower
56
60
  ```
57
61
  ```
58
62
  Detecting emoji in a large string
59
63
 
60
- CharacterSet#used_by?: 246527.7 i/s
61
- Regexp#match?: 92956.5 i/s - 2.65x slower
64
+ CharacterSet#used_by?: 341386.1 i/s
65
+ Regexp#match?: 183121.6 i/s - 1.86x slower
62
66
  ```
63
67
  ```
64
68
  Adding entries
65
69
 
66
- CharacterSet#add: 3102081.7 i/s
67
- SortedSet#add: 1897464.8 i/s - 1.63x slower
70
+ CharacterSet#add: 4989762.3 i/s
71
+ SortedSet#add: 1157911.7 i/s - 4.31x slower
68
72
  ```
69
73
  ```
70
74
  Removing entries
71
75
 
72
- CharacterSet#delete: 3240924.1 i/s
73
- SortedSet#delete: 2887493.9 i/s - 1.12x slower
76
+ CharacterSet#delete: 4996703.6 i/s
77
+ SortedSet#delete: 4177401.5 i/s - same-ish
74
78
  ```
75
79
  ```
76
80
  Merging entries
77
81
 
78
- CharacterSet#merge: 536.8 i/s
79
- SortedSet#merge: 12.5 i/s - 42.78x slower
82
+ CharacterSet#merge: 666.7 i/s
83
+ SortedSet#merge: 4.0 i/s - 167.84x slower
80
84
  ```
81
85
  ```
82
86
  Getting the min and max
83
87
 
84
- CharacterSet#minmax: 4111960.8 i/s
85
- SortedSet#minmax: 756.4 i/s - 5436.39x slower
88
+ CharacterSet#minmax: 1596470.9 i/s
89
+ SortedSet#minmax: 866.4 i/s - 1842.74x slower
86
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,69 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [Unreleased]
8
+
9
+ ## [1.8.0] - 2024-01-07
10
+
11
+ ### Added
12
+
13
+ - support for `#<=>` and `#join`, which were added to `set` in the meantime
14
+ - support for getting the (overall) character set of a Regexp with multiple expressions
15
+ - support for global and local case-insensitivity in Regexp inputs
16
+ - `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
17
+
18
+ ## [1.7.0] - 2023-05-12
19
+
20
+ ### Added
21
+
22
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
23
+
24
+ ### Fixed
25
+
26
+ - fixed processing of Strings that are not ASCII- or UTF8-encoded
27
+ - removed dependency on `set` and `sorted_set`
28
+ - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
29
+
30
+ ## [1.6.0] - 2022-02-16
31
+
32
+ ### Added
33
+
34
+ - `::of` now supports both `String` and `Regexp` arguments
35
+
36
+ ### Fixed
37
+
38
+ - fixed segfault during `String` manipulation on Ruby 3.2.0-dev
39
+ - improved performance for `String` manipulation
40
+ - allow usage in Ractors
41
+ - predefined sets must be pre-initialized for this, though
42
+ - e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
43
+ - call them once in the main Ractor to trigger initialization
44
+
45
+ ## [1.5.0] - 2021-12-05
46
+
47
+ ### Added
48
+
49
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
50
+ - latest unicode case-folding data (for `#case_insensitive`)
51
+ - support for passing any Enumerable to `#disjoint?`, `#intersect?`
52
+ - this matches recent broadening of these methods in `ruby/set`
53
+ - new instance method `#secure_token` (see README)
54
+ - class method `::of` now accepts more than one `String`
55
+ - `CharacterSet::ExpressionConverter` can now build output of any Set-like class
56
+
57
+ ### Fixed
58
+
59
+ - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
60
+ - it used to return a regular `CharacterSet`
61
+
62
+ ## [1.4.1] - 2021-01-10
63
+
64
+ ### Fixed
65
+ - multiple fixes for Ruby 3
66
+ - fixed segfault for some `String` manipulation cases
67
+ - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
68
+ - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
69
+
7
70
  ## [1.4.0] - 2019-06-07
8
71
 
9
72
  ### Added
@@ -23,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
23
86
  - reduced memory consumption by > 90% for most use cases via dynamic resizing
24
87
  - before, every set instance required 136 KB for codepoints
25
88
  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
26
- - `#count_in` and `#scan_in` methods for `String` interaction
89
+ - `#count_in` and `#scan` methods for `String` interaction
27
90
  - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
28
91
  - conversion methods `#assigned_part`, `#valid_part`
29
92
  - sectioning methods `#ascii_part`, `#plane(n)`
data/Gemfile CHANGED
@@ -4,3 +4,18 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
5
  # Specify your gem's dependencies in character_set.gemspec
6
6
  gemspec
7
+
8
+ gem 'benchmark-ips', '~> 2.7'
9
+ gem 'get_process_mem', '~> 0.2.3'
10
+ gem 'rake', '~> 13.1'
11
+ gem 'rake-compiler', '~> 1.1'
12
+ gem 'range_compressor', '~> 1.0'
13
+ gem 'regexp_parser', '~> 2.9'
14
+ gem 'regexp_property_values', '~> 1.5'
15
+ gem 'rspec', '~> 3.8'
16
+ gem 'warning', '~> 1.3'
17
+ if RUBY_VERSION.to_f >= 3.0
18
+ gem 'gouteur', '~> 1.0.0'
19
+ gem 'rubocop', '~> 1.59'
20
+ gem 'simplecov-cobertura', require: false
21
+ end
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2018 Janosch Müller
3
+ Copyright (c) 2018-2023 Janosch Müller
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -1,18 +1,21 @@
1
1
  # CharacterSet
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
- [![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
5
- [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
4
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
6
+ [![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set)
6
7
 
7
- This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints.
8
9
 
9
- It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
10
+ It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
11
+
12
+ It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
10
13
 
11
14
  Many parts can be used independently, e.g.:
12
15
  - `CharacterSet::Character`
16
+ - `CharacterSet::ExpressionConverter`
13
17
  - `CharacterSet::Parser`
14
18
  - `CharacterSet::Writer`
15
- - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
16
19
 
17
20
  ## Usage
18
21
 
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
40
43
  CharacterSet.parse('\U00000061-\U00000063')
41
44
  ```
42
45
 
43
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
46
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
44
47
 
45
48
  ```ruby
49
+ CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
46
50
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
47
51
 
48
52
  require 'character_set/core_ext/regexp_ext'
@@ -92,7 +96,7 @@ string # => ''
92
96
 
93
97
  ```ruby
94
98
  CharacterSet.non_ascii.count_in('Tüür') # => 2
95
- CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
99
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
96
100
  ```
97
101
 
98
102
  There is also a core extension for String interaction.
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
143
147
  ```
144
148
 
145
149
  ### Write
150
+
146
151
  ```ruby
147
152
  set = CharacterSet['a', 'b', 'c', 'j', '-']
148
153
 
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
181
186
  # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
182
187
  ```
183
188
 
184
- ### Unicode plane methods
189
+ ### Other features
190
+
191
+ #### Secure tokens
192
+
193
+ Generate secure random strings of characters from a set:
194
+
195
+ ```ruby
196
+ CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
197
+ CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
198
+ ```
199
+
200
+ #### Unicode planes
185
201
 
186
202
  There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
187
203
  ```Ruby
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
198
214
  CharacterSet::Character.new('a').plane # => 0
199
215
  ```
200
216
 
201
- ### Contributions
217
+ ## Contributions
202
218
 
203
219
  Feel free to send suggestions, point out issues, or submit pull requests.
data/Rakefile CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
3
3
  require 'rubygems/package_task'
4
4
  require 'rake/extensiontask'
5
5
 
6
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
7
+
6
8
  RSpec::Core::RakeTask.new(:spec)
7
9
 
8
10
  task default: :spec
@@ -34,126 +36,6 @@ end
34
36
 
35
37
  task package: 'java:gem'
36
38
 
37
- desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
38
- task :sync_ruby_spec do
39
- require 'fileutils'
40
-
41
- variants = {
42
- 'CharacterSet' => './spec/ruby-spec/library/character_set',
43
- 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
44
- }
45
-
46
- # download fresh specs from ruby/spec repository
47
- variants.each do |_, dir|
48
- FileUtils.rm_rf(dir) if File.exist?(dir)
49
- `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
50
- end
51
-
52
- # make copies for each CharacterSet variant
53
- base = variants.first[1]
54
- variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
55
-
56
- # adapt specs to work with CharacterSet
57
- variants.each do |class_name, dir|
58
- Dir["#{dir}/**/*.rb"].each do |spec|
59
- # ignore some tests that do not apply or are covered otherwise
60
- if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
61
- File.delete(spec)
62
- next
63
- end
64
-
65
- adapted_content =
66
- File.read(spec).
67
- # adapt class name
68
- gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
- gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
- # get shared specs from a single shared dir at the parent level
71
- gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
- # make 'mspec' syntax rspec-compatible
73
- gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
- gsub(/be_(false|true)/, 'be \1').
75
- gsub('stub!', 'stub').
76
- gsub('mock', 'double').
77
- gsub('@method', 'method').
78
- # remove unneeded requires
79
- gsub(/require 'set'\n/, '').
80
- gsub(/require.*spec_helper.*\n/, '').
81
- gsub(/\A\n+/, '').
82
- # make examples use Integers/codepoints
83
- gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
- gsub('"one"', '1').
85
- gsub('"two"', '2').
86
- gsub('"three"', '3').
87
- gsub('"four"', '4').
88
- gsub('"five"', '5').
89
- gsub(/x.(size|length) == 3/, 'x != 3').
90
- gsub(/x.(size|length) != 3/, 'x == 3').
91
- gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
92
-
93
- File.open(spec, 'w') { |f| f.puts adapted_content }
94
- end
95
- end
96
-
97
- # keep only one copy of the shared specs, at the parent level
98
- FileUtils.rm_rf(base + '/../shared')
99
- FileUtils.mv(base + '/shared', base + '/../')
100
- variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
101
- end
102
-
103
- desc 'Download unicode casefold data and write new C header file'
104
- task :sync_casefold_data do
105
- src_path = './CaseFolding.txt'
106
- dst_path = './ext/character_set/unicode_casefold_table.h'
107
-
108
- `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
109
-
110
- mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
111
- from, type, to = line.split(/\s*;\s*/).first(3)
112
- # type 'C' stands for 'common', excludes mappings to multiple chars
113
- hash[from] = to if type == 'C'
114
- end.sort
115
-
116
- content = File.read(dst_path + '.tmpl')
117
- .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
- .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
119
-
120
- File.write(dst_path, content)
121
- File.unlink(src_path)
122
- end
123
-
124
- desc 'Update codepoint data for predefined sets, based on Onigmo'
125
- task :sync_predefined_sets do
126
- %w[assigned emoji whitespace].each do |prop|
127
- require 'regexp_property_values'
128
- ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
- File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
- end
132
- end
133
-
134
- desc 'Run all IPS benchmarks'
135
- task :benchmark do
136
- Dir['./benchmarks/*.rb'].sort.each { |file| require file }
137
- end
138
-
139
- namespace :benchmark do
140
- desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
141
- task :write_to_file do
142
- $store_comparison_results = {}
143
-
144
- Rake.application[:benchmark].invoke
145
-
146
- File.open('BENCHMARK.md', 'w') do |f|
147
- f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
-
149
- $store_comparison_results.each do |caption, result|
150
- f.puts '```', caption, '',
151
- result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
152
- end
153
- end
154
- end
155
- end
156
-
157
39
  unless RUBY_PLATFORM =~ /java/
158
40
  # recompile before benchmarking or running specs
159
41
  task(:benchmark).enhance([:compile])
data/character_set.gemspec CHANGED
@@ -21,14 +21,4 @@ Gem::Specification.new do |s|
21
21
  s.extensions = %w[ext/character_set/extconf.rb]
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
-
25
- s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
- s.add_development_dependency 'codecov', '~> 0.1'
27
- s.add_development_dependency 'get_process_mem', '~> 0.2.3'
28
- s.add_development_dependency 'rake', '~> 12.0'
29
- s.add_development_dependency 'rake-compiler', '~> 1.0'
30
- s.add_development_dependency 'range_compressor', '~> 1.0'
31
- s.add_development_dependency 'regexp_parser', '~> 1.3'
32
- s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
33
- s.add_development_dependency 'rspec', '~> 3.8'
34
24
  end