character_set 1.4.0 → 1.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +1 -1
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +28 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +20 -0
  9. data/BENCHMARK.md +35 -31
  10. data/CHANGELOG.md +64 -1
  11. data/Gemfile +15 -0
  12. data/LICENSE.txt +1 -1
  13. data/README.md +25 -9
  14. data/Rakefile +2 -120
  15. data/character_set.gemspec +0 -10
  16. data/ext/character_set/character_set.c +123 -121
  17. data/ext/character_set/unicode_casefold_table.h +44 -1
  18. data/lib/character_set/core_ext/regexp_ext.rb +9 -1
  19. data/lib/character_set/core_ext/string_ext.rb +2 -2
  20. data/lib/character_set/expression_converter.rb +40 -56
  21. data/lib/character_set/parser.rb +8 -4
  22. data/lib/character_set/predefined_sets/assigned.cps +110 -78
  23. data/lib/character_set/predefined_sets/emoji.cps +16 -14
  24. data/lib/character_set/predefined_sets.rb +11 -0
  25. data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
  26. data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
  27. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
  28. data/lib/character_set/ruby_fallback.rb +18 -2
  29. data/lib/character_set/set_method_adapters.rb +4 -3
  30. data/lib/character_set/shared_methods.rb +25 -11
  31. data/lib/character_set/version.rb +1 -1
  32. data/tasks/benchmark.rake +20 -0
  33. data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
  34. data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
  35. data/tasks/benchmarks/shared.rb +28 -0
  36. data/tasks/sync_casefold_data.rake +20 -0
  37. data/tasks/sync_predefined_sets.rake +9 -0
  38. data/tasks/sync_ruby_spec.rake +65 -0
  39. metadata +29 -146
  40. data/.travis.yml +0 -9
  41. data/benchmarks/shared.rb +0 -26
  42. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  43. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  44. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  45. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  46. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  47. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  48. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  49. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2b7f1c6191498176e1a407177b58b54483f5da42070a455bc79cdf93c5aee39c
4
- data.tar.gz: f7852ffc6bf8198d8806f737a3bfab7471543b741b2e73b0d2b6a0184e2023c2
3
+ metadata.gz: ebb6792f685df02534f1ef04a92d7f0c5fdcb482e5aaa4856d7a39726e17f007
4
+ data.tar.gz: c6630aab9b6506c46a970ba83c257cd753f8f76760b6ce8d2639f51efba83eeb
5
5
  SHA512:
6
- metadata.gz: 189024c25b3297b7beee9c2d32c6275c5d8e14f62fe828c1255e5c070a34c390d5a644f98607dac913120bb4d46abe143b98a2a7f2e422b876aaffa69958993e
7
- data.tar.gz: 2d552dfa89213e2ec7479128480cc9fdc5210116fa5fed9a42cf9d020a9bea774f2530f3bd64021615280fbfa16b18c08aab87a3e28a6328a7437cfa1e30b731
6
+ metadata.gz: 4c773a0546d05939d0b295e50355c6efe870a1ed74901d63c24097ff598d4a43bcd00ce2d03fb492a48fd9c03968a79ee78b789d92836843d6621dca3e8f313c
7
+ data.tar.gz: 560d3c3aa3f7e4daac3b6d2c89fb9dd6840777fa4d5896fb33564023ef745d81a7e4d0e51fe0ba42f6cd4504bc0b088657cd4ef1ab15d213aa1bb096ba404542
data/.gitattributes CHANGED
@@ -1,3 +1,3 @@
1
1
  *.cps linguist-detectable=false
2
2
  benchmarks/* linguist-detectable=false
3
- spec/ruby-spec/* linguist-vendored
3
+ spec/* linguist-detectable=false
data/.github/workflows/gouteur.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: gouteur
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - name: Set up Ruby
12
+ uses: ruby/setup-ruby@v1
13
+ with:
14
+ ruby-version: 3.3
15
+ - name: Prepare
16
+ run: |
17
+ bundle install --jobs 4
18
+ bundle exec rake compile
19
+ - name: Test
20
+ run: bundle exec gouteur
data/.github/workflows/lint.yml ADDED
@@ -0,0 +1,29 @@
1
+ # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2
+
3
+ name: rubocop linting
4
+
5
+ on: [push, pull_request]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby
14
+ uses: ruby/setup-ruby@v1
15
+ with:
16
+ ruby-version: 3.3
17
+ - name: Cache gems
18
+ uses: actions/cache@v1
19
+ with:
20
+ path: vendor/bundle
21
+ key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
22
+ restore-keys: |
23
+ ${{ runner.os }}-rubocop-
24
+ - name: Install gems
25
+ run: |
26
+ bundle config path vendor/bundle
27
+ bundle install --jobs 4 --retry 3
28
+ - name: Run rubocop
29
+ run: bundle exec rubocop --lint
data/.github/workflows/tests.yml ADDED
@@ -0,0 +1,28 @@
1
+ name: tests
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+ schedule:
7
+ - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
8
+
9
+ jobs:
10
+ build:
11
+ runs-on: ubuntu-latest
12
+
13
+ strategy:
14
+ matrix:
15
+ ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v2
19
+ - name: Set up Ruby ${{ matrix.ruby }}
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: ${{ matrix.ruby }}
23
+ - name: Install dependencies
24
+ run: bundle install --jobs 4
25
+ - name: Test with Rake
26
+ run: bundle exec rake
27
+ - uses: codecov/codecov-action@v3
28
+ if: matrix.ruby == '3.2'
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  .vscode
19
20
  bbin/
20
21
  binstubs/*
data/.gouteur.yml ADDED
@@ -0,0 +1,2 @@
1
+ repos:
2
+ - uri: https://github.com/jaynetics/js_regex
data/.rubocop.yml ADDED
@@ -0,0 +1,20 @@
1
+ AllCops:
2
+ Exclude:
3
+ - '**/doc/*'
4
+ - '**/pkg/*'
5
+ - '**/spec/ruby-spec/**/*'
6
+ - '**/vendor/**/*' # vendored dependencies
7
+ NewCops: enable
8
+ RubyInterpreters:
9
+ - ruby
10
+ - rake
11
+ TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
12
+
13
+ Lint/AmbiguousOperatorPrecedence:
14
+ Enabled: false
15
+
16
+ Lint/AmbiguousRegexpLiteral:
17
+ Enabled: false
18
+
19
+ Metrics:
20
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -1,86 +1,90 @@
1
- Results of `rake:benchmark` on ruby 2.6.2p47 (2019-03-13 revision 67232) [x86_64-darwin18]
1
+ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
2
2
 
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 12253693.8 i/s
7
- String#count: 1737741.7 i/s - 7.05x slower
6
+ CharacterSet#count_in: 14627506.2 i/s
7
+ String#count: 3859777.0 i/s - 3.79x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 14058351.9 i/s
13
- Regexp#match?: 7907608.1 i/s - 1.78x slower
12
+ CharacterSet#cover?: 17241902.8 i/s
13
+ Regexp#match?: 12971122.6 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 13341301.6 i/s
19
- Regexp#match?: 5187453.3 i/s - 2.57x slower
18
+ CharacterSet#cover?: 17243472.3 i/s
19
+ Regexp#match?: 7957626.9 i/s - 2.17x slower
20
20
  ```
21
21
  ```
22
- Removing whitespace
22
+ Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 2523184.0 i/s
25
- String#gsub: 225804.7 i/s - 11.17x slower
24
+ CharacterSet#delete_in: 6190975.7 i/s
25
+ String#tr: 4722716.6 i/s - 1.31x slower
26
+ String#gsub: 214239.5 i/s - 28.90x slower
26
27
  ```
27
28
  ```
28
29
  Removing whitespace, emoji and umlauts
29
30
 
30
- CharacterSet#delete_in: 1712208.6 i/s
31
- String#gsub: 278508.8 i/s - 6.15x slower
31
+ CharacterSet#delete_in: 5890471.8 i/s
32
+ String#tr: 348506.8 i/s - 16.90x slower
33
+ String#gsub: 318268.3 i/s - 18.51x slower
32
34
  ```
33
35
  ```
34
36
  Removing non-whitespace
35
37
 
36
- CharacterSet#keep_in: 2760158.1 i/s
37
- String#gsub: 232797.7 i/s - 11.86x slower
38
+ CharacterSet#keep_in: 7396898.0 i/s
39
+ String#gsub: 208809.7 i/s - 35.42x slower
40
+ String#tr: 13.1 i/s - 564682.50x slower
38
41
  ```
39
42
  ```
40
- Extracting emoji
43
+ Keeping only emoji
41
44
 
42
- CharacterSet#keep_in: 1775758.8 i/s
43
- String#gsub: 217649.9 i/s - 8.16x slower
45
+ CharacterSet#keep_in: 7022741.1 i/s
46
+ String#gsub: 180939.6 i/s - 38.81x slower
47
+ String#tr: 13.1 i/s - 536724.50x slower
44
48
  ```
45
49
  ```
46
50
  Extracting emoji to an Array
47
51
 
48
- CharacterSet#scan: 2579030.8 i/s
49
- String#scan: 545107.0 i/s - 4.73x slower
52
+ CharacterSet#scan: 3023176.8 i/s
53
+ String#scan: 893225.8 i/s - 3.38x slower
50
54
  ```
51
55
  ```
52
56
  Detecting whitespace
53
57
 
54
- CharacterSet#used_by?: 13847689.0 i/s
55
- Regexp#match?: 7533275.2 i/s - 1.84x slower
58
+ CharacterSet#used_by?: 17284025.9 i/s
59
+ Regexp#match?: 11847064.5 i/s - 1.46x slower
56
60
  ```
57
61
  ```
58
62
  Detecting emoji in a large string
59
63
 
60
- CharacterSet#used_by?: 246527.7 i/s
61
- Regexp#match?: 92956.5 i/s - 2.65x slower
64
+ CharacterSet#used_by?: 341386.1 i/s
65
+ Regexp#match?: 183121.6 i/s - 1.86x slower
62
66
  ```
63
67
  ```
64
68
  Adding entries
65
69
 
66
- CharacterSet#add: 3102081.7 i/s
67
- SortedSet#add: 1897464.8 i/s - 1.63x slower
70
+ CharacterSet#add: 4989762.3 i/s
71
+ SortedSet#add: 1157911.7 i/s - 4.31x slower
68
72
  ```
69
73
  ```
70
74
  Removing entries
71
75
 
72
- CharacterSet#delete: 3240924.1 i/s
73
- SortedSet#delete: 2887493.9 i/s - 1.12x slower
76
+ CharacterSet#delete: 4996703.6 i/s
77
+ SortedSet#delete: 4177401.5 i/s - same-ish
74
78
  ```
75
79
  ```
76
80
  Merging entries
77
81
 
78
- CharacterSet#merge: 536.8 i/s
79
- SortedSet#merge: 12.5 i/s - 42.78x slower
82
+ CharacterSet#merge: 666.7 i/s
83
+ SortedSet#merge: 4.0 i/s - 167.84x slower
80
84
  ```
81
85
  ```
82
86
  Getting the min and max
83
87
 
84
- CharacterSet#minmax: 4111960.8 i/s
85
- SortedSet#minmax: 756.4 i/s - 5436.39x slower
88
+ CharacterSet#minmax: 1596470.9 i/s
89
+ SortedSet#minmax: 866.4 i/s - 1842.74x slower
86
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,69 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [Unreleased]
8
+
9
+ ## [1.8.0] - 2024-01-07
10
+
11
+ ### Added
12
+
13
+ - support for `#<=>` and `#join`, which were added to `set` in the meantime
14
+ - support for getting the (overall) character set of a Regexp with multiple expressions
15
+ - support for global and local case-insensitivity in Regexp inputs
16
+ - `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
17
+
18
+ ## [1.7.0] - 2023-05-12
19
+
20
+ ### Added
21
+
22
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
23
+
24
+ ### Fixed
25
+
26
+ - fixed processing of Strings that are not ASCII- or UTF8-encoded
27
+ - removed dependency on `set` and `sorted_set`
28
+ - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
29
+
30
+ ## [1.6.0] - 2022-02-16
31
+
32
+ ### Added
33
+
34
+ - `::of` now supports both `String` and `Regexp` arguments
35
+
36
+ ### Fixed
37
+
38
+ - fixed segfault during `String` manipulation on Ruby 3.2.0-dev
39
+ - improved performance for `String` manipulation
40
+ - allow usage in Ractors
41
+ - predefined sets must be pre-initialized for this, though
42
+ - e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
43
+ - call them once in the main Ractor to trigger initialization
44
+
45
+ ## [1.5.0] - 2021-12-05
46
+
47
+ ### Added
48
+
49
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
50
+ - latest unicode case-folding data (for `#case_insensitive`)
51
+ - support for passing any Enumerable to `#disjoint?`, `#intersect?`
52
+ - this matches recent broadening of these methods in `ruby/set`
53
+ - new instance method `#secure_token` (see README)
54
+ - class method `::of` now accepts more than one `String`
55
+ - `CharacterSet::ExpressionConverter` can now build output of any Set-like class
56
+
57
+ ### Fixed
58
+
59
+ - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
60
+ - it used to return a regular `CharacterSet`
61
+
62
+ ## [1.4.1] - 2020-01-10
63
+
64
+ ### Fixed
65
+ - multiple fixes for Ruby 3
66
+ - fixed segfault for some `String` manipulation cases
67
+ - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
68
+ - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
69
+
7
70
  ## [1.4.0] - 2019-06-07
8
71
 
9
72
  ### Added
@@ -23,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
23
86
  - reduced memory consumption by > 90% for most use cases via dynamic resizing
24
87
  - before, every set instance required 136 KB for codepoints
25
88
  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
26
- - `#count_in` and `#scan_in` methods for `String` interaction
89
+ - `#count_in` and `#scan` methods for `String` interaction
27
90
  - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
28
91
  - conversion methods `#assigned_part`, `#valid_part`
29
92
  - sectioning methods `#ascii_part`, `#plane(n)`
data/Gemfile CHANGED
@@ -4,3 +4,18 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
5
  # Specify your gem's dependencies in character_set.gemspec
6
6
  gemspec
7
+
8
+ gem 'benchmark-ips', '~> 2.7'
9
+ gem 'get_process_mem', '~> 0.2.3'
10
+ gem 'rake', '~> 13.1'
11
+ gem 'rake-compiler', '~> 1.1'
12
+ gem 'range_compressor', '~> 1.0'
13
+ gem 'regexp_parser', '~> 2.9'
14
+ gem 'regexp_property_values', '~> 1.5'
15
+ gem 'rspec', '~> 3.8'
16
+ gem 'warning', '~> 1.3'
17
+ if RUBY_VERSION.to_f >= 3.0
18
+ gem 'gouteur', '~> 1.0.0'
19
+ gem 'rubocop', '~> 1.59'
20
+ gem 'simplecov-cobertura', require: false
21
+ end
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2018 Janosch Müller
3
+ Copyright (c) 2018-2023 Janosch Müller
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -1,18 +1,21 @@
1
1
  # CharacterSet
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
- [![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
5
- [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
4
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
6
+ [![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set)
6
7
 
7
- This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints.
8
9
 
9
- It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
10
+ It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
11
+
12
+ It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
10
13
 
11
14
  Many parts can be used independently, e.g.:
12
15
  - `CharacterSet::Character`
16
+ - `CharacterSet::ExpressionConverter`
13
17
  - `CharacterSet::Parser`
14
18
  - `CharacterSet::Writer`
15
- - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
16
19
 
17
20
  ## Usage
18
21
 
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
40
43
  CharacterSet.parse('\U00000061-\U00000063')
41
44
  ```
42
45
 
43
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
46
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
44
47
 
45
48
  ```ruby
49
+ CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
46
50
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
47
51
 
48
52
  require 'character_set/core_ext/regexp_ext'
@@ -92,7 +96,7 @@ string # => ''
92
96
 
93
97
  ```ruby
94
98
  CharacterSet.non_ascii.count_in('Tüür') # => 2
95
- CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
99
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
96
100
  ```
97
101
 
98
102
  There is also a core extension for String interaction.
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
143
147
  ```
144
148
 
145
149
  ### Write
150
+
146
151
  ```ruby
147
152
  set = CharacterSet['a', 'b', 'c', 'j', '-']
148
153
 
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
181
186
  # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
182
187
  ```
183
188
 
184
- ### Unicode plane methods
189
+ ### Other features
190
+
191
+ #### Secure tokens
192
+
193
+ Generate secure random strings of characters from a set:
194
+
195
+ ```ruby
196
+ CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
197
+ CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
198
+ ```
199
+
200
+ #### Unicode planes
185
201
 
186
202
  There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
187
203
  ```Ruby
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
198
214
  CharacterSet::Character.new('a').plane # => 0
199
215
  ```
200
216
 
201
- ### Contributions
217
+ ## Contributions
202
218
 
203
219
  Feel free to send suggestions, point out issues, or submit pull requests.
data/Rakefile CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
3
3
  require 'rubygems/package_task'
4
4
  require 'rake/extensiontask'
5
5
 
6
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
7
+
6
8
  RSpec::Core::RakeTask.new(:spec)
7
9
 
8
10
  task default: :spec
@@ -34,126 +36,6 @@ end
34
36
 
35
37
  task package: 'java:gem'
36
38
 
37
- desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
38
- task :sync_ruby_spec do
39
- require 'fileutils'
40
-
41
- variants = {
42
- 'CharacterSet' => './spec/ruby-spec/library/character_set',
43
- 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
44
- }
45
-
46
- # download fresh specs from ruby/spec repository
47
- variants.each do |_, dir|
48
- FileUtils.rm_rf(dir) if File.exist?(dir)
49
- `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
50
- end
51
-
52
- # make copies for each CharacterSet variant
53
- base = variants.first[1]
54
- variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
55
-
56
- # adapt specs to work with CharacterSet
57
- variants.each do |class_name, dir|
58
- Dir["#{dir}/**/*.rb"].each do |spec|
59
- # ignore some tests that do not apply or are covered otherwise
60
- if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
61
- File.delete(spec)
62
- next
63
- end
64
-
65
- adapted_content =
66
- File.read(spec).
67
- # adapt class name
68
- gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
- gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
- # get shared specs from a single shared dir at the parent level
71
- gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
- # make 'mspec' syntax rspec-compatible
73
- gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
- gsub(/be_(false|true)/, 'be \1').
75
- gsub('stub!', 'stub').
76
- gsub('mock', 'double').
77
- gsub('@method', 'method').
78
- # remove unneeded requires
79
- gsub(/require 'set'\n/, '').
80
- gsub(/require.*spec_helper.*\n/, '').
81
- gsub(/\A\n+/, '').
82
- # make examples use Integers/codepoints
83
- gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
- gsub('"one"', '1').
85
- gsub('"two"', '2').
86
- gsub('"three"', '3').
87
- gsub('"four"', '4').
88
- gsub('"five"', '5').
89
- gsub(/x.(size|length) == 3/, 'x != 3').
90
- gsub(/x.(size|length) != 3/, 'x == 3').
91
- gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
92
-
93
- File.open(spec, 'w') { |f| f.puts adapted_content }
94
- end
95
- end
96
-
97
- # keep only one copy of the shared specs, at the parent level
98
- FileUtils.rm_rf(base + '/../shared')
99
- FileUtils.mv(base + '/shared', base + '/../')
100
- variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
101
- end
102
-
103
- desc 'Download unicode casefold data and write new C header file'
104
- task :sync_casefold_data do
105
- src_path = './CaseFolding.txt'
106
- dst_path = './ext/character_set/unicode_casefold_table.h'
107
-
108
- `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
109
-
110
- mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
111
- from, type, to = line.split(/\s*;\s*/).first(3)
112
- # type 'C' stands for 'common', excludes mappings to multiple chars
113
- hash[from] = to if type == 'C'
114
- end.sort
115
-
116
- content = File.read(dst_path + '.tmpl')
117
- .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
- .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
119
-
120
- File.write(dst_path, content)
121
- File.unlink(src_path)
122
- end
123
-
124
- desc 'Update codepoint data for predefined sets, based on Onigmo'
125
- task :sync_predefined_sets do
126
- %w[assigned emoji whitespace].each do |prop|
127
- require 'regexp_property_values'
128
- ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
- File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
- end
132
- end
133
-
134
- desc 'Run all IPS benchmarks'
135
- task :benchmark do
136
- Dir['./benchmarks/*.rb'].sort.each { |file| require file }
137
- end
138
-
139
- namespace :benchmark do
140
- desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
141
- task :write_to_file do
142
- $store_comparison_results = {}
143
-
144
- Rake.application[:benchmark].invoke
145
-
146
- File.open('BENCHMARK.md', 'w') do |f|
147
- f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
-
149
- $store_comparison_results.each do |caption, result|
150
- f.puts '```', caption, '',
151
- result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
152
- end
153
- end
154
- end
155
- end
156
-
157
39
  unless RUBY_PLATFORM =~ /java/
158
40
  # recompile before benchmarking or running specs
159
41
  task(:benchmark).enhance([:compile])
data/character_set.gemspec CHANGED
@@ -21,14 +21,4 @@ Gem::Specification.new do |s|
21
21
  s.extensions = %w[ext/character_set/extconf.rb]
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
-
25
- s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
- s.add_development_dependency 'codecov', '~> 0.1'
27
- s.add_development_dependency 'get_process_mem', '~> 0.2.3'
28
- s.add_development_dependency 'rake', '~> 12.0'
29
- s.add_development_dependency 'rake-compiler', '~> 1.0'
30
- s.add_development_dependency 'range_compressor', '~> 1.0'
31
- s.add_development_dependency 'regexp_parser', '~> 1.3'
32
- s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
33
- s.add_development_dependency 'rspec', '~> 3.8'
34
24
  end