character_set 1.1.1 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +88 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aa3f78dd78741bc520526d54554d8294cfb81283670b8b975c519c6197dfa6f8
4
- data.tar.gz: 1617d3f9133688337ec7490fb0a7db3797609dc9fe9f3ea5c6bfa680310e766d
3
+ metadata.gz: 7a91fd10258c312d27d3fa84f99f1a97168d12ca08a3911fe31485565a999246
4
+ data.tar.gz: 2f16c02b72302259bccda6f2bf731950bd6dc8c679af8812c414ac313f1d8fc2
5
5
  SHA512:
6
- metadata.gz: 816d5ac8bd2459a4c9080a3b1c3409f8de17c5e9847a196b01cbf2b5b4d753554a5d9fb78a891ee6bed97df92d217cc6ee230bb4f595e5ead569ee80a7385f3d
7
- data.tar.gz: afd506628f34b4dadfd375e73ae23af69cd59f6423f77139357c9e71df1c7dd852b32c28e998ee38d9bc4ce2e7b726863ee41039ceede3c1f3f0058aad6e1f39
6
+ metadata.gz: cab6e94ec0a7efc2f26eba33dd1b4d5af639905d23422ec61420411325832a998c07359a4bf50c24379ec4550784ebc6da0effec4c917e7859392345ce9b8db0
7
+ data.tar.gz: a2dc319a9f8085e85624f25cc6f12dc03992b50f3f1a8d2000e1b69dadfdc4219c887452bdffbb213a91e1cad2011f237f604aa6fdb7e93243304d22fb5adfa3
@@ -0,0 +1,3 @@
1
+ *.cps linguist-detectable=false
2
+ benchmarks/* linguist-detectable=false
3
+ spec/ruby-spec/* linguist-vendored
@@ -0,0 +1,29 @@
1
+ # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2
+
3
+ name: rubocop linting
4
+
5
+ on: [push, pull_request]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby
14
+ uses: ruby/setup-ruby@v1
15
+ with:
16
+ ruby-version: 2.7
17
+ - name: Cache gems
18
+ uses: actions/cache@v1
19
+ with:
20
+ path: vendor/bundle
21
+ key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
22
+ restore-keys: |
23
+ ${{ runner.os }}-rubocop-
24
+ - name: Install gems
25
+ run: |
26
+ bundle config path vendor/bundle
27
+ bundle install --jobs 4 --retry 3
28
+ - name: Run rubocop
29
+ run: bundle exec rubocop --lint
@@ -0,0 +1,22 @@
1
+ name: tests
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ strategy:
10
+ matrix:
11
+ ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
12
+
13
+ steps:
14
+ - uses: actions/checkout@v2
15
+ - name: Set up Ruby ${{ matrix.ruby }}
16
+ uses: ruby/setup-ruby@v1
17
+ with:
18
+ ruby-version: ${{ matrix.ruby }}
19
+ - name: Install dependencies
20
+ run: bundle install --jobs 4
21
+ - name: Test with Rake
22
+ run: bundle exec rake
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .vscode
18
19
  bbin/
19
20
  binstubs/*
20
21
  bundler_stubs/*/.yardoc
@@ -0,0 +1,11 @@
1
+ AllCops:
2
+ Exclude:
3
+ - '**/doc/*'
4
+ - '**/pkg/*'
5
+ - '**/spec/ruby-spec/**/*'
6
+ - '**/vendor/**/*' # vendored dependencies
7
+ NewCops: enable
8
+ RubyInterpreters:
9
+ - ruby
10
+ - rake
11
+ TargetRubyVersion: 2.4 # really 2.1, but 2.4 is lowest supported by rubocop
@@ -1,50 +1,86 @@
1
- Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
1
+ Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
2
2
 
3
+ ```
4
+ Counting non-letters
5
+
6
+ CharacterSet#count_in: 9472902.2 i/s
7
+ String#count: 2221799.9 i/s - 4.26x slower
8
+ ```
3
9
  ```
4
10
  Detecting non-whitespace
5
11
 
6
- CharacterSet#cover?: 13244577.7 i/s
7
- Regexp#match?: 8027017.5 i/s - 1.65x slower
12
+ CharacterSet#cover?: 12388427.2 i/s
13
+ Regexp#match?: 7901676.8 i/s - 1.57x slower
8
14
  ```
9
15
  ```
10
16
  Detecting non-letters
11
17
 
12
- CharacterSet#cover?: 13082940.8 i/s
13
- Regexp#match?: 5372589.2 i/s - 2.44x slower
18
+ CharacterSet#cover?: 12263689.1 i/s
19
+ Regexp#match?: 4940889.9 i/s - 2.48x slower
14
20
  ```
15
21
  ```
16
22
  Removing whitespace
17
23
 
18
- CharacterSet#delete_in: 389315.6 i/s
19
- String#gsub: 223773.5 i/s - 1.74x slower
24
+ CharacterSet#delete_in: 2406722.6 i/s
25
+ String#gsub: 235760.3 i/s - 10.21x slower
20
26
  ```
21
27
  ```
22
28
  Removing whitespace, emoji and umlauts
23
29
 
24
- CharacterSet#delete_in: 470239.3 i/s
25
- String#gsub: 278679.4 i/s - 1.69x slower
30
+ CharacterSet#delete_in: 1653607.6 i/s
31
+ String#gsub: 272782.9 i/s - 6.06x slower
26
32
  ```
27
33
  ```
28
34
  Removing non-whitespace
29
35
 
30
- CharacterSet#keep_in: 1138461.0 i/s
31
- String#gsub: 235287.4 i/s - 4.84x slower
36
+ CharacterSet#keep_in: 2671038.2 i/s
37
+ String#gsub: 242551.0 i/s - 11.01x slower
32
38
  ```
33
39
  ```
34
40
  Extracting emoji
35
41
 
36
- CharacterSet#keep_in: 1474472.0 i/s
37
- String#gsub: 212269.6 i/s - 6.95x slower
42
+ CharacterSet#keep_in: 1726496.5 i/s
43
+ String#gsub: 215609.2 i/s - 8.01x slower
44
+ ```
45
+ ```
46
+ Extracting emoji to an Array
47
+
48
+ CharacterSet#scan: 2373856.1 i/s
49
+ String#scan: 480000.5 i/s - 4.95x slower
38
50
  ```
39
51
  ```
40
52
  Detecting whitespace
41
53
 
42
- CharacterSet#used_by?: 13063108.7 i/s
43
- Regexp#match?: 7215075.0 i/s - 1.81x slower
54
+ CharacterSet#used_by?: 11988328.7 i/s
55
+ Regexp#match?: 6758146.8 i/s - 1.77x slower
44
56
  ```
45
57
  ```
46
58
  Detecting emoji in a large string
47
59
 
48
- CharacterSet#used_by?: 246527.7 i/s
49
- Regexp#match?: 92956.5 i/s - 2.65x slower
60
+ CharacterSet#used_by?: 288223.3 i/s
61
+ Regexp#match?: 102384.2 i/s - 2.82x slower
62
+ ```
63
+ ```
64
+ Adding entries
65
+
66
+ CharacterSet#add: 2538251.2 i/s
67
+ SortedSet#add: 443925.9 i/s - 5.72x slower
68
+ ```
69
+ ```
70
+ Removing entries
71
+
72
+ CharacterSet#delete: 2487620.8 i/s
73
+ SortedSet#delete: 628816.1 i/s - 3.96x slower
74
+ ```
75
+ ```
76
+ Merging entries
77
+
78
+ CharacterSet#merge: 551.6 i/s
79
+ SortedSet#merge: 1.4 i/s - 393.59x slower
80
+ ```
81
+ ```
82
+ Getting the min and max
83
+
84
+ CharacterSet#minmax: 636890.7 i/s
85
+ SortedSet#minmax: 254.1 i/s - 2506.20x slower
50
86
  ```
@@ -4,6 +4,53 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [1.4.1] - 2021-01-10
8
+
9
+ ### Fixed
10
+ - multiple fixes for Ruby 3
11
+ - fixed segfault for some `String` manipulation cases
12
+ - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
13
+ - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
14
+
15
+ ## [1.4.0] - 2019-06-07
16
+
17
+ ### Added
18
+ - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
19
+ - allows for much shorter astral plane representations e.g. in JavaScript
20
+ - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
21
+ - improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
22
+
23
+ ### Fixed
24
+ - '/' is now escaped by default when stringifying so as to work with //-regexp syntax
25
+
26
+ ## [1.3.0] - 2019-04-26
27
+
28
+ ### Added
29
+ - improved `String` manipulation speed
30
+ - improved initialization and `#merge` speed when passing a large `Range`
31
+ - reduced memory consumption by > 90% for most use cases via dynamic resizing
32
+ - before, every set instance required 136 KB for codepoints
33
+ - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
34
+ - `#count_in` and `#scan_in` methods for `String` interaction
35
+ - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
36
+ - conversion methods `#assigned_part`, `#valid_part`
37
+ - sectioning methods `#ascii_part`, `#plane(n)`
38
+ - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
39
+
40
+ ### Fixed
41
+ - `#count` now supports passing an argument or block as usual
42
+ - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
43
+
44
+ ## [1.2.0] - 2019-04-02
45
+
46
+ ### Added
47
+ - added latest Unicode casefold data (for `#case_insensitive`)
48
+
49
+ ## [1.1.2] - 2018-09-25
50
+
51
+ ### Fixed
52
+ - restored `range_compressor` as a runtime dependency for JRuby only
53
+
7
54
  ## [1.1.1] - 2018-09-24
8
55
 
9
56
  ### Fixed
data/README.md CHANGED
@@ -1,15 +1,18 @@
1
1
  # CharacterSet
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
- [![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
4
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
5
6
 
6
- A gem to build, read, write and compare sets of Unicode codepoints.
7
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+
9
+ It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
7
10
 
8
11
  Many parts can be used independently, e.g.:
9
12
  - `CharacterSet::Character`
10
13
  - `CharacterSet::Parser`
11
14
  - `CharacterSet::Writer`
12
- - [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
15
+ - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
13
16
 
14
17
  ## Usage
15
18
 
@@ -37,7 +40,7 @@ CharacterSet.parse('[a-c]')
37
40
  CharacterSet.parse('\U00000061-\U00000063')
38
41
  ```
39
42
 
40
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
43
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
41
44
 
42
45
  ```ruby
43
46
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext'
49
52
 
50
53
  ### Predefined utility sets
51
54
 
52
- `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
55
+ `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
53
56
 
54
57
  ```ruby
55
58
  CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +63,7 @@ CharacterSet.non_ascii
60
63
 
61
64
  ### Interact with Strings
62
65
 
63
- CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
66
+ `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
64
67
 
65
68
  `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
66
69
 
@@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true
71
74
  ```
72
75
 
73
76
  `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
77
+
74
78
  ```ruby
75
79
  string = 'Tüür'
76
80
 
@@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
84
88
  string # => ''
85
89
  ```
86
90
 
91
+ `#count_in` and `#scan_in` can replace `String#count` and `String#scan`:
92
+
93
+ ```ruby
94
+ CharacterSet.non_ascii.count_in('Tüür') # => 2
95
+ CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
96
+ ```
97
+
87
98
  There is also a core extension for String interaction.
88
99
  ```ruby
89
100
  require 'character_set/core_ext/string_ext'
@@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext'
100
111
 
101
112
  ### Manipulate
102
113
 
103
- Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
114
+ Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
104
115
 
105
116
  Where appropriate, methods take both chars and codepoints, e.g.:
106
117
 
@@ -122,13 +133,13 @@ non_a.include?('ü') # => true
122
133
 
123
134
  # surrogate pair halves are not included by default
124
135
  CharacterSet['a'].inversion(include_surrogates: true)
125
- # => #<CharacterSet (size: 1114111)>
136
+ # => #<CharacterSet (size: 1114112)>
126
137
  ```
127
138
 
128
139
  `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
129
140
 
130
141
  ```ruby
131
- CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
142
+ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
132
143
  ```
133
144
 
134
145
  ### Write
@@ -156,20 +167,33 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
156
167
  # disable abbreviation (grouping of codepoints in ranges)
157
168
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
158
169
 
159
- # for full js regex compatibility in case of astral members:
160
- set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
161
- ```
170
+ # astral members require some trickery if we want to target environments
171
+ # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
172
+ set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
162
173
 
163
- Note: If you run JRuby or another Ruby without C support, you will also need to install [`range_compressor`](https://github.com/janosch-x/range_compressor) for these writing operations.
174
+ # Use #to_s_with_surrogate_ranges e.g. for JavaScript:
175
+ set.to_s_with_surrogate_ranges
176
+ # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
177
+
178
+ # Or use #to_s_with_surrogate_alternation if such surrogate set pairs
179
+ # don't work in your target environment:
180
+ set.to_s_with_surrogate_alternation
181
+ # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
182
+ ```
164
183
 
165
184
  ### Unicode plane methods
166
185
 
167
- There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
186
+ There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
168
187
  ```Ruby
188
+ CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
189
+ CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
190
+ CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
191
+ CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
169
192
  CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
170
193
  CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
171
194
  CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
172
195
  CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
196
+ CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
173
197
  CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
174
198
  CharacterSet::Character.new('a').plane # => 0
175
199
  ```
data/Rakefile CHANGED
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
7
7
 
8
8
  task default: :spec
9
9
 
10
+ namespace :spec do
11
+ task :quick do
12
+ ENV['SKIP_MEMSAFETY_SPECS'] = '1'
13
+ Rake::Task[:spec].invoke
14
+ end
15
+ end
16
+
10
17
  Rake::ExtensionTask.new('character_set') do |ext|
11
18
  ext.lib_dir = 'lib/character_set'
12
19
  end
@@ -16,6 +23,8 @@ namespace :java do
16
23
  java_gemspec.platform = 'java'
17
24
  java_gemspec.extensions = []
18
25
 
26
+ java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0'
27
+
19
28
  Gem::PackageTask.new(java_gemspec) do |pkg|
20
29
  pkg.need_zip = true
21
30
  pkg.need_tar = true
@@ -33,43 +42,62 @@ task :sync_ruby_spec do
33
42
  'CharacterSet' => './spec/ruby-spec/library/character_set',
34
43
  'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
35
44
  }
45
+
46
+ # download fresh specs from ruby/spec repository
36
47
  variants.each do |_, dir|
37
48
  FileUtils.rm_rf(dir) if File.exist?(dir)
38
49
  `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
39
50
  end
40
51
 
52
+ # make copies for each CharacterSet variant
41
53
  base = variants.first[1]
42
54
  variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
43
55
 
44
- variants.each.with_index do |(class_name, dir), i|
56
+ # adapt specs to work with CharacterSet
57
+ variants.each do |class_name, dir|
45
58
  Dir["#{dir}/**/*.rb"].each do |spec|
46
- # remove some tests that do not apply or are covered otherwise
47
- if spec =~ %r{/(flatten|initialize|pretty_print)}
59
+ # ignore some tests that do not apply or are covered otherwise
60
+ if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
48
61
  File.delete(spec)
49
62
  next
50
63
  end
51
64
 
52
- # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
53
- # and `i` added to shared example names or they'll override each other
54
65
  adapted_content =
55
- File
56
- .read(spec)
57
- .gsub('SortedSet', class_name)
58
- .gsub('sorted_set_', "sorted_set_#{i}_")
59
- .gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
60
- .gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
61
- .gsub('"one"', '1')
62
- .gsub('"two"', '2')
63
- .gsub('"three"', '3')
64
- .gsub('"four"', '4')
65
- .gsub('"five"', '5')
66
- .gsub('@method', 'method')
67
- .gsub(/be_(false|true)/, 'be \1')
68
- .gsub('mock', 'double')
66
+ File.read(spec).
67
+ # adapt class name
68
+ gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
+ gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
+ # get shared specs from a single shared dir at the parent level
71
+ gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
+ # make 'mspec' syntax rspec-compatible
73
+ gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
+ gsub(/be_(false|true)/, 'be \1').
75
+ gsub('stub!', 'stub').
76
+ gsub('mock', 'double').
77
+ gsub('@method', 'method').
78
+ # remove unneeded requires
79
+ gsub(/require 'set'\n/, '').
80
+ gsub(/require.*spec_helper.*\n/, '').
81
+ gsub(/\A\n+/, '').
82
+ # make examples use Integers/codepoints
83
+ gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
+ gsub('"one"', '1').
85
+ gsub('"two"', '2').
86
+ gsub('"three"', '3').
87
+ gsub('"four"', '4').
88
+ gsub('"five"', '5').
89
+ gsub(/x.(size|length) == 3/, 'x != 3').
90
+ gsub(/x.(size|length) != 3/, 'x == 3').
91
+ gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
69
92
 
70
93
  File.open(spec, 'w') { |f| f.puts adapted_content }
71
94
  end
72
95
  end
96
+
97
+ # keep only one copy of the shared specs, at the parent level
98
+ FileUtils.rm_rf(base + '/../shared')
99
+ FileUtils.mv(base + '/shared', base + '/../')
100
+ variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
73
101
  end
74
102
 
75
103
  desc 'Download unicode casefold data and write new C header file'
@@ -85,26 +113,22 @@ task :sync_casefold_data do
85
113
  hash[from] = to if type == 'C'
86
114
  end.sort
87
115
 
88
- File.open(dst_path, 'w') do |f|
89
- f.puts <<-C
90
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
91
-
92
- typedef struct casefold_mapping {
93
- unsigned long from;
94
- unsigned long to;
95
- } casefold_mapping;
116
+ content = File.read(dst_path + '.tmpl')
117
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
96
119
 
97
- #define CASEFOLD_COUNT #{mapping.size}
98
-
99
- static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
100
- C
101
-
102
- mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
120
+ File.write(dst_path, content)
121
+ File.unlink(src_path)
122
+ end
103
123
 
104
- f.puts '};'
124
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
125
+ task :sync_predefined_sets do
126
+ %w[assigned emoji whitespace].each do |prop|
127
+ require 'regexp_property_values'
128
+ ranges = RegexpPropertyValues[prop].matched_ranges
129
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
+ File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
105
131
  end
106
-
107
- File.unlink(src_path)
108
132
  end
109
133
 
110
134
  desc 'Run all IPS benchmarks'