character_set 1.1.1 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +88 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aa3f78dd78741bc520526d54554d8294cfb81283670b8b975c519c6197dfa6f8
4
- data.tar.gz: 1617d3f9133688337ec7490fb0a7db3797609dc9fe9f3ea5c6bfa680310e766d
3
+ metadata.gz: 7a91fd10258c312d27d3fa84f99f1a97168d12ca08a3911fe31485565a999246
4
+ data.tar.gz: 2f16c02b72302259bccda6f2bf731950bd6dc8c679af8812c414ac313f1d8fc2
5
5
  SHA512:
6
- metadata.gz: 816d5ac8bd2459a4c9080a3b1c3409f8de17c5e9847a196b01cbf2b5b4d753554a5d9fb78a891ee6bed97df92d217cc6ee230bb4f595e5ead569ee80a7385f3d
7
- data.tar.gz: afd506628f34b4dadfd375e73ae23af69cd59f6423f77139357c9e71df1c7dd852b32c28e998ee38d9bc4ce2e7b726863ee41039ceede3c1f3f0058aad6e1f39
6
+ metadata.gz: cab6e94ec0a7efc2f26eba33dd1b4d5af639905d23422ec61420411325832a998c07359a4bf50c24379ec4550784ebc6da0effec4c917e7859392345ce9b8db0
7
+ data.tar.gz: a2dc319a9f8085e85624f25cc6f12dc03992b50f3f1a8d2000e1b69dadfdc4219c887452bdffbb213a91e1cad2011f237f604aa6fdb7e93243304d22fb5adfa3
@@ -0,0 +1,3 @@
1
+ *.cps linguist-detectable=false
2
+ benchmarks/* linguist-detectable=false
3
+ spec/ruby-spec/* linguist-vendored
@@ -0,0 +1,29 @@
1
+ # based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
2
+
3
+ name: rubocop linting
4
+
5
+ on: [push, pull_request]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby
14
+ uses: ruby/setup-ruby@v1
15
+ with:
16
+ ruby-version: 2.7
17
+ - name: Cache gems
18
+ uses: actions/cache@v1
19
+ with:
20
+ path: vendor/bundle
21
+ key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
22
+ restore-keys: |
23
+ ${{ runner.os }}-rubocop-
24
+ - name: Install gems
25
+ run: |
26
+ bundle config path vendor/bundle
27
+ bundle install --jobs 4 --retry 3
28
+ - name: Run rubocop
29
+ run: bundle exec rubocop --lint
@@ -0,0 +1,22 @@
1
+ name: tests
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ strategy:
10
+ matrix:
11
+ ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
12
+
13
+ steps:
14
+ - uses: actions/checkout@v2
15
+ - name: Set up Ruby ${{ matrix.ruby }}
16
+ uses: ruby/setup-ruby@v1
17
+ with:
18
+ ruby-version: ${{ matrix.ruby }}
19
+ - name: Install dependencies
20
+ run: bundle install --jobs 4
21
+ - name: Test with Rake
22
+ run: bundle exec rake
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .vscode
18
19
  bbin/
19
20
  binstubs/*
20
21
  bundler_stubs/*/.yardoc
@@ -0,0 +1,11 @@
1
+ AllCops:
2
+ Exclude:
3
+ - '**/doc/*'
4
+ - '**/pkg/*'
5
+ - '**/spec/ruby-spec/**/*'
6
+ - '**/vendor/**/*' # vendored dependencies
7
+ NewCops: enable
8
+ RubyInterpreters:
9
+ - ruby
10
+ - rake
11
+ TargetRubyVersion: 2.4 # really 2.1, but 2.4 is lowest supported by rubocop
@@ -1,50 +1,86 @@
1
- Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
1
+ Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
2
2
 
3
+ ```
4
+ Counting non-letters
5
+
6
+ CharacterSet#count_in: 9472902.2 i/s
7
+ String#count: 2221799.9 i/s - 4.26x slower
8
+ ```
3
9
  ```
4
10
  Detecting non-whitespace
5
11
 
6
- CharacterSet#cover?: 13244577.7 i/s
7
- Regexp#match?: 8027017.5 i/s - 1.65x slower
12
+ CharacterSet#cover?: 12388427.2 i/s
13
+ Regexp#match?: 7901676.8 i/s - 1.57x slower
8
14
  ```
9
15
  ```
10
16
  Detecting non-letters
11
17
 
12
- CharacterSet#cover?: 13082940.8 i/s
13
- Regexp#match?: 5372589.2 i/s - 2.44x slower
18
+ CharacterSet#cover?: 12263689.1 i/s
19
+ Regexp#match?: 4940889.9 i/s - 2.48x slower
14
20
  ```
15
21
  ```
16
22
  Removing whitespace
17
23
 
18
- CharacterSet#delete_in: 389315.6 i/s
19
- String#gsub: 223773.5 i/s - 1.74x slower
24
+ CharacterSet#delete_in: 2406722.6 i/s
25
+ String#gsub: 235760.3 i/s - 10.21x slower
20
26
  ```
21
27
  ```
22
28
  Removing whitespace, emoji and umlauts
23
29
 
24
- CharacterSet#delete_in: 470239.3 i/s
25
- String#gsub: 278679.4 i/s - 1.69x slower
30
+ CharacterSet#delete_in: 1653607.6 i/s
31
+ String#gsub: 272782.9 i/s - 6.06x slower
26
32
  ```
27
33
  ```
28
34
  Removing non-whitespace
29
35
 
30
- CharacterSet#keep_in: 1138461.0 i/s
31
- String#gsub: 235287.4 i/s - 4.84x slower
36
+ CharacterSet#keep_in: 2671038.2 i/s
37
+ String#gsub: 242551.0 i/s - 11.01x slower
32
38
  ```
33
39
  ```
34
40
  Extracting emoji
35
41
 
36
- CharacterSet#keep_in: 1474472.0 i/s
37
- String#gsub: 212269.6 i/s - 6.95x slower
42
+ CharacterSet#keep_in: 1726496.5 i/s
43
+ String#gsub: 215609.2 i/s - 8.01x slower
44
+ ```
45
+ ```
46
+ Extracting emoji to an Array
47
+
48
+ CharacterSet#scan: 2373856.1 i/s
49
+ String#scan: 480000.5 i/s - 4.95x slower
38
50
  ```
39
51
  ```
40
52
  Detecting whitespace
41
53
 
42
- CharacterSet#used_by?: 13063108.7 i/s
43
- Regexp#match?: 7215075.0 i/s - 1.81x slower
54
+ CharacterSet#used_by?: 11988328.7 i/s
55
+ Regexp#match?: 6758146.8 i/s - 1.77x slower
44
56
  ```
45
57
  ```
46
58
  Detecting emoji in a large string
47
59
 
48
- CharacterSet#used_by?: 246527.7 i/s
49
- Regexp#match?: 92956.5 i/s - 2.65x slower
60
+ CharacterSet#used_by?: 288223.3 i/s
61
+ Regexp#match?: 102384.2 i/s - 2.82x slower
62
+ ```
63
+ ```
64
+ Adding entries
65
+
66
+ CharacterSet#add: 2538251.2 i/s
67
+ SortedSet#add: 443925.9 i/s - 5.72x slower
68
+ ```
69
+ ```
70
+ Removing entries
71
+
72
+ CharacterSet#delete: 2487620.8 i/s
73
+ SortedSet#delete: 628816.1 i/s - 3.96x slower
74
+ ```
75
+ ```
76
+ Merging entries
77
+
78
+ CharacterSet#merge: 551.6 i/s
79
+ SortedSet#merge: 1.4 i/s - 393.59x slower
80
+ ```
81
+ ```
82
+ Getting the min and max
83
+
84
+ CharacterSet#minmax: 636890.7 i/s
85
+ SortedSet#minmax: 254.1 i/s - 2506.20x slower
50
86
  ```
@@ -4,6 +4,53 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [1.4.1] - 2021-01-10
8
+
9
+ ### Fixed
10
+ - multiple fixes for Ruby 3
11
+ - fixed segfault for some `String` manipulation cases
12
+ - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
13
+ - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
14
+
15
+ ## [1.4.0] - 2019-06-07
16
+
17
+ ### Added
18
+ - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
19
+ - allows for much shorter astral plane representations e.g. in JavaScript
20
+ - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
21
+ - improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
22
+
23
+ ### Fixed
24
+ - '/' is now escaped by default when stringifying so as to work with //-regexp syntax
25
+
26
+ ## [1.3.0] - 2019-04-26
27
+
28
+ ### Added
29
+ - improved `String` manipulation speed
30
+ - improved initialization and `#merge` speed when passing a large `Range`
31
+ - reduced memory consumption by > 90% for most use cases via dynamic resizing
32
+ - before, every set instance required 136 KB for codepoints
33
+ - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
34
+ - `#count_in` and `#scan_in` methods for `String` interaction
35
+ - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
36
+ - conversion methods `#assigned_part`, `#valid_part`
37
+ - sectioning methods `#ascii_part`, `#plane(n)`
38
+ - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
39
+
40
+ ### Fixed
41
+ - `#count` now supports passing an argument or block as usual
42
+ - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
43
+
44
+ ## [1.2.0] - 2019-04-02
45
+
46
+ ### Added
47
+ - added latest Unicode casefold data (for `#case_insensitive`)
48
+
49
+ ## [1.1.2] - 2018-09-25
50
+
51
+ ### Fixed
52
+ - restored `range_compressor` as a runtime dependency for JRuby only
53
+
7
54
  ## [1.1.1] - 2018-09-24
8
55
 
9
56
  ### Fixed
data/README.md CHANGED
@@ -1,15 +1,18 @@
1
1
  # CharacterSet
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
- [![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
4
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
5
6
 
6
- A gem to build, read, write and compare sets of Unicode codepoints.
7
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+
9
+ It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
7
10
 
8
11
  Many parts can be used independently, e.g.:
9
12
  - `CharacterSet::Character`
10
13
  - `CharacterSet::Parser`
11
14
  - `CharacterSet::Writer`
12
- - [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
15
+ - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
13
16
 
14
17
  ## Usage
15
18
 
@@ -37,7 +40,7 @@ CharacterSet.parse('[a-c]')
37
40
  CharacterSet.parse('\U00000061-\U00000063')
38
41
  ```
39
42
 
40
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
43
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
41
44
 
42
45
  ```ruby
43
46
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext'
49
52
 
50
53
  ### Predefined utility sets
51
54
 
52
- `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
55
+ `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
53
56
 
54
57
  ```ruby
55
58
  CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +63,7 @@ CharacterSet.non_ascii
60
63
 
61
64
  ### Interact with Strings
62
65
 
63
- CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
66
+ `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
64
67
 
65
68
  `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
66
69
 
@@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true
71
74
  ```
72
75
 
73
76
  `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
77
+
74
78
  ```ruby
75
79
  string = 'Tüür'
76
80
 
@@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
84
88
  string # => ''
85
89
  ```
86
90
 
91
+ `#count_in` and `#scan_in` can replace `String#count` and `String#scan`:
92
+
93
+ ```ruby
94
+ CharacterSet.non_ascii.count_in('Tüür') # => 2
95
+ CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
96
+ ```
97
+
87
98
  There is also a core extension for String interaction.
88
99
  ```ruby
89
100
  require 'character_set/core_ext/string_ext'
@@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext'
100
111
 
101
112
  ### Manipulate
102
113
 
103
- Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
114
+ Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
104
115
 
105
116
  Where appropriate, methods take both chars and codepoints, e.g.:
106
117
 
@@ -122,13 +133,13 @@ non_a.include?('ü') # => true
122
133
 
123
134
  # surrogate pair halves are not included by default
124
135
  CharacterSet['a'].inversion(include_surrogates: true)
125
- # => #<CharacterSet (size: 1114111)>
136
+ # => #<CharacterSet (size: 1114112)>
126
137
  ```
127
138
 
128
139
  `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
129
140
 
130
141
  ```ruby
131
- CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
142
+ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
132
143
  ```
133
144
 
134
145
  ### Write
@@ -156,20 +167,33 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
156
167
  # disable abbreviation (grouping of codepoints in ranges)
157
168
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
158
169
 
159
- # for full js regex compatibility in case of astral members:
160
- set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
161
- ```
170
+ # astral members require some trickery if we want to target environments
171
+ # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
172
+ set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
162
173
 
163
- Note: If you run JRuby or another Ruby without C support, you will also need to install [`range_compressor`](https://github.com/janosch-x/range_compressor) for these writing operations.
174
+ # Use #to_s_with_surrogate_ranges e.g. for JavaScript:
175
+ set.to_s_with_surrogate_ranges
176
+ # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
177
+
178
+ # Or use #to_s_with_surrogate_alternation if such surrogate set pairs
179
+ # don't work in your target environment:
180
+ set.to_s_with_surrogate_alternation
181
+ # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
182
+ ```
164
183
 
165
184
  ### Unicode plane methods
166
185
 
167
- There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
186
+ There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
168
187
  ```Ruby
188
+ CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
189
+ CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
190
+ CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
191
+ CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
169
192
  CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
170
193
  CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
171
194
  CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
172
195
  CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
196
+ CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
173
197
  CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
174
198
  CharacterSet::Character.new('a').plane # => 0
175
199
  ```
data/Rakefile CHANGED
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
7
7
 
8
8
  task default: :spec
9
9
 
10
+ namespace :spec do
11
+ task :quick do
12
+ ENV['SKIP_MEMSAFETY_SPECS'] = '1'
13
+ Rake::Task[:spec].invoke
14
+ end
15
+ end
16
+
10
17
  Rake::ExtensionTask.new('character_set') do |ext|
11
18
  ext.lib_dir = 'lib/character_set'
12
19
  end
@@ -16,6 +23,8 @@ namespace :java do
16
23
  java_gemspec.platform = 'java'
17
24
  java_gemspec.extensions = []
18
25
 
26
+ java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0'
27
+
19
28
  Gem::PackageTask.new(java_gemspec) do |pkg|
20
29
  pkg.need_zip = true
21
30
  pkg.need_tar = true
@@ -33,43 +42,62 @@ task :sync_ruby_spec do
33
42
  'CharacterSet' => './spec/ruby-spec/library/character_set',
34
43
  'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
35
44
  }
45
+
46
+ # download fresh specs from ruby/spec repository
36
47
  variants.each do |_, dir|
37
48
  FileUtils.rm_rf(dir) if File.exist?(dir)
38
49
  `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
39
50
  end
40
51
 
52
+ # make copies for each CharacterSet variant
41
53
  base = variants.first[1]
42
54
  variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
43
55
 
44
- variants.each.with_index do |(class_name, dir), i|
56
+ # adapt specs to work with CharacterSet
57
+ variants.each do |class_name, dir|
45
58
  Dir["#{dir}/**/*.rb"].each do |spec|
46
- # remove some tests that do not apply or are covered otherwise
47
- if spec =~ %r{/(flatten|initialize|pretty_print)}
59
+ # ignore some tests that do not apply or are covered otherwise
60
+ if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
48
61
  File.delete(spec)
49
62
  next
50
63
  end
51
64
 
52
- # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
53
- # and `i` added to shared example names or they'll override each other
54
65
  adapted_content =
55
- File
56
- .read(spec)
57
- .gsub('SortedSet', class_name)
58
- .gsub('sorted_set_', "sorted_set_#{i}_")
59
- .gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
60
- .gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
61
- .gsub('"one"', '1')
62
- .gsub('"two"', '2')
63
- .gsub('"three"', '3')
64
- .gsub('"four"', '4')
65
- .gsub('"five"', '5')
66
- .gsub('@method', 'method')
67
- .gsub(/be_(false|true)/, 'be \1')
68
- .gsub('mock', 'double')
66
+ File.read(spec).
67
+ # adapt class name
68
+ gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
+ gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
+ # get shared specs from a single shared dir at the parent level
71
+ gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
+ # make 'mspec' syntax rspec-compatible
73
+ gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
+ gsub(/be_(false|true)/, 'be \1').
75
+ gsub('stub!', 'stub').
76
+ gsub('mock', 'double').
77
+ gsub('@method', 'method').
78
+ # remove unneeded requires
79
+ gsub(/require 'set'\n/, '').
80
+ gsub(/require.*spec_helper.*\n/, '').
81
+ gsub(/\A\n+/, '').
82
+ # make examples use Integers/codepoints
83
+ gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
+ gsub('"one"', '1').
85
+ gsub('"two"', '2').
86
+ gsub('"three"', '3').
87
+ gsub('"four"', '4').
88
+ gsub('"five"', '5').
89
+ gsub(/x.(size|length) == 3/, 'x != 3').
90
+ gsub(/x.(size|length) != 3/, 'x == 3').
91
+ gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
69
92
 
70
93
  File.open(spec, 'w') { |f| f.puts adapted_content }
71
94
  end
72
95
  end
96
+
97
+ # keep only one copy of the shared specs, at the parent level
98
+ FileUtils.rm_rf(base + '/../shared')
99
+ FileUtils.mv(base + '/shared', base + '/../')
100
+ variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
73
101
  end
74
102
 
75
103
  desc 'Download unicode casefold data and write new C header file'
@@ -85,26 +113,22 @@ task :sync_casefold_data do
85
113
  hash[from] = to if type == 'C'
86
114
  end.sort
87
115
 
88
- File.open(dst_path, 'w') do |f|
89
- f.puts <<-C
90
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
91
-
92
- typedef struct casefold_mapping {
93
- unsigned long from;
94
- unsigned long to;
95
- } casefold_mapping;
116
+ content = File.read(dst_path + '.tmpl')
117
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
96
119
 
97
- #define CASEFOLD_COUNT #{mapping.size}
98
-
99
- static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
100
- C
101
-
102
- mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
120
+ File.write(dst_path, content)
121
+ File.unlink(src_path)
122
+ end
103
123
 
104
- f.puts '};'
124
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
125
+ task :sync_predefined_sets do
126
+ %w[assigned emoji whitespace].each do |prop|
127
+ require 'regexp_property_values'
128
+ ranges = RegexpPropertyValues[prop].matched_ranges
129
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
+ File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
105
131
  end
106
-
107
- File.unlink(src_path)
108
132
  end
109
133
 
110
134
  desc 'Run all IPS benchmarks'