character_set 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d2e4067480e00d5d03db2bbd1ee4f222f936e0f2
|
4
|
+
data.tar.gz: 0e4c0bc6cf393b1a81dc368ee86f94d0dea10a82
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d9150168393512190a496ed10af91a1eaa49eb2a01d3fb623de9586eb4fbd354dfea172bf6174ab180f6620ae6ca13a01f94ec26a95fbf118f48f611b4d7acd7
|
7
|
+
data.tar.gz: cb4b067fae5c8a550267a0dcef7708b30d36598b2ed18981711ad9b4a67b23cbf444270f7006d160e50f151ba32fe3402108429d415f7adbfb0be9160fedfda7
|
data/.gitignore
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
*.bundle
|
2
|
+
*.gem
|
3
|
+
*.iml
|
4
|
+
*.stTheme.cache
|
5
|
+
*.sublime-project
|
6
|
+
*.sublime-workspace
|
7
|
+
*.swp
|
8
|
+
*.tmlanguage.cache
|
9
|
+
*.tmPreferences.cache
|
10
|
+
*~
|
11
|
+
.byebug_history
|
12
|
+
.DS_Store
|
13
|
+
.idea/
|
14
|
+
.ruby-gemset
|
15
|
+
.ruby-version
|
16
|
+
.tags
|
17
|
+
.tags1
|
18
|
+
bbin/
|
19
|
+
binstubs/*
|
20
|
+
bundler_stubs/*/.yardoc
|
21
|
+
Gemfile.lock
|
22
|
+
/.bundle/
|
23
|
+
/_yardoc/
|
24
|
+
/coverage/
|
25
|
+
/doc/
|
26
|
+
/pkg/
|
27
|
+
/spec/reports/
|
28
|
+
/tmp/
|
29
|
+
|
30
|
+
# rspec failure tracking
|
31
|
+
.rspec_status
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/BENCHMARK.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
|
2
|
+
|
3
|
+
```
|
4
|
+
Detecting non-whitespace
|
5
|
+
|
6
|
+
CharacterSet#cover?: 13244577.7 i/s
|
7
|
+
Regexp#match?: 8027017.5 i/s - 1.65x slower
|
8
|
+
```
|
9
|
+
```
|
10
|
+
Detecting non-letters
|
11
|
+
|
12
|
+
CharacterSet#cover?: 13082940.8 i/s
|
13
|
+
Regexp#match?: 5372589.2 i/s - 2.44x slower
|
14
|
+
```
|
15
|
+
```
|
16
|
+
Removing whitespace
|
17
|
+
|
18
|
+
CharacterSet#delete_in: 389315.6 i/s
|
19
|
+
String#gsub: 223773.5 i/s - 1.74x slower
|
20
|
+
```
|
21
|
+
```
|
22
|
+
Removing whitespace, emoji and umlauts
|
23
|
+
|
24
|
+
CharacterSet#delete_in: 470239.3 i/s
|
25
|
+
String#gsub: 278679.4 i/s - 1.69x slower
|
26
|
+
```
|
27
|
+
```
|
28
|
+
Removing non-whitespace
|
29
|
+
|
30
|
+
CharacterSet#keep_in: 1138461.0 i/s
|
31
|
+
String#gsub: 235287.4 i/s - 4.84x slower
|
32
|
+
```
|
33
|
+
```
|
34
|
+
Extracting emoji
|
35
|
+
|
36
|
+
CharacterSet#keep_in: 1474472.0 i/s
|
37
|
+
String#gsub: 212269.6 i/s - 6.95x slower
|
38
|
+
```
|
39
|
+
```
|
40
|
+
Detecting whitespace
|
41
|
+
|
42
|
+
CharacterSet#used_by?: 13063108.7 i/s
|
43
|
+
Regexp#match?: 7215075.0 i/s - 1.81x slower
|
44
|
+
```
|
45
|
+
```
|
46
|
+
Detecting emoji in a large string
|
47
|
+
|
48
|
+
CharacterSet#used_by?: 246527.7 i/s
|
49
|
+
Regexp#match?: 92956.5 i/s - 2.65x slower
|
50
|
+
```
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2018 Janosch Müller
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
# CharacterSet
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
|
4
|
+
[![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
|
5
|
+
|
6
|
+
A gem to build, read, write and compare sets of Unicode codepoints.
|
7
|
+
|
8
|
+
Many parts can be used independently, e.g.:
|
9
|
+
- `CharacterSet::Character`
|
10
|
+
- `CharacterSet::Parser`
|
11
|
+
- `CharacterSet::Writer`
|
12
|
+
- [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
### Parse/Initialize
|
17
|
+
|
18
|
+
These all produce a `CharacterSet` containing `a`, `b` and `c`:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
CharacterSet['a', 'b', 'c']
|
22
|
+
CharacterSet[97, 98, 99]
|
23
|
+
CharacterSet.new('a'..'c')
|
24
|
+
CharacterSet.new(0x61..0x63)
|
25
|
+
CharacterSet.of('abacababa')
|
26
|
+
CharacterSet.parse('[a-c]')
|
27
|
+
CharacterSet.parse('\U00000061-\U00000063')
|
28
|
+
```
|
29
|
+
|
30
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
# are there any non-digit ascii chars classified as emoji?
|
34
|
+
set = CharacterSet.of_regexp(/[\D&&[:ascii:]&&\p{emoji}]/)
|
35
|
+
|
36
|
+
# ... of course there are!
|
37
|
+
set.to_a(stringify: true) # => ["#", "*"]
|
38
|
+
|
39
|
+
# with the core extension:
|
40
|
+
require 'character_set/core_ext/regexp_ext'
|
41
|
+
/[a-e&&[^c]]/.character_set # => CharacterSet['a', 'b', 'd', 'e']
|
42
|
+
```
|
43
|
+
|
44
|
+
### Common utility sets
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
CharacterSet.ascii
|
48
|
+
CharacterSet.bmp
|
49
|
+
CharacterSet.crypt
|
50
|
+
CharacterSet.emoji
|
51
|
+
CharacterSet.newline
|
52
|
+
CharacterSet.unicode
|
53
|
+
CharacterSet.url_fragment
|
54
|
+
CharacterSet.url_host
|
55
|
+
CharacterSet.url_path
|
56
|
+
CharacterSet.url_query
|
57
|
+
CharacterSet.whitespace
|
58
|
+
|
59
|
+
# e.g.
|
60
|
+
CharacterSet.url_query.cover?('?a=(b$c;)') # => true
|
61
|
+
CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"]
|
62
|
+
|
63
|
+
# all can be prefixed with `non_`, e.g.
|
64
|
+
CharacterSet.non_ascii.delete_in(string)
|
65
|
+
```
|
66
|
+
|
67
|
+
### Interact with Strings
|
68
|
+
|
69
|
+
CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
|
70
|
+
|
71
|
+
`#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
CharacterSet.ascii.used_by?('Tüür') # => true
|
75
|
+
CharacterSet.ascii.cover?('Tüür') # => false
|
76
|
+
CharacterSet.ascii.cover?('Tr') # => true
|
77
|
+
```
|
78
|
+
|
79
|
+
`#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
|
80
|
+
```ruby
|
81
|
+
string = 'Tüür'
|
82
|
+
|
83
|
+
CharacterSet.ascii.delete_in(string) # => 'üü'
|
84
|
+
CharacterSet.ascii.keep_in(string) # => 'Tr'
|
85
|
+
string # => 'Tüür'
|
86
|
+
|
87
|
+
CharacterSet.ascii.delete_in!(string) # => 'üü'
|
88
|
+
string # => 'üü'
|
89
|
+
CharacterSet.ascii.keep_in!(string) # => ''
|
90
|
+
string # => ''
|
91
|
+
```
|
92
|
+
|
93
|
+
There is also a core extension for String interaction.
|
94
|
+
```ruby
|
95
|
+
require 'character_set/core_ext/string_ext'
|
96
|
+
|
97
|
+
"a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"]
|
98
|
+
"a\rb".uses_character_set?(CharacterSet.emoji) # => false
|
99
|
+
"a\rb".covered_by_character_set?(CharacterSet.newline) # => false
|
100
|
+
"a\rb".delete_character_set(CharacterSet.newline) # => 'ab'
|
101
|
+
# etc.
|
102
|
+
```
|
103
|
+
|
104
|
+
### Manipulate
|
105
|
+
|
106
|
+
Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
|
107
|
+
|
108
|
+
Where appropriate, methods take both chars and codepoints, e.g.:
|
109
|
+
|
110
|
+
```ruby
|
111
|
+
CharacterSet['a'].add('b') # => CharacterSet['a', 'b']
|
112
|
+
CharacterSet['a'].add(98) # => CharacterSet['a', 'b']
|
113
|
+
CharacterSet['a'].include?('a') # => true
|
114
|
+
CharacterSet['a'].include?(0x61) # => true
|
115
|
+
```
|
116
|
+
|
117
|
+
`#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set:
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
non_a = CharacterSet['a'].inversion
|
121
|
+
# => #<CharacterSet (size: 1112063)>
|
122
|
+
|
123
|
+
non_a.include?('a') # => false
|
124
|
+
non_a.include?('ü') # => true
|
125
|
+
|
126
|
+
# surrogate pair halves are not included by default
|
127
|
+
CharacterSet['a'].inversion(include_surrogates: true)
|
128
|
+
# => #<CharacterSet (size: 1114111)>
|
129
|
+
```
|
130
|
+
|
131
|
+
`#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
|
132
|
+
|
133
|
+
```ruby
|
134
|
+
CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
135
|
+
```
|
136
|
+
|
137
|
+
### Write
|
138
|
+
```ruby
|
139
|
+
set = CharacterSet['a', 'b', 'c', 'j', '-']
|
140
|
+
|
141
|
+
# safely printable ASCII chars are not escaped by default
|
142
|
+
set.to_s # => 'a-cj\x2D'
|
143
|
+
set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D'
|
144
|
+
|
145
|
+
# brackets may be added
|
146
|
+
set.to_s(in_brackets: true) # => '[a-cj\x2D]'
|
147
|
+
|
148
|
+
# the default escape format is Ruby/ES6 compatible, others are available
|
149
|
+
set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩']
|
150
|
+
set.to_s # => 'a-c\u0258\u{1F929}'
|
151
|
+
set.to_s(format: 'U+') # => 'a-cU+0258U+1F929'
|
152
|
+
set.to_s(format: 'Python') # => "a-c\u0258\U0001F929"
|
153
|
+
set.to_s(format: 'raw') # => 'a-cɘ🤩'
|
154
|
+
|
155
|
+
# or pass a block
|
156
|
+
set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]"
|
157
|
+
set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
|
158
|
+
|
159
|
+
# disable abbreviation (grouping of codepoints in ranges)
|
160
|
+
set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
|
161
|
+
|
162
|
+
# for full js regex compatibility in case of astral members:
|
163
|
+
set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
|
164
|
+
```
|
165
|
+
|
166
|
+
### Unicode plane methods
|
167
|
+
|
168
|
+
There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
169
|
+
```ruby
|
170
|
+
CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
|
171
|
+
CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
|
172
|
+
CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
|
173
|
+
CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
|
174
|
+
CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
175
|
+
CharacterSet::Character.new('a').plane # => 0
|
176
|
+
```
|
177
|
+
|
178
|
+
### Contributions
|
179
|
+
|
180
|
+
Feel free to send suggestions, point out issues, or submit pull requests.
|
data/Rakefile
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rspec/core/rake_task'
|
3
|
+
require 'rubygems/package_task'
|
4
|
+
require 'rake/extensiontask'
|
5
|
+
|
6
|
+
RSpec::Core::RakeTask.new(:spec)
|
7
|
+
|
8
|
+
task default: :spec
|
9
|
+
|
10
|
+
Rake::ExtensionTask.new('character_set') do |ext|
|
11
|
+
ext.lib_dir = 'lib/character_set'
|
12
|
+
end
|
13
|
+
|
14
|
+
namespace :java do
|
15
|
+
java_gemspec = eval File.read('./character_set.gemspec')
|
16
|
+
java_gemspec.platform = 'java'
|
17
|
+
java_gemspec.extensions = []
|
18
|
+
|
19
|
+
Gem::PackageTask.new(java_gemspec) do |pkg|
|
20
|
+
pkg.need_zip = true
|
21
|
+
pkg.need_tar = true
|
22
|
+
pkg.package_dir = 'pkg'
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
task package: 'java:gem'
|
27
|
+
|
28
|
+
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
|
29
|
+
task :sync_ruby_spec do
|
30
|
+
require 'fileutils'
|
31
|
+
|
32
|
+
variants = {
|
33
|
+
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
34
|
+
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
35
|
+
}
|
36
|
+
variants.each do |_, dir|
|
37
|
+
FileUtils.rm_rf(dir) if File.exist?(dir)
|
38
|
+
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
39
|
+
end
|
40
|
+
|
41
|
+
base = variants.first[1]
|
42
|
+
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
43
|
+
|
44
|
+
variants.each.with_index do |(class_name, dir), i|
|
45
|
+
Dir["#{dir}/**/*.rb"].each do |spec|
|
46
|
+
# remove some tests that do not apply or are covered otherwise
|
47
|
+
if spec =~ %r{/(flatten|initialize|pretty_print)}
|
48
|
+
File.delete(spec)
|
49
|
+
next
|
50
|
+
end
|
51
|
+
|
52
|
+
# some examples w. Strings must be adapted, "mspec" made rspec-compatible,
|
53
|
+
# and `i` added to shared example names or they'll override each other
|
54
|
+
adapted_content =
|
55
|
+
File
|
56
|
+
.read(spec)
|
57
|
+
.gsub('SortedSet', class_name)
|
58
|
+
.gsub('sorted_set_', "sorted_set_#{i}_")
|
59
|
+
.gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
|
60
|
+
.gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
|
61
|
+
.gsub('"one"', '1')
|
62
|
+
.gsub('"two"', '2')
|
63
|
+
.gsub('"three"', '3')
|
64
|
+
.gsub('"four"', '4')
|
65
|
+
.gsub('"five"', '5')
|
66
|
+
.gsub('@method', 'method')
|
67
|
+
.gsub(/be_(false|true)/, 'be \1')
|
68
|
+
.gsub('mock', 'double')
|
69
|
+
|
70
|
+
File.open(spec, 'w') { |f| f.puts adapted_content }
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
# Regenerates ext/character_set/unicode_casefold_table.h from Unicode's
# CaseFolding.txt: downloads the file with wget, keeps only the type 'C'
# entries, and writes them as a C array of {from, to} structs.
# The downloaded source file is removed afterwards, even if a step fails.
desc 'Download unicode casefold data and write new C header file'
task :sync_casefold_data do
  src_path = './CaseFolding.txt'
  dst_path = './ext/character_set/unicode_casefold_table.h'

  begin
    `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`

    # Each data line looks like "<from>; <type>; <to>; # comment".
    mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
      from, type, to = line.split(/\s*;\s*/).first(3)
      # type 'C' stands for 'common', excludes mappings to multiple chars
      hash[from] = to if type == 'C'
    end.sort
    # NOTE(review): this sorts by the hex *string*, so 5-digit codepoints are
    # interleaved with 4-digit ones rather than ordered numerically. Confirm
    # that the C code does not rely on numeric ordering of the table.

    File.open(dst_path, 'w') do |f|
      f.puts <<-C
// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT

typedef struct casefold_mapping {
  unsigned long from;
  unsigned long to;
} casefold_mapping;

#define CASEFOLD_COUNT #{mapping.size}

static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
      C

      mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }

      f.puts '};'
    end
  ensure
    # Clean up the download on every exit path; skip if wget never created it
    # so that the original error is not masked by an ENOENT from unlink.
    File.unlink(src_path) if File.exist?(src_path)
  end
end
|
109
|
+
|
110
|
+
desc 'Run all IPS benchmarks'
|
111
|
+
task :benchmark do
|
112
|
+
Dir['./benchmarks/*.rb'].sort.each { |file| require file }
|
113
|
+
end
|
114
|
+
|
115
|
+
namespace :benchmark do
|
116
|
+
desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
|
117
|
+
task :write_to_file do
|
118
|
+
$store_comparison_results = {}
|
119
|
+
|
120
|
+
Rake.application[:benchmark].invoke
|
121
|
+
|
122
|
+
File.open('BENCHMARK.md', 'w') do |f|
|
123
|
+
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
124
|
+
|
125
|
+
$store_comparison_results.each do |caption, result|
|
126
|
+
f.puts '```', caption, '',
|
127
|
+
result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
unless RUBY_PLATFORM =~ /java/
|
134
|
+
# recompile before benchmarking or running specs
|
135
|
+
task(:benchmark).enhance([:compile])
|
136
|
+
task(:spec).enhance([:compile])
|
137
|
+
end
|
data/benchmarks/cover.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
4
|
+
rx = /\S/
|
5
|
+
cs = CharacterSet.whitespace.inversion
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Detecting non-whitespace',
|
9
|
+
cases: {
|
10
|
+
'Regexp#match?' => -> { rx.match?(str) },
|
11
|
+
'CharacterSet#cover?' => -> { cs.cover?(str) },
|
12
|
+
}
|
13
|
+
)
|
14
|
+
|
15
|
+
str = 'Lorem ipsum et dolorem'
|
16
|
+
rx = /[^a-z]/i
|
17
|
+
cs = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z')
|
18
|
+
|
19
|
+
benchmark(
|
20
|
+
caption: 'Detecting non-letters',
|
21
|
+
cases: {
|
22
|
+
'Regexp#match?' => -> { rx.match?(str) },
|
23
|
+
'CharacterSet#cover?' => -> { cs.cover?(str) },
|
24
|
+
}
|
25
|
+
)
|
data/benchmarks/delete_in.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
4
|
+
rx = /\s/
|
5
|
+
cs = CharacterSet.whitespace
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Removing whitespace',
|
9
|
+
cases: {
|
10
|
+
'String#gsub' => -> { str.gsub(rx, '') },
|
11
|
+
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
12
|
+
}
|
13
|
+
)
|
14
|
+
|
15
|
+
str = 'Lörem ipsüm ⛷ et dölörem'
|
16
|
+
rx = /[\s\p{emoji}äüö]/
|
17
|
+
# Uses the full constant name: the `CS` shorthand is not visibly defined for
# this script (benchmarks/shared.rb does not set it up).
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ü', 'ö']
|
18
|
+
|
19
|
+
benchmark(
|
20
|
+
caption: 'Removing whitespace, emoji and umlauts',
|
21
|
+
cases: {
|
22
|
+
'String#gsub' => -> { str.gsub(rx, '') },
|
23
|
+
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
24
|
+
}
|
25
|
+
)
|
data/benchmarks/keep_in.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
4
|
+
rx = /\S/
|
5
|
+
cs = CharacterSet.whitespace
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Removing non-whitespace',
|
9
|
+
cases: {
|
10
|
+
'String#gsub' => -> { str.gsub(rx, '') },
|
11
|
+
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
12
|
+
}
|
13
|
+
)
|
14
|
+
|
15
|
+
str = 'Lorem ipsum ⛷ et dolorem'
|
16
|
+
rx = /\p{^emoji}/
|
17
|
+
cs = CharacterSet.emoji
|
18
|
+
|
19
|
+
benchmark(
|
20
|
+
caption: 'Extracting emoji',
|
21
|
+
cases: {
|
22
|
+
'String#gsub' => -> { str.gsub(rx, '') },
|
23
|
+
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
24
|
+
}
|
25
|
+
)
|
data/benchmarks/shared.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
lib = File.expand_path('../lib', __dir__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
|
4
|
+
require 'benchmark/ips'
|
5
|
+
require 'character_set'
|
6
|
+
|
7
|
+
# Prints +caption+, then runs a Benchmark.ips comparison over +cases+.
#
# caption - heading printed before the run; also the key under which the
#           comparison text is stored (see below).
# cases   - Hash of label => callable; every callable is reported under its
#           label.
#
# If the global $store_comparison_results is set, report.run_comparison is
# invoked with $stdout redirected into a StringIO and the captured text is
# stored in $store_comparison_results[caption]. Otherwise returns right after
# the benchmark run.
def benchmark(caption: nil, cases: {})
  puts caption

  report = Benchmark.ips do |x|
    cases.each do |label, callable|
      x.report(label, &callable)
    end
    x.compare!
  end

  return unless $store_comparison_results

  # Keep a reference to the real stream instead of cloning it, so the very
  # same object is put back and no duplicate IO is created per call.
  old_stdout = $stdout
  captured_stdout = StringIO.new
  begin
    $stdout = captured_stdout
    report.run_comparison
  ensure
    # Always undo the redirection, even if the comparison raises.
    $stdout = old_stdout
  end

  $store_comparison_results[caption] = captured_stdout.string
end
|
data/benchmarks/used_by.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
4
|
+
rx = /\s/
|
5
|
+
cs = CharacterSet.whitespace
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Detecting whitespace',
|
9
|
+
cases: {
|
10
|
+
'Regexp#match?' => -> { rx.match?(str) },
|
11
|
+
'CharacterSet#used_by?' => -> { cs.used_by?(str) },
|
12
|
+
}
|
13
|
+
)
|
14
|
+
|
15
|
+
str = 'Lorem ipsum et dolorem' * 20 + '⛷' + 'Lorem ipsum et dolorem' * 20
|
16
|
+
rx = /\p{emoji}/
|
17
|
+
cs = CharacterSet.emoji
|
18
|
+
|
19
|
+
benchmark(
|
20
|
+
caption: 'Detecting emoji in a large string',
|
21
|
+
cases: {
|
22
|
+
'Regexp#match?' => -> { rx.match?(str) },
|
23
|
+
'CharacterSet#used_by?' => -> { cs.used_by?(str) },
|
24
|
+
}
|
25
|
+
)
|
data/bin/console
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
|
5
|
+
require 'character_set'
|
6
|
+
require 'character_set/core_ext'
|
7
|
+
require 'character_set/pure'
|
8
|
+
|
9
|
+
require 'regexp_property_values'
|
10
|
+
|
11
|
+
CS = CharacterSet
|
12
|
+
CP = CharacterSet::Pure
|
13
|
+
PV = RegexpPropertyValues
|
14
|
+
|
15
|
+
require 'benchmark'
|
16
|
+
def m(&block); Benchmark.measure(&block); end
|
17
|
+
|
18
|
+
require "irb"
|
19
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
|
4
|
+
require 'character_set/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = 'character_set'
|
8
|
+
s.version = CharacterSet::VERSION
|
9
|
+
s.authors = ['Janosch Müller']
|
10
|
+
s.email = ['janosch84@gmail.com']
|
11
|
+
|
12
|
+
s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
|
13
|
+
s.homepage = 'https://github.com/janosch-x/character_set'
|
14
|
+
s.license = 'MIT'
|
15
|
+
|
16
|
+
s.files = `git ls-files -z`.split("\x0").reject do |f|
|
17
|
+
f.match(%r{^(test|spec|features)/})
|
18
|
+
end
|
19
|
+
s.require_paths = ['lib']
|
20
|
+
|
21
|
+
s.extensions = %w[ext/character_set/extconf.rb]
|
22
|
+
|
23
|
+
s.required_ruby_version = '>= 2.1.0'
|
24
|
+
|
25
|
+
s.add_dependency 'range_compressor', '~> 1.0'
|
26
|
+
|
27
|
+
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
28
|
+
s.add_development_dependency 'bundler', '~> 1.16'
|
29
|
+
s.add_development_dependency 'rake', '~> 10.0'
|
30
|
+
s.add_development_dependency 'rake-compiler', '~> 1.0'
|
31
|
+
s.add_development_dependency 'regexp_parser', '~> 1.0'
|
32
|
+
s.add_development_dependency 'regexp_property_values', '~> 0.3.2'
|
33
|
+
s.add_development_dependency 'rspec', '~> 3.0'
|
34
|
+
end
|