character_set 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d2e4067480e00d5d03db2bbd1ee4f222f936e0f2
|
4
|
+
data.tar.gz: 0e4c0bc6cf393b1a81dc368ee86f94d0dea10a82
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d9150168393512190a496ed10af91a1eaa49eb2a01d3fb623de9586eb4fbd354dfea172bf6174ab180f6620ae6ca13a01f94ec26a95fbf118f48f611b4d7acd7
|
7
|
+
data.tar.gz: cb4b067fae5c8a550267a0dcef7708b30d36598b2ed18981711ad9b4a67b23cbf444270f7006d160e50f151ba32fe3402108429d415f7adbfb0be9160fedfda7
|
data/.gitignore
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
*.bundle
|
2
|
+
*.gem
|
3
|
+
*.iml
|
4
|
+
*.stTheme.cache
|
5
|
+
*.sublime-project
|
6
|
+
*.sublime-workspace
|
7
|
+
*.swp
|
8
|
+
*.tmlanguage.cache
|
9
|
+
*.tmPreferences.cache
|
10
|
+
*~
|
11
|
+
.byebug_history
|
12
|
+
.DS_Store
|
13
|
+
.idea/
|
14
|
+
.ruby-gemset
|
15
|
+
.ruby-version
|
16
|
+
.tags
|
17
|
+
.tags1
|
18
|
+
bbin/
|
19
|
+
binstubs/*
|
20
|
+
bundler_stubs/*/.yardoc
|
21
|
+
Gemfile.lock
|
22
|
+
/.bundle/
|
23
|
+
/_yardoc/
|
24
|
+
/coverage/
|
25
|
+
/doc/
|
26
|
+
/pkg/
|
27
|
+
/spec/reports/
|
28
|
+
/tmp/
|
29
|
+
|
30
|
+
# rspec failure tracking
|
31
|
+
.rspec_status
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/BENCHMARK.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
|
2
|
+
|
3
|
+
```
|
4
|
+
Detecting non-whitespace
|
5
|
+
|
6
|
+
CharacterSet#cover?: 13244577.7 i/s
|
7
|
+
Regexp#match?: 8027017.5 i/s - 1.65x slower
|
8
|
+
```
|
9
|
+
```
|
10
|
+
Detecting non-letters
|
11
|
+
|
12
|
+
CharacterSet#cover?: 13082940.8 i/s
|
13
|
+
Regexp#match?: 5372589.2 i/s - 2.44x slower
|
14
|
+
```
|
15
|
+
```
|
16
|
+
Removing whitespace
|
17
|
+
|
18
|
+
CharacterSet#delete_in: 389315.6 i/s
|
19
|
+
String#gsub: 223773.5 i/s - 1.74x slower
|
20
|
+
```
|
21
|
+
```
|
22
|
+
Removing whitespace, emoji and umlauts
|
23
|
+
|
24
|
+
CharacterSet#delete_in: 470239.3 i/s
|
25
|
+
String#gsub: 278679.4 i/s - 1.69x slower
|
26
|
+
```
|
27
|
+
```
|
28
|
+
Removing non-whitespace
|
29
|
+
|
30
|
+
CharacterSet#keep_in: 1138461.0 i/s
|
31
|
+
String#gsub: 235287.4 i/s - 4.84x slower
|
32
|
+
```
|
33
|
+
```
|
34
|
+
Extracting emoji
|
35
|
+
|
36
|
+
CharacterSet#keep_in: 1474472.0 i/s
|
37
|
+
String#gsub: 212269.6 i/s - 6.95x slower
|
38
|
+
```
|
39
|
+
```
|
40
|
+
Detecting whitespace
|
41
|
+
|
42
|
+
CharacterSet#used_by?: 13063108.7 i/s
|
43
|
+
Regexp#match?: 7215075.0 i/s - 1.81x slower
|
44
|
+
```
|
45
|
+
```
|
46
|
+
Detecting emoji in a large string
|
47
|
+
|
48
|
+
CharacterSet#used_by?: 246527.7 i/s
|
49
|
+
Regexp#match?: 92956.5 i/s - 2.65x slower
|
50
|
+
```
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2018 Janosch Müller
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
# CharacterSet
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
|
4
|
+
[![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
|
5
|
+
|
6
|
+
A gem to build, read, write and compare sets of Unicode codepoints.
|
7
|
+
|
8
|
+
Many parts can be used independently, e.g.:
|
9
|
+
- `CharacterSet::Character`
|
10
|
+
- `CharacterSet::Parser`
|
11
|
+
- `CharacterSet::Writer`
|
12
|
+
- [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
### Parse/Initialize
|
17
|
+
|
18
|
+
These all produce a `CharacterSet` containing `a`, `b` and `c`:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
CharacterSet['a', 'b', 'c']
|
22
|
+
CharacterSet[97, 98, 99]
|
23
|
+
CharacterSet.new('a'..'c')
|
24
|
+
CharacterSet.new(0x61..0x63)
|
25
|
+
CharacterSet.of('abacababa')
|
26
|
+
CharacterSet.parse('[a-c]')
|
27
|
+
CharacterSet.parse('\U00000061-\U00000063')
|
28
|
+
```
|
29
|
+
|
30
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
# are there any non-digit ascii chars classified as emoji?
|
34
|
+
set = CharacterSet.of_regexp(/[\D&&[:ascii:]&&\p{emoji}]/)
|
35
|
+
|
36
|
+
# ... of course there are!
|
37
|
+
set.to_a(stringify: true) # => ["#", "*"]
|
38
|
+
|
39
|
+
# with the core extension:
|
40
|
+
require 'character_set/core_ext/regexp_ext'
|
41
|
+
/[a-e&&[^c]]/.character_set # => CharacterSet['a', 'b', 'd', 'e']
|
42
|
+
```
|
43
|
+
|
44
|
+
### Common utility sets
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
CharacterSet.ascii
|
48
|
+
CharacterSet.bmp
|
49
|
+
CharacterSet.crypt
|
50
|
+
CharacterSet.emoji
|
51
|
+
CharacterSet.newline
|
52
|
+
CharacterSet.unicode
|
53
|
+
CharacterSet.url_fragment
|
54
|
+
CharacterSet.url_host
|
55
|
+
CharacterSet.url_path
|
56
|
+
CharacterSet.url_query
|
57
|
+
CharacterSet.whitespace
|
58
|
+
|
59
|
+
# e.g.
|
60
|
+
CharacterSet.url_query.cover?('?a=(b$c;)') # => true
|
61
|
+
CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"]
|
62
|
+
|
63
|
+
# all can be prefixed with `non_`, e.g.
|
64
|
+
CharacterSet.non_ascii.delete_in(string)
|
65
|
+
```
|
66
|
+
|
67
|
+
### Interact with Strings
|
68
|
+
|
69
|
+
CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
|
70
|
+
|
71
|
+
`#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
CharacterSet.ascii.used_by?('Tüür') # => true
|
75
|
+
CharacterSet.ascii.cover?('Tüür') # => false
|
76
|
+
CharacterSet.ascii.cover?('Tr') # => true
|
77
|
+
```
|
78
|
+
|
79
|
+
`#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
|
80
|
+
```ruby
|
81
|
+
string = 'Tüür'
|
82
|
+
|
83
|
+
CharacterSet.ascii.delete_in(string) # => 'üü'
|
84
|
+
CharacterSet.ascii.keep_in(string) # => 'Tr'
|
85
|
+
string # => 'Tüür'
|
86
|
+
|
87
|
+
CharacterSet.ascii.delete_in!(string) # => 'üü'
|
88
|
+
string # => 'üü'
|
89
|
+
CharacterSet.ascii.keep_in!(string) # => ''
|
90
|
+
string # => ''
|
91
|
+
```
|
92
|
+
|
93
|
+
There is also a core extension for String interaction.
|
94
|
+
```ruby
|
95
|
+
require 'character_set/core_ext/string_ext'
|
96
|
+
|
97
|
+
"a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"]
|
98
|
+
"a\rb".uses_character_set?(CharacterSet.emoji) # => false
|
99
|
+
"a\rb".covered_by_character_set?(CharacterSet.newline) # => false
|
100
|
+
"a\rb".delete_character_set(CharacterSet.newline) # => 'ab'
|
101
|
+
# etc.
|
102
|
+
```
|
103
|
+
|
104
|
+
### Manipulate
|
105
|
+
|
106
|
+
Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
|
107
|
+
|
108
|
+
Where appropriate, methods take both chars and codepoints, e.g.:
|
109
|
+
|
110
|
+
```ruby
|
111
|
+
CharacterSet['a'].add('b') # => CharacterSet['a', 'b']
|
112
|
+
CharacterSet['a'].add(98) # => CharacterSet['a', 'b']
|
113
|
+
CharacterSet['a'].include?('a') # => true
|
114
|
+
CharacterSet['a'].include?(0x61) # => true
|
115
|
+
```
|
116
|
+
|
117
|
+
`#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set:
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
non_a = CharacterSet['a'].inversion
|
121
|
+
# => #<CharacterSet (size: 1112063)>
|
122
|
+
|
123
|
+
non_a.include?('a') # => false
|
124
|
+
non_a.include?('ü') # => true
|
125
|
+
|
126
|
+
# surrogate pair halves are not included by default
|
127
|
+
CharacterSet['a'].inversion(include_surrogates: true)
|
128
|
+
# => #<CharacterSet (size: 1114111)>
|
129
|
+
```
|
130
|
+
|
131
|
+
`#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
|
132
|
+
|
133
|
+
```ruby
|
134
|
+
CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
135
|
+
```
|
136
|
+
|
137
|
+
### Write
|
138
|
+
```ruby
|
139
|
+
set = CharacterSet['a', 'b', 'c', 'j', '-']
|
140
|
+
|
141
|
+
# safely printable ASCII chars are not escaped by default
|
142
|
+
set.to_s # => 'a-cj\x2D'
|
143
|
+
set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D'
|
144
|
+
|
145
|
+
# brackets may be added
|
146
|
+
set.to_s(in_brackets: true) # => '[a-cj\x2D]'
|
147
|
+
|
148
|
+
# the default escape format is Ruby/ES6 compatible, others are available
|
149
|
+
set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩']
|
150
|
+
set.to_s # => 'a-c\u0258\u{1F929}'
|
151
|
+
set.to_s(format: 'U+') # => 'a-cU+0258U+1F929'
|
152
|
+
set.to_s(format: 'Python') # => "a-c\u0258\U0001F929"
|
153
|
+
set.to_s(format: 'raw') # => 'a-cɘ🤩'
|
154
|
+
|
155
|
+
# or pass a block
|
156
|
+
set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]"
|
157
|
+
set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
|
158
|
+
|
159
|
+
# disable abbreviation (grouping of codepoints in ranges)
|
160
|
+
set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
|
161
|
+
|
162
|
+
# for full js regex compatibility in case of astral members:
|
163
|
+
set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
|
164
|
+
```
|
165
|
+
|
166
|
+
### Unicode plane methods
|
167
|
+
|
168
|
+
There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
169
|
+
```ruby
|
170
|
+
CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
|
171
|
+
CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
|
172
|
+
CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
|
173
|
+
CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
|
174
|
+
CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
175
|
+
CharacterSet::Character.new('a').plane # => 0
|
176
|
+
```
|
177
|
+
|
178
|
+
### Contributions
|
179
|
+
|
180
|
+
Feel free to send suggestions, point out issues, or submit pull requests.
|
data/Rakefile
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rspec/core/rake_task'
|
3
|
+
require 'rubygems/package_task'
|
4
|
+
require 'rake/extensiontask'
|
5
|
+
|
6
|
+
# `rake spec` runs the RSpec suite; a bare `rake` does the same.
RSpec::Core::RakeTask.new(:spec)
task default: :spec

# Extension task for `character_set`, with lib/character_set as its lib dir.
Rake::ExtensionTask.new('character_set') { |extension| extension.lib_dir = 'lib/character_set' }

# Packaging tasks for the 'java' platform variant of the gem.
namespace :java do
  # Evaluate the regular gemspec, then retarget the resulting specification:
  # platform becomes 'java' and the list of extensions is emptied.
  java_spec = eval(File.read('./character_set.gemspec')).tap do |spec|
    spec.platform   = 'java'
    spec.extensions = []
  end

  Gem::PackageTask.new(java_spec) do |package|
    package.need_zip    = true
    package.need_tar    = true
    package.package_dir = 'pkg'
  end
end

# `rake package` also depends on the java gem task.
task package: 'java:gem'
|
27
|
+
|
28
|
+
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
task :sync_ruby_spec do
  require 'fileutils'

  # class name under test => directory that holds its adapted specs
  spec_dirs = {
    'CharacterSet'       => './spec/ruby-spec/library/character_set',
    'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
  }

  # replace each directory with a fresh export of the upstream SortedSet specs
  spec_dirs.each_value do |target|
    FileUtils.rm_rf(target) if File.exist?(target)
    `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{target}`
  end

  # every other variant then receives a copy of the first variant's directory
  first_dir = spec_dirs.values.first
  spec_dirs.each_value do |target|
    next if target == first_dir

    FileUtils.copy_entry(first_dir, target)
  end

  spec_dirs.each_with_index do |(class_name, target), index|
    Dir["#{target}/**/*.rb"].each do |path|
      if path =~ %r{/(flatten|initialize|pretty_print)}
        # these tests do not apply or are covered otherwise
        File.delete(path)
      else
        # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
        # and the variant index added to shared example names or they'll
        # override each other
        adapted = File.read(path)
        adapted = adapted.gsub('SortedSet', class_name)
        adapted = adapted.gsub('sorted_set_', "sorted_set_#{index}_")
        adapted = adapted.gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
        adapted = adapted.gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
        # "one" .. "five" become the integers 1 .. 5
        %w[one two three four five].each_with_index do |word, offset|
          adapted = adapted.gsub(%("#{word}"), (offset + 1).to_s)
        end
        adapted = adapted.gsub('@method', 'method')
        adapted = adapted.gsub(/be_(false|true)/, 'be \1')
        adapted = adapted.gsub('mock', 'double')

        File.open(path, 'w') { |file| file.puts adapted }
      end
    end
  end
end
|
74
|
+
|
75
|
+
desc 'Download unicode casefold data and write new C header file'
task :sync_casefold_data do
  src_path = './CaseFolding.txt'
  dst_path = './ext/character_set/unicode_casefold_table.h'

  # Fetched over HTTPS: the downloaded data is written into the extension's
  # C header below, so it should not travel over plain HTTP.
  `wget https://www.unicode.org/Public/UNIDATA/CaseFolding.txt`

  # Build sorted [from, to] pairs of hex strings. Lines are split on ';'
  # and only the first three fields are used.
  # NOTE(review): `sort` compares the hex strings lexicographically, so
  # 5-digit codepoints are not in numeric order relative to 4-digit ones -
  # confirm the C lookup does not rely on numeric ordering.
  mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
    from, type, to = line.split(/\s*;\s*/).first(3)
    # type 'C' stands for 'common', excludes mappings to multiple chars
    hash[from] = to if type == 'C'
  end.sort

  File.open(dst_path, 'w') do |f|
    # Header banner, struct definition and table opening. The banner used to
    # end with a stray "'" that was copied verbatim into the generated file.
    f.puts <<-C
// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT

typedef struct casefold_mapping {
  unsigned long from;
  unsigned long to;
} casefold_mapping;

#define CASEFOLD_COUNT #{mapping.size}

static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
    C

    # one initializer per mapping, e.g. {0x0041,0x0061},
    mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }

    f.puts '};'
  end

  # the downloaded source file is only needed while generating the header
  File.unlink(src_path)
end
|
109
|
+
|
110
|
+
desc 'Run all IPS benchmarks'
task :benchmark do
  # requiring a benchmark script runs it; scripts are loaded in sorted order
  Dir['./benchmarks/*.rb'].sort.each do |script|
    require script
  end
end

namespace :benchmark do
  desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
  task :write_to_file do
    # a non-nil value makes the benchmark helper store each comparison here,
    # keyed by caption
    $store_comparison_results = {}

    Rake::Task[:benchmark].invoke

    File.open('BENCHMARK.md', 'w') do |out|
      out.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}"
      out.puts ''

      $store_comparison_results.each do |caption, result|
        # cut everything after "same-ish" on a line and drop the first line
        comparison = result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1]
        out.puts '```', caption, '', comparison, '```'
      end
    end
  end
end

if RUBY_PLATFORM !~ /java/
  # recompile before benchmarking or running specs
  %i[benchmark spec].each { |name| task(name).enhance([:compile]) }
end
|
data/benchmarks/cover.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
# Case 1: Regexp#match? with /\S/ vs. #cover? on the inverted whitespace set.
plain_text     = 'Lorem ipsum et dolorem'
non_space_rx   = /\S/
non_space_set  = CharacterSet.whitespace.inversion
non_space_runs = {
  'Regexp#match?'       => lambda { non_space_rx.match?(plain_text) },
  'CharacterSet#cover?' => lambda { non_space_set.cover?(plain_text) },
}

benchmark(caption: 'Detecting non-whitespace', cases: non_space_runs)

# Case 2: Regexp#match? with /[^a-z]/i vs. #cover? on the set A-Z + a-z.
letter_text    = 'Lorem ipsum et dolorem'
non_letter_rx  = /[^a-z]/i
letter_set     = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z')
letter_runs    = {
  'Regexp#match?'       => lambda { non_letter_rx.match?(letter_text) },
  'CharacterSet#cover?' => lambda { letter_set.cover?(letter_text) },
}

benchmark(caption: 'Detecting non-letters', cases: letter_runs)
|
data/benchmarks/delete_in.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
# Case 1: String#gsub with /\s/ vs. CharacterSet#delete_in with the
# whitespace set.
str = 'Lorem ipsum et dolorem'
rx = /\s/
cs = CharacterSet.whitespace

benchmark(
  caption: 'Removing whitespace',
  cases: {
    'String#gsub'            => -> { str.gsub(rx, '') },
    'CharacterSet#delete_in' => -> { cs.delete_in(str) },
  }
)

# Case 2: same comparison with a larger set (whitespace, emoji, umlauts).
# The full constant name is used here: the short `CS` alias is set up in
# bin/console, which this script does not load.
str = 'Lörem ipsüm ⛷ et dölörem'
rx = /[\s\p{emoji}äüö]/
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ü', 'ö']

benchmark(
  caption: 'Removing whitespace, emoji and umlauts',
  cases: {
    'String#gsub'            => -> { str.gsub(rx, '') },
    'CharacterSet#delete_in' => -> { cs.delete_in(str) },
  }
)
|
data/benchmarks/keep_in.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
# Case 1: String#gsub removing /\S/ matches vs. #keep_in with the
# whitespace set.
plain_text     = 'Lorem ipsum et dolorem'
non_space_rx   = /\S/
whitespace_set = CharacterSet.whitespace
whitespace_runs = {
  'String#gsub'          => lambda { plain_text.gsub(non_space_rx, '') },
  'CharacterSet#keep_in' => lambda { whitespace_set.keep_in(plain_text) },
}

benchmark(caption: 'Removing non-whitespace', cases: whitespace_runs)

# Case 2: String#gsub removing /\p{^emoji}/ matches vs. #keep_in with the
# emoji set.
emoji_text   = 'Lorem ipsum ⛷ et dolorem'
non_emoji_rx = /\p{^emoji}/
emoji_set    = CharacterSet.emoji
emoji_runs   = {
  'String#gsub'          => lambda { emoji_text.gsub(non_emoji_rx, '') },
  'CharacterSet#keep_in' => lambda { emoji_set.keep_in(emoji_text) },
}

benchmark(caption: 'Extracting emoji', cases: emoji_runs)
|
data/benchmarks/shared.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
lib = File.expand_path('../lib', __dir__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
|
4
|
+
require 'benchmark/ips'
|
5
|
+
require 'character_set'
|
6
|
+
|
7
|
+
# Prints +caption+, then runs a Benchmark.ips comparison of +cases+.
#
# caption - text printed before the run; also used as the key under which
#           the comparison output is stored (see below).
# cases   - Hash mapping a report label to the callable to measure.
#
# If the global $store_comparison_results is set, `run_comparison` is called
# on the report with $stdout redirected into a StringIO, and the captured
# text is stored in that global under +caption+. Otherwise the method
# returns nil right after the benchmark run.
def benchmark(caption: nil, cases: {})
  puts caption

  report = Benchmark.ips do |x|
    cases.each do |label, callable|
      x.report(label, &callable)
    end
    x.compare!
  end

  return unless $store_comparison_results

  # Keep the real $stdout object (not a clone) and restore it in `ensure`,
  # so an error inside run_comparison cannot leave output redirected.
  original_stdout = $stdout
  captured_stdout = StringIO.new
  begin
    $stdout = captured_stdout
    report.run_comparison
  ensure
    $stdout = original_stdout
  end

  $store_comparison_results[caption] = captured_stdout.string
end
|
data/benchmarks/used_by.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
# Case 1: Regexp#match? with /\s/ vs. #used_by? on the whitespace set.
plain_text      = 'Lorem ipsum et dolorem'
whitespace_rx   = /\s/
whitespace_set  = CharacterSet.whitespace
whitespace_runs = {
  'Regexp#match?'         => lambda { whitespace_rx.match?(plain_text) },
  'CharacterSet#used_by?' => lambda { whitespace_set.used_by?(plain_text) },
}

benchmark(caption: 'Detecting whitespace', cases: whitespace_runs)

# Case 2: one emoji placed between two 20-fold repetitions of the sample text.
filler     = 'Lorem ipsum et dolorem' * 20
long_text  = filler + '⛷' + filler
emoji_rx   = /\p{emoji}/
emoji_set  = CharacterSet.emoji
emoji_runs = {
  'Regexp#match?'         => lambda { emoji_rx.match?(long_text) },
  'CharacterSet#used_by?' => lambda { emoji_set.used_by?(long_text) },
}

benchmark(caption: 'Detecting emoji in a large string', cases: emoji_runs)
|
data/bin/console
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
|
5
|
+
require 'character_set'
|
6
|
+
require 'character_set/core_ext'
|
7
|
+
require 'character_set/pure'
|
8
|
+
|
9
|
+
require 'regexp_property_values'
|
10
|
+
|
11
|
+
CS = CharacterSet
|
12
|
+
CP = CharacterSet::Pure
|
13
|
+
PV = RegexpPropertyValues
|
14
|
+
|
15
|
+
require 'benchmark'
|
16
|
+
# Console shortcut: passes the given block to Benchmark.measure and returns
# its result.
def m(&block)
  Benchmark.measure(&block)
end
|
17
|
+
|
18
|
+
require "irb"
|
19
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/character_set.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
|
4
|
+
require 'character_set/version'
|
5
|
+
|
6
|
+
# Gem specification for character_set.
Gem::Specification.new do |spec|
  spec.name     = 'character_set'
  spec.version  = CharacterSet::VERSION
  spec.authors  = ['Janosch Müller']
  spec.email    = ['janosch84@gmail.com']

  spec.summary  = 'Build, read, write and compare sets of Unicode codepoints.'
  spec.homepage = 'https://github.com/janosch-x/character_set'
  spec.license  = 'MIT'

  # ship every git-tracked file except those under test/, spec/ or features/
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.require_paths = ['lib']

  spec.extensions = %w[ext/character_set/extconf.rb]

  spec.required_ruby_version = '>= 2.1.0'

  spec.add_dependency 'range_compressor', '~> 1.0'

  # development-only dependencies, declared in the listed order
  {
    'benchmark-ips'          => '~> 2.7',
    'bundler'                => '~> 1.16',
    'rake'                   => '~> 10.0',
    'rake-compiler'          => '~> 1.0',
    'regexp_parser'          => '~> 1.0',
    'regexp_property_values' => '~> 0.3.2',
    'rspec'                  => '~> 3.0',
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|