character_set 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d2e4067480e00d5d03db2bbd1ee4f222f936e0f2
4
+ data.tar.gz: 0e4c0bc6cf393b1a81dc368ee86f94d0dea10a82
5
+ SHA512:
6
+ metadata.gz: d9150168393512190a496ed10af91a1eaa49eb2a01d3fb623de9586eb4fbd354dfea172bf6174ab180f6620ae6ca13a01f94ec26a95fbf118f48f611b4d7acd7
7
+ data.tar.gz: cb4b067fae5c8a550267a0dcef7708b30d36598b2ed18981711ad9b4a67b23cbf444270f7006d160e50f151ba32fe3402108429d415f7adbfb0be9160fedfda7
data/.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ *.bundle
2
+ *.gem
3
+ *.iml
4
+ *.stTheme.cache
5
+ *.sublime-project
6
+ *.sublime-workspace
7
+ *.swp
8
+ *.tmlanguage.cache
9
+ *.tmPreferences.cache
10
+ *~
11
+ .byebug_history
12
+ .DS_Store
13
+ .idea/
14
+ .ruby-gemset
15
+ .ruby-version
16
+ .tags
17
+ .tags1
18
+ bbin/
19
+ binstubs/*
20
+ bundler_stubs/*/.yardoc
21
+ Gemfile.lock
22
+ /.bundle/
23
+ /_yardoc/
24
+ /coverage/
25
+ /doc/
26
+ /pkg/
27
+ /spec/reports/
28
+ /tmp/
29
+
30
+ # rspec failure tracking
31
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.1
5
+ - 2.4
6
+ - 2.5
7
+ - 2.6
8
+ - jruby-9.1.9.0
9
+ before_install:
10
+ - gem update --system
11
+ - gem install bundler
data/BENCHMARK.md ADDED
@@ -0,0 +1,50 @@
1
+ Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
2
+
3
+ ```
4
+ Detecting non-whitespace
5
+
6
+ CharacterSet#cover?: 13244577.7 i/s
7
+ Regexp#match?: 8027017.5 i/s - 1.65x slower
8
+ ```
9
+ ```
10
+ Detecting non-letters
11
+
12
+ CharacterSet#cover?: 13082940.8 i/s
13
+ Regexp#match?: 5372589.2 i/s - 2.44x slower
14
+ ```
15
+ ```
16
+ Removing whitespace
17
+
18
+ CharacterSet#delete_in: 389315.6 i/s
19
+ String#gsub: 223773.5 i/s - 1.74x slower
20
+ ```
21
+ ```
22
+ Removing whitespace, emoji and umlauts
23
+
24
+ CharacterSet#delete_in: 470239.3 i/s
25
+ String#gsub: 278679.4 i/s - 1.69x slower
26
+ ```
27
+ ```
28
+ Removing non-whitespace
29
+
30
+ CharacterSet#keep_in: 1138461.0 i/s
31
+ String#gsub: 235287.4 i/s - 4.84x slower
32
+ ```
33
+ ```
34
+ Extracting emoji
35
+
36
+ CharacterSet#keep_in: 1474472.0 i/s
37
+ String#gsub: 212269.6 i/s - 6.95x slower
38
+ ```
39
+ ```
40
+ Detecting whitespace
41
+
42
+ CharacterSet#used_by?: 13063108.7 i/s
43
+ Regexp#match?: 7215075.0 i/s - 1.81x slower
44
+ ```
45
+ ```
46
+ Detecting emoji in a large string
47
+
48
+ CharacterSet#used_by?: 246527.7 i/s
49
+ Regexp#match?: 92956.5 i/s - 2.65x slower
50
+ ```
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in character_set.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Janosch Müller
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,180 @@
1
+ # CharacterSet
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
+ [![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
5
+
6
+ A gem to build, read, write and compare sets of Unicode codepoints.
7
+
8
+ Many parts can be used independently, e.g.:
9
+ - `CharacterSet::Character`
10
+ - `CharacterSet::Parser`
11
+ - `CharacterSet::Writer`
12
+ - [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
13
+
14
+ ## Usage
15
+
16
+ ### Parse/Initialize
17
+
18
+ These all produce a `CharacterSet` containing `a`, `b` and `c`:
19
+
20
+ ```ruby
21
+ CharacterSet['a', 'b', 'c']
22
+ CharacterSet[97, 98, 99]
23
+ CharacterSet.new('a'..'c')
24
+ CharacterSet.new(0x61..0x63)
25
+ CharacterSet.of('abacababa')
26
+ CharacterSet.parse('[a-c]')
27
+ CharacterSet.parse('\U00000061-\U00000063')
28
+ ```
29
+
30
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting:
31
+
32
+ ```ruby
33
+ # are there any non-digit ascii chars classified as emoji?
34
+ set = CharacterSet.of_regexp(/[\D&&[:ascii:]&&\p{emoji}]/)
35
+
36
+ # ... of course there are!
37
+ set.to_a(stringify: true) # => ["#", "*"]
38
+
39
+ # with the core extension:
40
+ require 'character_set/core_ext/regexp_ext'
41
+ /[a-e&&[^c]]/.character_set # => CharacterSet['a', 'b', 'd', 'e']
42
+ ```
43
+
44
+ ### Common utility sets
45
+
46
+ ```ruby
47
+ CharacterSet.ascii
48
+ CharacterSet.bmp
49
+ CharacterSet.crypt
50
+ CharacterSet.emoji
51
+ CharacterSet.newline
52
+ CharacterSet.unicode
53
+ CharacterSet.url_fragment
54
+ CharacterSet.url_host
55
+ CharacterSet.url_path
56
+ CharacterSet.url_query
57
+ CharacterSet.whitespace
58
+
59
+ # e.g.
60
+ CharacterSet.url_query.cover?('?a=(b$c;)') # => true
61
+ CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"]
62
+
63
+ # all can be prefixed with `non_`, e.g.
64
+ CharacterSet.non_ascii.delete_in(string)
65
+ ```
66
+
67
+ ### Interact with Strings
68
+
69
+ CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
70
+
71
+ `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
72
+
73
+ ```ruby
74
+ CharacterSet.ascii.used_by?('Tüür') # => true
75
+ CharacterSet.ascii.cover?('Tüür') # => false
76
+ CharacterSet.ascii.cover?('Tr') # => true
77
+ ```
78
+
79
+ `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
80
+ ```ruby
81
+ string = 'Tüür'
82
+
83
+ CharacterSet.ascii.delete_in(string) # => 'üü'
84
+ CharacterSet.ascii.keep_in(string) # => 'Tr'
85
+ string # => 'Tüür'
86
+
87
+ CharacterSet.ascii.delete_in!(string) # => 'üü'
88
+ string # => 'üü'
89
+ CharacterSet.ascii.keep_in!(string) # => ''
90
+ string # => ''
91
+ ```
92
+
93
+ There is also a core extension for String interaction.
94
+ ```ruby
95
+ require 'character_set/core_ext/string_ext'
96
+
97
+ "a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"]
98
+ "a\rb".uses_character_set?(CharacterSet.emoji) # => false
99
+ "a\rb".covered_by_character_set?(CharacterSet.newline) # => false
100
+ "a\rb".delete_character_set(CharacterSet.newline) # => 'ab'
101
+ # etc.
102
+ ```
103
+
104
+ ### Manipulate
105
+
106
+ Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
107
+
108
+ Where appropriate, methods take both chars and codepoints, e.g.:
109
+
110
+ ```ruby
111
+ CharacterSet['a'].add('b') # => CharacterSet['a', 'b']
112
+ CharacterSet['a'].add(98) # => CharacterSet['a', 'b']
113
+ CharacterSet['a'].include?('a') # => true
114
+ CharacterSet['a'].include?(0x61) # => true
115
+ ```
116
+
117
+ `#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set:
118
+
119
+ ```ruby
120
+ non_a = CharacterSet['a'].inversion
121
+ # => #<CharacterSet (size: 1112063)>
122
+
123
+ non_a.include?('a') # => false
124
+ non_a.include?('ü') # => true
125
+
126
+ # surrogate pair halves are not included by default
127
+ CharacterSet['a'].inversion(include_surrogates: true)
128
+ # => #<CharacterSet (size: 1114111)>
129
+ ```
130
+
131
+ `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
132
+
133
+ ```ruby
134
+ CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
135
+ ```
136
+
137
+ ### Write
138
+ ```ruby
139
+ set = CharacterSet['a', 'b', 'c', 'j', '-']
140
+
141
+ # safely printable ASCII chars are not escaped by default
142
+ set.to_s # => 'a-cj\x2D'
143
+ set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D'
144
+
145
+ # brackets may be added
146
+ set.to_s(in_brackets: true) # => '[a-cj\x2D]'
147
+
148
+ # the default escape format is Ruby/ES6 compatible, others are available
149
+ set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩']
150
+ set.to_s # => 'a-c\u0258\u{1F929}'
151
+ set.to_s(format: 'U+') # => 'a-cU+0258U+1F929'
152
+ set.to_s(format: 'Python') # => "a-c\u0258\U0001F929"
153
+ set.to_s(format: 'raw') # => 'a-cɘ🤩'
154
+
155
+ # or pass a block
156
+ set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]"
157
+ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
158
+
159
+ # disable abbreviation (grouping of codepoints in ranges)
160
+ set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
161
+
162
+ # for full js regex compatibility in case of astral members:
163
+ set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
164
+ ```
165
+
166
+ ### Unicode plane methods
167
+
168
+ There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
169
+ ```ruby
170
+ CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
171
+ CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
172
+ CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
173
+ CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
174
+ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
175
+ CharacterSet::Character.new('a').plane # => 0
176
+ ```
177
+
178
+ ### Contributions
179
+
180
+ Feel free to send suggestions, point out issues, or submit pull requests.
data/Rakefile ADDED
@@ -0,0 +1,137 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+ require 'rubygems/package_task'
4
+ require 'rake/extensiontask'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
9
+
10
+ Rake::ExtensionTask.new('character_set') do |ext|
11
+ ext.lib_dir = 'lib/character_set'
12
+ end
13
+
14
+ namespace :java do
15
+ java_gemspec = eval File.read('./character_set.gemspec')
16
+ java_gemspec.platform = 'java'
17
+ java_gemspec.extensions = []
18
+
19
+ Gem::PackageTask.new(java_gemspec) do |pkg|
20
+ pkg.need_zip = true
21
+ pkg.need_tar = true
22
+ pkg.package_dir = 'pkg'
23
+ end
24
+ end
25
+
26
+ task package: 'java:gem'
27
+
28
+ desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
29
+ task :sync_ruby_spec do
30
+ require 'fileutils'
31
+
32
+ variants = {
33
+ 'CharacterSet' => './spec/ruby-spec/library/character_set',
34
+ 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
35
+ }
36
+ variants.each do |_, dir|
37
+ FileUtils.rm_rf(dir) if File.exist?(dir)
38
+ `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
39
+ end
40
+
41
+ base = variants.first[1]
42
+ variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
43
+
44
+ variants.each.with_index do |(class_name, dir), i|
45
+ Dir["#{dir}/**/*.rb"].each do |spec|
46
+ # remove some tests that do not apply or are covered otherwise
47
+ if spec =~ %r{/(flatten|initialize|pretty_print)}
48
+ File.delete(spec)
49
+ next
50
+ end
51
+
52
+ # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
53
+ # and `i` added to shared example names or they'll override each other
54
+ adapted_content =
55
+ File
56
+ .read(spec)
57
+ .gsub('SortedSet', class_name)
58
+ .gsub('sorted_set_', "sorted_set_#{i}_")
59
+ .gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
60
+ .gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
61
+ .gsub('"one"', '1')
62
+ .gsub('"two"', '2')
63
+ .gsub('"three"', '3')
64
+ .gsub('"four"', '4')
65
+ .gsub('"five"', '5')
66
+ .gsub('@method', 'method')
67
+ .gsub(/be_(false|true)/, 'be \1')
68
+ .gsub('mock', 'double')
69
+
70
+ File.open(spec, 'w') { |f| f.puts adapted_content }
71
+ end
72
+ end
73
+ end
74
+
75
# Fetches CaseFolding.txt from unicode.org, keeps the entries of type 'C' and
# writes them as a C array of {from, to} pairs to
# ext/character_set/unicode_casefold_table.h. The downloaded file is removed
# afterwards, even when parsing or writing fails.
desc 'Download unicode casefold data and write new C header file'
task :sync_casefold_data do
  src_path = './CaseFolding.txt'
  dst_path = './ext/character_set/unicode_casefold_table.h'
  src_url  = 'https://www.unicode.org/Public/UNIDATA/CaseFolding.txt'

  begin
    # `sh` aborts the task when wget exits non-zero, instead of failing later
    # with a confusing "file not found". `-O` makes wget write to src_path even
    # if a stale copy exists (it would otherwise save to "CaseFolding.txt.1"
    # and the stale file would be parsed).
    sh 'wget', '-O', src_path, src_url

    mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
      from, type, to = line.split(/\s*;\s*/).first(3)
      # type 'C' stands for 'common', excludes mappings to multiple chars
      hash[from] = to if type == 'C'
    end.sort
    # NOTE(review): the keys are hex strings, so this sort is lexicographic
    # ("10400" sorts before "1E00"), not numeric - confirm that the C code
    # does not depend on numeric ordering of the table.

    File.open(dst_path, 'w') do |f|
      f.puts <<-C
// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT

typedef struct casefold_mapping {
  unsigned long from;
  unsigned long to;
} casefold_mapping;

#define CASEFOLD_COUNT #{mapping.size}

static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
      C

      mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }

      f.puts '};'
    end
  ensure
    # always clean up the temporary download
    File.unlink(src_path) if File.exist?(src_path)
  end
end
109
+
110
+ desc 'Run all IPS benchmarks'
111
+ task :benchmark do
112
+ Dir['./benchmarks/*.rb'].sort.each { |file| require file }
113
+ end
114
+
115
+ namespace :benchmark do
116
+ desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
117
+ task :write_to_file do
118
+ $store_comparison_results = {}
119
+
120
+ Rake.application[:benchmark].invoke
121
+
122
+ File.open('BENCHMARK.md', 'w') do |f|
123
+ f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
124
+
125
+ $store_comparison_results.each do |caption, result|
126
+ f.puts '```', caption, '',
127
+ result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
128
+ end
129
+ end
130
+ end
131
+ end
132
+
133
+ unless RUBY_PLATFORM =~ /java/
134
+ # recompile before benchmarking or running specs
135
+ task(:benchmark).enhance([:compile])
136
+ task(:spec).enhance([:compile])
137
+ end
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\S/
5
+ cs = CharacterSet.whitespace.inversion
6
+
7
+ benchmark(
8
+ caption: 'Detecting non-whitespace',
9
+ cases: {
10
+ 'Regexp#match?' => -> { rx.match?(str) },
11
+ 'CharacterSet#cover?' => -> { cs.cover?(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lorem ipsum et dolorem'
16
+ rx = /[^a-z]/i
17
+ cs = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z')
18
+
19
+ benchmark(
20
+ caption: 'Detecting non-letters',
21
+ cases: {
22
+ 'Regexp#match?' => -> { rx.match?(str) },
23
+ 'CharacterSet#cover?' => -> { cs.cover?(str) },
24
+ }
25
+ )
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\s/
5
+ cs = CharacterSet.whitespace
6
+
7
+ benchmark(
8
+ caption: 'Removing whitespace',
9
+ cases: {
10
+ 'String#gsub' => -> { str.gsub(rx, '') },
11
+ 'CharacterSet#delete_in' => -> { cs.delete_in(str) },
12
+ }
13
+ )
14
+
15
# Second scenario: remove whitespace, emoji and German umlauts in one pass,
# comparing String#gsub with a character class against CharacterSet#delete_in.
str = 'Lörem ipsüm ⛷ et dölörem'
rx = /[\s\p{emoji}äüö]/
# Spell out the constant: the shared benchmark helper does not define a `CS`
# alias, so relying on it would raise a NameError when this file is run
# through `rake benchmark`.
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ü', 'ö']

benchmark(
  caption: 'Removing whitespace, emoji and umlauts',
  cases: {
    'String#gsub' => -> { str.gsub(rx, '') },
    'CharacterSet#delete_in' => -> { cs.delete_in(str) },
  }
)
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\S/
5
+ cs = CharacterSet.whitespace
6
+
7
+ benchmark(
8
+ caption: 'Removing non-whitespace',
9
+ cases: {
10
+ 'String#gsub' => -> { str.gsub(rx, '') },
11
+ 'CharacterSet#keep_in' => -> { cs.keep_in(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lorem ipsum ⛷ et dolorem'
16
+ rx = /\p{^emoji}/
17
+ cs = CharacterSet.emoji
18
+
19
+ benchmark(
20
+ caption: 'Extracting emoji',
21
+ cases: {
22
+ 'String#gsub' => -> { str.gsub(rx, '') },
23
+ 'CharacterSet#keep_in' => -> { cs.keep_in(str) },
24
+ }
25
+ )
@@ -0,0 +1,25 @@
1
+ lib = File.expand_path('../lib', __dir__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
+ require 'benchmark/ips'
5
+ require 'character_set'
6
+
7
# Runs a benchmark-ips comparison of the given cases and prints the result.
#
# caption - text printed before the run; also used as the key under which the
#           comparison output is stored (see below).
# cases   - Hash mapping a report label to a callable that is measured.
#
# When the global $store_comparison_results is set, the comparison is rendered
# once more with $stdout temporarily swapped for a StringIO, and the captured
# text is stored in $store_comparison_results[caption].
def benchmark(caption: nil, cases: {})
  puts caption

  report = Benchmark.ips do |x|
    cases.each do |label, callable|
      x.report(label, &callable)
    end
    x.compare!
  end

  return unless $store_comparison_results

  # Keep a reference to the real stream (not a clone), so that exactly the
  # same object is put back afterwards and no extra IO handle is created.
  original_stdout = $stdout
  captured_stdout = StringIO.new
  begin
    $stdout = captured_stdout
    report.run_comparison
  ensure
    # restore the stream even if rendering the comparison raises
    $stdout = original_stdout
  end

  $store_comparison_results[caption] = captured_stdout.string
end
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\s/
5
+ cs = CharacterSet.whitespace
6
+
7
+ benchmark(
8
+ caption: 'Detecting whitespace',
9
+ cases: {
10
+ 'Regexp#match?' => -> { rx.match?(str) },
11
+ 'CharacterSet#used_by?' => -> { cs.used_by?(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lorem ipsum et dolorem' * 20 + '⛷' + 'Lorem ipsum et dolorem' * 20
16
+ rx = /\p{emoji}/
17
+ cs = CharacterSet.emoji
18
+
19
+ benchmark(
20
+ caption: 'Detecting emoji in a large string',
21
+ cases: {
22
+ 'Regexp#match?' => -> { rx.match?(str) },
23
+ 'CharacterSet#used_by?' => -> { cs.used_by?(str) },
24
+ }
25
+ )
data/bin/console ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+
5
+ require 'character_set'
6
+ require 'character_set/core_ext'
7
+ require 'character_set/pure'
8
+
9
+ require 'regexp_property_values'
10
+
11
+ CS = CharacterSet
12
+ CP = CharacterSet::Pure
13
+ PV = RegexpPropertyValues
14
+
15
+ require 'benchmark'
16
+ def m(&block); Benchmark.measure(&block); end
17
+
18
+ require "irb"
19
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
+ require 'character_set/version'
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = 'character_set'
8
+ s.version = CharacterSet::VERSION
9
+ s.authors = ['Janosch Müller']
10
+ s.email = ['janosch84@gmail.com']
11
+
12
+ s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
13
+ s.homepage = 'https://github.com/janosch-x/character_set'
14
+ s.license = 'MIT'
15
+
16
+ s.files = `git ls-files -z`.split("\x0").reject do |f|
17
+ f.match(%r{^(test|spec|features)/})
18
+ end
19
+ s.require_paths = ['lib']
20
+
21
+ s.extensions = %w[ext/character_set/extconf.rb]
22
+
23
+ s.required_ruby_version = '>= 2.1.0'
24
+
25
+ s.add_dependency 'range_compressor', '~> 1.0'
26
+
27
+ s.add_development_dependency 'benchmark-ips', '~> 2.7'
28
+ s.add_development_dependency 'bundler', '~> 1.16'
29
+ s.add_development_dependency 'rake', '~> 10.0'
30
+ s.add_development_dependency 'rake-compiler', '~> 1.0'
31
+ s.add_development_dependency 'regexp_parser', '~> 1.0'
32
+ s.add_development_dependency 'regexp_property_values', '~> 0.3.2'
33
+ s.add_development_dependency 'rspec', '~> 3.0'
34
+ end