character_set 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d2e4067480e00d5d03db2bbd1ee4f222f936e0f2
4
+ data.tar.gz: 0e4c0bc6cf393b1a81dc368ee86f94d0dea10a82
5
+ SHA512:
6
+ metadata.gz: d9150168393512190a496ed10af91a1eaa49eb2a01d3fb623de9586eb4fbd354dfea172bf6174ab180f6620ae6ca13a01f94ec26a95fbf118f48f611b4d7acd7
7
+ data.tar.gz: cb4b067fae5c8a550267a0dcef7708b30d36598b2ed18981711ad9b4a67b23cbf444270f7006d160e50f151ba32fe3402108429d415f7adbfb0be9160fedfda7
data/.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ *.bundle
2
+ *.gem
3
+ *.iml
4
+ *.stTheme.cache
5
+ *.sublime-project
6
+ *.sublime-workspace
7
+ *.swp
8
+ *.tmlanguage.cache
9
+ *.tmPreferences.cache
10
+ *~
11
+ .byebug_history
12
+ .DS_Store
13
+ .idea/
14
+ .ruby-gemset
15
+ .ruby-version
16
+ .tags
17
+ .tags1
18
+ bbin/
19
+ binstubs/*
20
+ bundler_stubs/*/.yardoc
21
+ Gemfile.lock
22
+ /.bundle/
23
+ /_yardoc/
24
+ /coverage/
25
+ /doc/
26
+ /pkg/
27
+ /spec/reports/
28
+ /tmp/
29
+
30
+ # rspec failure tracking
31
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.1
5
+ - 2.4
6
+ - 2.5
7
+ - 2.6
8
+ - jruby-9.1.9.0
9
+ before_install:
10
+ - gem update --system
11
+ - gem install bundler
data/BENCHMARK.md ADDED
@@ -0,0 +1,50 @@
1
+ Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
2
+
3
+ ```
4
+ Detecting non-whitespace
5
+
6
+ CharacterSet#cover?: 13244577.7 i/s
7
+ Regexp#match?: 8027017.5 i/s - 1.65x slower
8
+ ```
9
+ ```
10
+ Detecting non-letters
11
+
12
+ CharacterSet#cover?: 13082940.8 i/s
13
+ Regexp#match?: 5372589.2 i/s - 2.44x slower
14
+ ```
15
+ ```
16
+ Removing whitespace
17
+
18
+ CharacterSet#delete_in: 389315.6 i/s
19
+ String#gsub: 223773.5 i/s - 1.74x slower
20
+ ```
21
+ ```
22
+ Removing whitespace, emoji and umlauts
23
+
24
+ CharacterSet#delete_in: 470239.3 i/s
25
+ String#gsub: 278679.4 i/s - 1.69x slower
26
+ ```
27
+ ```
28
+ Removing non-whitespace
29
+
30
+ CharacterSet#keep_in: 1138461.0 i/s
31
+ String#gsub: 235287.4 i/s - 4.84x slower
32
+ ```
33
+ ```
34
+ Extracting emoji
35
+
36
+ CharacterSet#keep_in: 1474472.0 i/s
37
+ String#gsub: 212269.6 i/s - 6.95x slower
38
+ ```
39
+ ```
40
+ Detecting whitespace
41
+
42
+ CharacterSet#used_by?: 13063108.7 i/s
43
+ Regexp#match?: 7215075.0 i/s - 1.81x slower
44
+ ```
45
+ ```
46
+ Detecting emoji in a large string
47
+
48
+ CharacterSet#used_by?: 246527.7 i/s
49
+ Regexp#match?: 92956.5 i/s - 2.65x slower
50
+ ```
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in character_set.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Janosch Müller
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,180 @@
1
+ # CharacterSet
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
+ [![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
5
+
6
+ A gem to build, read, write and compare sets of Unicode codepoints.
7
+
8
+ Many parts can be used independently, e.g.:
9
+ - `CharacterSet::Character`
10
+ - `CharacterSet::Parser`
11
+ - `CharacterSet::Writer`
12
+ - [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
13
+
14
+ ## Usage
15
+
16
+ ### Parse/Initialize
17
+
18
+ These all produce a `CharacterSet` containing `a`, `b` and `c`:
19
+
20
+ ```ruby
21
+ CharacterSet['a', 'b', 'c']
22
+ CharacterSet[97, 98, 99]
23
+ CharacterSet.new('a'..'c')
24
+ CharacterSet.new(0x61..0x63)
25
+ CharacterSet.of('abacababa')
26
+ CharacterSet.parse('[a-c]')
27
+ CharacterSet.parse('\U00000061-\U00000063')
28
+ ```
29
+
30
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting:
31
+
32
+ ```ruby
33
+ # are there any non-digit ascii chars classified as emoji?
34
+ set = CharacterSet.of_regexp(/[\D&&[:ascii:]&&\p{emoji}]/)
35
+
36
+ # ... of course there are!
37
+ set.to_a(stringify: true) # => ["#", "*"]
38
+
39
+ # with the core extension:
40
+ require 'character_set/core_ext/regexp_ext'
41
+ /[a-e&&[^c]]/.character_set # => CharacterSet['a', 'b', 'd', 'e']
42
+ ```
43
+
44
+ ### Common utility sets
45
+
46
+ ```ruby
47
+ CharacterSet.ascii
48
+ CharacterSet.bmp
49
+ CharacterSet.crypt
50
+ CharacterSet.emoji
51
+ CharacterSet.newline
52
+ CharacterSet.unicode
53
+ CharacterSet.url_fragment
54
+ CharacterSet.url_host
55
+ CharacterSet.url_path
56
+ CharacterSet.url_query
57
+ CharacterSet.whitespace
58
+
59
+ # e.g.
60
+ CharacterSet.url_query.cover?('?a=(b$c;)') # => true
61
+ CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"]
62
+
63
+ # all can be prefixed with `non_`, e.g.
64
+ CharacterSet.non_ascii.delete_in(string)
65
+ ```
66
+
67
+ ### Interact with Strings
68
+
69
+ `CharacterSet` can replace some `Regexp` operations on Strings with better performance (see [benchmarks](./BENCHMARK.md)).
70
+
71
+ `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
72
+
73
+ ```ruby
74
+ CharacterSet.ascii.used_by?('Tüür') # => true
75
+ CharacterSet.ascii.cover?('Tüür') # => false
76
+ CharacterSet.ascii.cover?('Tr') # => true
77
+ ```
78
+
79
+ `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
80
+ ```ruby
81
+ string = 'Tüür'
82
+
83
+ CharacterSet.ascii.delete_in(string) # => 'üü'
84
+ CharacterSet.ascii.keep_in(string) # => 'Tr'
85
+ string # => 'Tüür'
86
+
87
+ CharacterSet.ascii.delete_in!(string) # => 'üü'
88
+ string # => 'üü'
89
+ CharacterSet.ascii.keep_in!(string) # => ''
90
+ string # => ''
91
+ ```
92
+
93
+ There is also a core extension for String interaction.
94
+ ```ruby
95
+ require 'character_set/core_ext/string_ext'
96
+
97
+ "a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"]
98
+ "a\rb".uses_character_set?(CharacterSet.emoji) # => false
99
+ "a\rb".covered_by_character_set?(CharacterSet.newline) # => false
100
+ "a\rb".delete_character_set(CharacterSet.newline) # => 'ab'
101
+ # etc.
102
+ ```
103
+
104
+ ### Manipulate
105
+
106
+ Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
107
+
108
+ Where appropriate, methods take both chars and codepoints, e.g.:
109
+
110
+ ```ruby
111
+ CharacterSet['a'].add('b') # => CharacterSet['a', 'b']
112
+ CharacterSet['a'].add(98) # => CharacterSet['a', 'b']
113
+ CharacterSet['a'].include?('a') # => true
114
+ CharacterSet['a'].include?(0x61) # => true
115
+ ```
116
+
117
+ `#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set:
118
+
119
+ ```ruby
120
+ non_a = CharacterSet['a'].inversion
121
+ # => #<CharacterSet (size: 1112063)>
122
+
123
+ non_a.include?('a') # => false
124
+ non_a.include?('ü') # => true
125
+
126
+ # surrogate pair halves are not included by default
127
+ CharacterSet['a'].inversion(include_surrogates: true)
128
+ # => #<CharacterSet (size: 1114111)>
129
+ ```
130
+
131
+ `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
132
+
133
+ ```ruby
134
+ CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
135
+ ```
136
+
137
+ ### Write
138
+ ```ruby
139
+ set = CharacterSet['a', 'b', 'c', 'j', '-']
140
+
141
+ # safely printable ASCII chars are not escaped by default
142
+ set.to_s # => 'a-cj\x2D'
143
+ set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D'
144
+
145
+ # brackets may be added
146
+ set.to_s(in_brackets: true) # => '[a-cj\x2D]'
147
+
148
+ # the default escape format is Ruby/ES6 compatible, others are available
149
+ set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩']
150
+ set.to_s # => 'a-c\u0258\u{1F929}'
151
+ set.to_s(format: 'U+') # => 'a-cU+0258U+1F929'
152
+ set.to_s(format: 'Python') # => "a-c\u0258\U0001F929"
153
+ set.to_s(format: 'raw') # => 'a-cɘ🤩'
154
+
155
+ # or pass a block
156
+ set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]"
157
+ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
158
+
159
+ # disable abbreviation (grouping of codepoints in ranges)
160
+ set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
161
+
162
+ # for full js regex compatibility in case of astral members:
163
+ set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
164
+ ```
165
+
166
+ ### Unicode plane methods
167
+
168
+ There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
169
+ ```ruby
170
+ CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
171
+ CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
172
+ CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
173
+ CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
174
+ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
175
+ CharacterSet::Character.new('a').plane # => 0
176
+ ```
177
+
178
+ ### Contributions
179
+
180
+ Feel free to send suggestions, point out issues, or submit pull requests.
data/Rakefile ADDED
@@ -0,0 +1,137 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+ require 'rubygems/package_task'
4
+ require 'rake/extensiontask'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
9
+
10
+ Rake::ExtensionTask.new('character_set') do |ext|
11
+ ext.lib_dir = 'lib/character_set'
12
+ end
13
+
14
+ namespace :java do
15
+ java_gemspec = eval File.read('./character_set.gemspec')
16
+ java_gemspec.platform = 'java'
17
+ java_gemspec.extensions = []
18
+
19
+ Gem::PackageTask.new(java_gemspec) do |pkg|
20
+ pkg.need_zip = true
21
+ pkg.need_tar = true
22
+ pkg.package_dir = 'pkg'
23
+ end
24
+ end
25
+
26
+ task package: 'java:gem'
27
+
28
+ desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
29
+ task :sync_ruby_spec do
30
+ require 'fileutils'
31
+
32
+ variants = {
33
+ 'CharacterSet' => './spec/ruby-spec/library/character_set',
34
+ 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
35
+ }
36
+ variants.each do |_, dir|
37
+ FileUtils.rm_rf(dir) if File.exist?(dir)
38
+ `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
39
+ end
40
+
41
+ base = variants.first[1]
42
+ variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
43
+
44
+ variants.each.with_index do |(class_name, dir), i|
45
+ Dir["#{dir}/**/*.rb"].each do |spec|
46
+ # remove some tests that do not apply or are covered otherwise
47
+ if spec =~ %r{/(flatten|initialize|pretty_print)}
48
+ File.delete(spec)
49
+ next
50
+ end
51
+
52
+ # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
53
+ # and `i` added to shared example names or they'll override each other
54
+ adapted_content =
55
+ File
56
+ .read(spec)
57
+ .gsub('SortedSet', class_name)
58
+ .gsub('sorted_set_', "sorted_set_#{i}_")
59
+ .gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
60
+ .gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
61
+ .gsub('"one"', '1')
62
+ .gsub('"two"', '2')
63
+ .gsub('"three"', '3')
64
+ .gsub('"four"', '4')
65
+ .gsub('"five"', '5')
66
+ .gsub('@method', 'method')
67
+ .gsub(/be_(false|true)/, 'be \1')
68
+ .gsub('mock', 'double')
69
+
70
+ File.open(spec, 'w') { |f| f.puts adapted_content }
71
+ end
72
+ end
73
+ end
74
+
75
+ desc 'Download unicode casefold data and write new C header file'
76
+ task :sync_casefold_data do
77
+ src_path = './CaseFolding.txt'
78
+ dst_path = './ext/character_set/unicode_casefold_table.h'
79
+
80
+ `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
81
+
82
+ mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
83
+ from, type, to = line.split(/\s*;\s*/).first(3)
84
+ # type 'C' stands for 'common', excludes mappings to multiple chars
85
+ hash[from] = to if type == 'C'
86
+ end.sort
87
+
88
+ File.open(dst_path, 'w') do |f|
89
+ f.puts <<-C
90
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
91
+
92
+ typedef struct casefold_mapping {
93
+ unsigned long from;
94
+ unsigned long to;
95
+ } casefold_mapping;
96
+
97
+ #define CASEFOLD_COUNT #{mapping.size}
98
+
99
+ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
100
+ C
101
+
102
+ mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
103
+
104
+ f.puts '};'
105
+ end
106
+
107
+ File.unlink(src_path)
108
+ end
109
+
110
+ desc 'Run all IPS benchmarks'
111
+ task :benchmark do
112
+ Dir['./benchmarks/*.rb'].sort.each { |file| require file }
113
+ end
114
+
115
+ namespace :benchmark do
116
+ desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
117
+ task :write_to_file do
118
+ $store_comparison_results = {}
119
+
120
+ Rake.application[:benchmark].invoke
121
+
122
+ File.open('BENCHMARK.md', 'w') do |f|
123
+ f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
124
+
125
+ $store_comparison_results.each do |caption, result|
126
+ f.puts '```', caption, '',
127
+ result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
128
+ end
129
+ end
130
+ end
131
+ end
132
+
133
+ unless RUBY_PLATFORM =~ /java/
134
+ # recompile before benchmarking or running specs
135
+ task(:benchmark).enhance([:compile])
136
+ task(:spec).enhance([:compile])
137
+ end
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\S/
5
+ cs = CharacterSet.whitespace.inversion
6
+
7
+ benchmark(
8
+ caption: 'Detecting non-whitespace',
9
+ cases: {
10
+ 'Regexp#match?' => -> { rx.match?(str) },
11
+ 'CharacterSet#cover?' => -> { cs.cover?(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lorem ipsum et dolorem'
16
+ rx = /[^a-z]/i
17
+ cs = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z')
18
+
19
+ benchmark(
20
+ caption: 'Detecting non-letters',
21
+ cases: {
22
+ 'Regexp#match?' => -> { rx.match?(str) },
23
+ 'CharacterSet#cover?' => -> { cs.cover?(str) },
24
+ }
25
+ )
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\s/
5
+ cs = CharacterSet.whitespace
6
+
7
+ benchmark(
8
+ caption: 'Removing whitespace',
9
+ cases: {
10
+ 'String#gsub' => -> { str.gsub(rx, '') },
11
+ 'CharacterSet#delete_in' => -> { cs.delete_in(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lörem ipsüm ⛷ et dölörem'
16
+ rx = /[\s\p{emoji}äüö]/
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ü', 'ö']
18
+
19
+ benchmark(
20
+ caption: 'Removing whitespace, emoji and umlauts',
21
+ cases: {
22
+ 'String#gsub' => -> { str.gsub(rx, '') },
23
+ 'CharacterSet#delete_in' => -> { cs.delete_in(str) },
24
+ }
25
+ )
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\S/
5
+ cs = CharacterSet.whitespace
6
+
7
+ benchmark(
8
+ caption: 'Removing non-whitespace',
9
+ cases: {
10
+ 'String#gsub' => -> { str.gsub(rx, '') },
11
+ 'CharacterSet#keep_in' => -> { cs.keep_in(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lorem ipsum ⛷ et dolorem'
16
+ rx = /\p{^emoji}/
17
+ cs = CharacterSet.emoji
18
+
19
+ benchmark(
20
+ caption: 'Extracting emoji',
21
+ cases: {
22
+ 'String#gsub' => -> { str.gsub(rx, '') },
23
+ 'CharacterSet#keep_in' => -> { cs.keep_in(str) },
24
+ }
25
+ )
@@ -0,0 +1,25 @@
1
+ lib = File.expand_path('../lib', __dir__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
+ require 'benchmark/ips'
5
+ require 'character_set'
6
+
7
+ def benchmark(caption: nil, cases: {})
8
+ puts caption
9
+
10
+ report = Benchmark.ips do |x|
11
+ cases.each do |label, callable|
12
+ x.report(label, &callable)
13
+ end
14
+ x.compare!
15
+ end
16
+
17
+ return unless $store_comparison_results
18
+
19
+ old_stdout = $stdout.clone
20
+ captured_stdout = StringIO.new
21
+ $stdout = captured_stdout
22
+ report.run_comparison
23
+ $store_comparison_results[caption] = captured_stdout.string
24
+ $stdout = old_stdout
25
+ end
@@ -0,0 +1,25 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ rx = /\s/
5
+ cs = CharacterSet.whitespace
6
+
7
+ benchmark(
8
+ caption: 'Detecting whitespace',
9
+ cases: {
10
+ 'Regexp#match?' => -> { rx.match?(str) },
11
+ 'CharacterSet#used_by?' => -> { cs.used_by?(str) },
12
+ }
13
+ )
14
+
15
+ str = 'Lorem ipsum et dolorem' * 20 + '⛷' + 'Lorem ipsum et dolorem' * 20
16
+ rx = /\p{emoji}/
17
+ cs = CharacterSet.emoji
18
+
19
+ benchmark(
20
+ caption: 'Detecting emoji in a large string',
21
+ cases: {
22
+ 'Regexp#match?' => -> { rx.match?(str) },
23
+ 'CharacterSet#used_by?' => -> { cs.used_by?(str) },
24
+ }
25
+ )
data/bin/console ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+
5
+ require 'character_set'
6
+ require 'character_set/core_ext'
7
+ require 'character_set/pure'
8
+
9
+ require 'regexp_property_values'
10
+
11
+ CS = CharacterSet
12
+ CP = CharacterSet::Pure
13
+ PV = RegexpPropertyValues
14
+
15
+ require 'benchmark'
16
+ def m(&block); Benchmark.measure(&block); end
17
+
18
+ require "irb"
19
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
+ require 'character_set/version'
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = 'character_set'
8
+ s.version = CharacterSet::VERSION
9
+ s.authors = ['Janosch Müller']
10
+ s.email = ['janosch84@gmail.com']
11
+
12
+ s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
13
+ s.homepage = 'https://github.com/janosch-x/character_set'
14
+ s.license = 'MIT'
15
+
16
+ s.files = `git ls-files -z`.split("\x0").reject do |f|
17
+ f.match(%r{^(test|spec|features)/})
18
+ end
19
+ s.require_paths = ['lib']
20
+
21
+ s.extensions = %w[ext/character_set/extconf.rb]
22
+
23
+ s.required_ruby_version = '>= 2.1.0'
24
+
25
+ s.add_dependency 'range_compressor', '~> 1.0'
26
+
27
+ s.add_development_dependency 'benchmark-ips', '~> 2.7'
28
+ s.add_development_dependency 'bundler', '~> 1.16'
29
+ s.add_development_dependency 'rake', '~> 10.0'
30
+ s.add_development_dependency 'rake-compiler', '~> 1.0'
31
+ s.add_development_dependency 'regexp_parser', '~> 1.0'
32
+ s.add_development_dependency 'regexp_property_values', '~> 0.3.2'
33
+ s.add_development_dependency 'rspec', '~> 3.0'
34
+ end