character_set 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: d2e4067480e00d5d03db2bbd1ee4f222f936e0f2
         | 
| 4 | 
            +
              data.tar.gz: 0e4c0bc6cf393b1a81dc368ee86f94d0dea10a82
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: d9150168393512190a496ed10af91a1eaa49eb2a01d3fb623de9586eb4fbd354dfea172bf6174ab180f6620ae6ca13a01f94ec26a95fbf118f48f611b4d7acd7
         | 
| 7 | 
            +
              data.tar.gz: cb4b067fae5c8a550267a0dcef7708b30d36598b2ed18981711ad9b4a67b23cbf444270f7006d160e50f151ba32fe3402108429d415f7adbfb0be9160fedfda7
         | 
    
        data/.gitignore
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
| 1 | 
            +
            *.bundle
         | 
| 2 | 
            +
            *.gem
         | 
| 3 | 
            +
            *.iml
         | 
| 4 | 
            +
            *.stTheme.cache
         | 
| 5 | 
            +
            *.sublime-project
         | 
| 6 | 
            +
            *.sublime-workspace
         | 
| 7 | 
            +
            *.swp
         | 
| 8 | 
            +
            *.tmlanguage.cache
         | 
| 9 | 
            +
            *.tmPreferences.cache
         | 
| 10 | 
            +
            *~
         | 
| 11 | 
            +
            .byebug_history
         | 
| 12 | 
            +
            .DS_Store
         | 
| 13 | 
            +
            .idea/
         | 
| 14 | 
            +
            .ruby-gemset
         | 
| 15 | 
            +
            .ruby-version
         | 
| 16 | 
            +
            .tags
         | 
| 17 | 
            +
            .tags1
         | 
| 18 | 
            +
            bbin/
         | 
| 19 | 
            +
            binstubs/*
         | 
| 20 | 
            +
            bundler_stubs/*/.yardoc
         | 
| 21 | 
            +
            Gemfile.lock
         | 
| 22 | 
            +
            /.bundle/
         | 
| 23 | 
            +
            /_yardoc/
         | 
| 24 | 
            +
            /coverage/
         | 
| 25 | 
            +
            /doc/
         | 
| 26 | 
            +
            /pkg/
         | 
| 27 | 
            +
            /spec/reports/
         | 
| 28 | 
            +
            /tmp/
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            # rspec failure tracking
         | 
| 31 | 
            +
            .rspec_status
         | 
    
        data/.rspec
    ADDED
    
    
    
        data/.travis.yml
    ADDED
    
    
    
        data/BENCHMARK.md
    ADDED
    
    | @@ -0,0 +1,50 @@ | |
| 1 | 
            +
            Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            ```
         | 
| 4 | 
            +
            Detecting non-whitespace
         | 
| 5 | 
            +
             | 
| 6 | 
            +
             CharacterSet#cover?: 13244577.7 i/s
         | 
| 7 | 
            +
                   Regexp#match?:  8027017.5 i/s - 1.65x  slower
         | 
| 8 | 
            +
            ```
         | 
| 9 | 
            +
            ```
         | 
| 10 | 
            +
            Detecting non-letters
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             CharacterSet#cover?: 13082940.8 i/s
         | 
| 13 | 
            +
                   Regexp#match?:  5372589.2 i/s - 2.44x  slower
         | 
| 14 | 
            +
            ```
         | 
| 15 | 
            +
            ```
         | 
| 16 | 
            +
            Removing whitespace
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            CharacterSet#delete_in:   389315.6 i/s
         | 
| 19 | 
            +
                     String#gsub:   223773.5 i/s - 1.74x  slower
         | 
| 20 | 
            +
            ```
         | 
| 21 | 
            +
            ```
         | 
| 22 | 
            +
            Removing whitespace, emoji and umlauts
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            CharacterSet#delete_in:   470239.3 i/s
         | 
| 25 | 
            +
                     String#gsub:   278679.4 i/s - 1.69x  slower
         | 
| 26 | 
            +
            ```
         | 
| 27 | 
            +
            ```
         | 
| 28 | 
            +
            Removing non-whitespace
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            CharacterSet#keep_in:  1138461.0 i/s
         | 
| 31 | 
            +
                     String#gsub:   235287.4 i/s - 4.84x  slower
         | 
| 32 | 
            +
            ```
         | 
| 33 | 
            +
            ```
         | 
| 34 | 
            +
            Extracting emoji
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            CharacterSet#keep_in:  1474472.0 i/s
         | 
| 37 | 
            +
                     String#gsub:   212269.6 i/s - 6.95x  slower
         | 
| 38 | 
            +
            ```
         | 
| 39 | 
            +
            ```
         | 
| 40 | 
            +
            Detecting whitespace
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            CharacterSet#used_by?: 13063108.7 i/s
         | 
| 43 | 
            +
                   Regexp#match?:  7215075.0 i/s - 1.81x  slower
         | 
| 44 | 
            +
            ```
         | 
| 45 | 
            +
            ```
         | 
| 46 | 
            +
            Detecting emoji in a large string
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            CharacterSet#used_by?:   246527.7 i/s
         | 
| 49 | 
            +
                   Regexp#match?:    92956.5 i/s - 2.65x  slower
         | 
| 50 | 
            +
            ```
         | 
    
        data/Gemfile
    ADDED
    
    
    
        data/LICENSE.txt
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
| 1 | 
            +
            The MIT License (MIT)
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2018 Janosch Müller
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in
         | 
| 13 | 
            +
            all copies or substantial portions of the Software.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
         | 
| 21 | 
            +
            THE SOFTWARE.
         | 
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,180 @@ | |
| 1 | 
            +
            # CharacterSet
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            [Gem Version](http://badge.fury.io/rb/character_set)
         | 
| 4 | 
            +
            [Build Status](https://travis-ci.org/janosch-x/character_set)
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            A gem to build, read, write and compare sets of Unicode codepoints.
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            Many parts can be used independently, e.g.:
         | 
| 9 | 
            +
            - `CharacterSet::Character`
         | 
| 10 | 
            +
            - `CharacterSet::Parser`
         | 
| 11 | 
            +
            - `CharacterSet::Writer`
         | 
| 12 | 
            +
            - [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            ## Usage
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            ### Parse/Initialize
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            These all produce a `CharacterSet` containing `a`, `b` and `c`:
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            ```ruby
         | 
| 21 | 
            +
            CharacterSet['a', 'b', 'c']
         | 
| 22 | 
            +
            CharacterSet[97, 98, 99]
         | 
| 23 | 
            +
            CharacterSet.new('a'..'c')
         | 
| 24 | 
            +
            CharacterSet.new(0x61..0x63)
         | 
| 25 | 
            +
            CharacterSet.of('abacababa')
         | 
| 26 | 
            +
            CharacterSet.parse('[a-c]')
         | 
| 27 | 
            +
            CharacterSet.parse('\U00000061-\U00000063')
         | 
| 28 | 
            +
            ```
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting:
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            ```ruby
         | 
| 33 | 
            +
            # are there any non-digit ascii chars classified as emoji?
         | 
| 34 | 
            +
            set = CharacterSet.of_regexp(/[\D&&[:ascii:]&&\p{emoji}]/)
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            # ... of course there are!
         | 
| 37 | 
            +
            set.to_a(stringify: true) # => ["#", "*"]
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            # with the core extension:
         | 
| 40 | 
            +
            require 'character_set/core_ext/regexp_ext'
         | 
| 41 | 
            +
            /[a-e&&[^c]]/.character_set # => CharacterSet['a', 'b', 'd', 'e']
         | 
| 42 | 
            +
            ```
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            ### Common utility sets
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            ```ruby
         | 
| 47 | 
            +
            CharacterSet.ascii
         | 
| 48 | 
            +
            CharacterSet.bmp
         | 
| 49 | 
            +
            CharacterSet.crypt
         | 
| 50 | 
            +
            CharacterSet.emoji
         | 
| 51 | 
            +
            CharacterSet.newline
         | 
| 52 | 
            +
            CharacterSet.unicode
         | 
| 53 | 
            +
            CharacterSet.url_fragment
         | 
| 54 | 
            +
            CharacterSet.url_host
         | 
| 55 | 
            +
            CharacterSet.url_path
         | 
| 56 | 
            +
            CharacterSet.url_query
         | 
| 57 | 
            +
            CharacterSet.whitespace
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            # e.g.
         | 
| 60 | 
            +
            CharacterSet.url_query.cover?('?a=(b$c;)') # => true
         | 
| 61 | 
            +
            CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"]
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            # all can be prefixed with `non_`, e.g.
         | 
| 64 | 
            +
            CharacterSet.non_ascii.delete_in(string)
         | 
| 65 | 
            +
            ```
         | 
| 66 | 
            +
             | 
| 67 | 
            +
            ### Interact with Strings
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            ```ruby
         | 
| 74 | 
            +
            CharacterSet.ascii.used_by?('Tüür') # => true
         | 
| 75 | 
            +
            CharacterSet.ascii.cover?('Tüür') # => false
         | 
| 76 | 
            +
            CharacterSet.ascii.cover?('Tr') # => true
         | 
| 77 | 
            +
            ```
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
         | 
| 80 | 
            +
            ```ruby
         | 
| 81 | 
            +
            string = 'Tüür'
         | 
| 82 | 
            +
             | 
| 83 | 
            +
            CharacterSet.ascii.delete_in(string) # => 'üü'
         | 
| 84 | 
            +
            CharacterSet.ascii.keep_in(string) # => 'Tr'
         | 
| 85 | 
            +
            string # => 'Tüür'
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            CharacterSet.ascii.delete_in!(string) # => 'üü'
         | 
| 88 | 
            +
            string # => 'üü'
         | 
| 89 | 
            +
            CharacterSet.ascii.keep_in!(string) # => ''
         | 
| 90 | 
            +
            string # => ''
         | 
| 91 | 
            +
            ```
         | 
| 92 | 
            +
             | 
| 93 | 
            +
            There is also a core extension for String interaction.
         | 
| 94 | 
            +
            ```ruby
         | 
| 95 | 
            +
            require 'character_set/core_ext/string_ext'
         | 
| 96 | 
            +
             | 
| 97 | 
            +
            "a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"]
         | 
| 98 | 
            +
            "a\rb".uses_character_set?(CharacterSet.emoji) # => false
         | 
| 99 | 
            +
            "a\rb".covered_by_character_set?(CharacterSet.newline) # => false
         | 
| 100 | 
            +
            "a\rb".delete_character_set(CharacterSet.newline) # => 'ab'
         | 
| 101 | 
            +
            # etc.
         | 
| 102 | 
            +
            ```
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            ### Manipulate
         | 
| 105 | 
            +
             | 
| 106 | 
            +
            Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            Where appropriate, methods take both chars and codepoints, e.g.:
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            ```ruby
         | 
| 111 | 
            +
            CharacterSet['a'].add('b') # => CharacterSet['a', 'b']
         | 
| 112 | 
            +
            CharacterSet['a'].add(98) # => CharacterSet['a', 'b']
         | 
| 113 | 
            +
            CharacterSet['a'].include?('a') # => true
         | 
| 114 | 
            +
            CharacterSet['a'].include?(0x61) # => true
         | 
| 115 | 
            +
            ```
         | 
| 116 | 
            +
             | 
| 117 | 
            +
            `#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set:
         | 
| 118 | 
            +
             | 
| 119 | 
            +
            ```ruby
         | 
| 120 | 
            +
            non_a = CharacterSet['a'].inversion
         | 
| 121 | 
            +
            # => #<CharacterSet (size: 1112063)>
         | 
| 122 | 
            +
             | 
| 123 | 
            +
            non_a.include?('a') # => false
         | 
| 124 | 
            +
            non_a.include?('ü') # => true
         | 
| 125 | 
            +
             | 
| 126 | 
            +
            # surrogate pair halves are not included by default
         | 
| 127 | 
            +
            CharacterSet['a'].inversion(include_surrogates: true)
         | 
| 128 | 
            +
            # => #<CharacterSet (size: 1114111)>
         | 
| 129 | 
            +
            ```
         | 
| 130 | 
            +
             | 
| 131 | 
            +
            `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            ```ruby
         | 
| 134 | 
            +
            CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
         | 
| 135 | 
            +
            ```
         | 
| 136 | 
            +
             | 
| 137 | 
            +
            ### Write
         | 
| 138 | 
            +
            ```ruby
         | 
| 139 | 
            +
            set = CharacterSet['a', 'b', 'c', 'j', '-']
         | 
| 140 | 
            +
             | 
| 141 | 
            +
            # safely printable ASCII chars are not escaped by default
         | 
| 142 | 
            +
            set.to_s # => 'a-cj\x2D'
         | 
| 143 | 
            +
            set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D'
         | 
| 144 | 
            +
             | 
| 145 | 
            +
            # brackets may be added
         | 
| 146 | 
            +
            set.to_s(in_brackets: true) # => '[a-cj\x2D]'
         | 
| 147 | 
            +
             | 
| 148 | 
            +
            # the default escape format is Ruby/ES6 compatible, others are available
         | 
| 149 | 
            +
            set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩']
         | 
| 150 | 
            +
            set.to_s # => 'a-c\u0258\u{1F929}'
         | 
| 151 | 
            +
            set.to_s(format: 'U+') # => 'a-cU+0258U+1F929'
         | 
| 152 | 
            +
            set.to_s(format: 'Python') # => "a-c\u0258\U0001F929"
         | 
| 153 | 
            +
            set.to_s(format: 'raw') # => 'a-cɘ🤩'
         | 
| 154 | 
            +
             | 
| 155 | 
            +
            # or pass a block
         | 
| 156 | 
            +
            set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]"
         | 
| 157 | 
            +
            set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
         | 
| 158 | 
            +
             | 
| 159 | 
            +
            # disable abbreviation (grouping of codepoints in ranges)
         | 
| 160 | 
            +
            set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
         | 
| 161 | 
            +
             | 
| 162 | 
            +
            # for full js regex compatibility in case of astral members:
         | 
| 163 | 
            +
            set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
         | 
| 164 | 
            +
            ```
         | 
| 165 | 
            +
             | 
| 166 | 
            +
            ### Unicode plane methods
         | 
| 167 | 
            +
             | 
| 168 | 
            +
            There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
         | 
| 169 | 
            +
            ```Ruby
         | 
| 170 | 
            +
            CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
         | 
| 171 | 
            +
            CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
         | 
| 172 | 
            +
            CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
         | 
| 173 | 
            +
            CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
         | 
| 174 | 
            +
            CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
         | 
| 175 | 
            +
            CharacterSet::Character.new('a').plane # => 0
         | 
| 176 | 
            +
            ```
         | 
| 177 | 
            +
             | 
| 178 | 
            +
            ### Contributions
         | 
| 179 | 
            +
             | 
| 180 | 
            +
            Feel free to send suggestions, point out issues, or submit pull requests.
         | 
    
        data/Rakefile
    ADDED
    
    | @@ -0,0 +1,137 @@ | |
| 1 | 
            +
            require 'bundler/gem_tasks'
         | 
| 2 | 
            +
            require 'rspec/core/rake_task'
         | 
| 3 | 
            +
            require 'rubygems/package_task'
         | 
| 4 | 
            +
            require 'rake/extensiontask'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            RSpec::Core::RakeTask.new(:spec)
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            task default: :spec
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            Rake::ExtensionTask.new('character_set') do |ext|
         | 
| 11 | 
            +
              ext.lib_dir = 'lib/character_set'
         | 
| 12 | 
            +
            end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            namespace :java do
         | 
| 15 | 
            +
              java_gemspec = eval File.read('./character_set.gemspec')
         | 
| 16 | 
            +
              java_gemspec.platform = 'java'
         | 
| 17 | 
            +
              java_gemspec.extensions = []
         | 
| 18 | 
            +
             | 
| 19 | 
            +
              Gem::PackageTask.new(java_gemspec) do |pkg|
         | 
| 20 | 
            +
                pkg.need_zip = true
         | 
| 21 | 
            +
                pkg.need_tar = true
         | 
| 22 | 
            +
                pkg.package_dir = 'pkg'
         | 
| 23 | 
            +
              end
         | 
| 24 | 
            +
            end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            task package: 'java:gem'
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
         | 
| 29 | 
            +
            task :sync_ruby_spec do
         | 
| 30 | 
            +
              require 'fileutils'
         | 
| 31 | 
            +
             | 
| 32 | 
            +
              variants = {
         | 
| 33 | 
            +
                'CharacterSet'       => './spec/ruby-spec/library/character_set',
         | 
| 34 | 
            +
                'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
         | 
| 35 | 
            +
              }
         | 
| 36 | 
            +
              variants.each do |_, dir|
         | 
| 37 | 
            +
                FileUtils.rm_rf(dir) if File.exist?(dir)
         | 
| 38 | 
            +
                `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
         | 
| 39 | 
            +
              end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
              base = variants.first[1]
         | 
| 42 | 
            +
              variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
         | 
| 43 | 
            +
             | 
| 44 | 
            +
              variants.each.with_index do |(class_name, dir), i|
         | 
| 45 | 
            +
                Dir["#{dir}/**/*.rb"].each do |spec|
         | 
| 46 | 
            +
                  # remove some tests that do not apply or are covered otherwise
         | 
| 47 | 
            +
                  if spec =~ %r{/(flatten|initialize|pretty_print)}
         | 
| 48 | 
            +
                    File.delete(spec)
         | 
| 49 | 
            +
                    next
         | 
| 50 | 
            +
                  end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                  # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
         | 
| 53 | 
            +
                  # and `i` added to shared example names or they'll override each other
         | 
| 54 | 
            +
                  adapted_content =
         | 
| 55 | 
            +
                    File
         | 
| 56 | 
            +
                    .read(spec)
         | 
| 57 | 
            +
                    .gsub('SortedSet', class_name)
         | 
| 58 | 
            +
                    .gsub('sorted_set_', "sorted_set_#{i}_")
         | 
| 59 | 
            +
                    .gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
         | 
| 60 | 
            +
                    .gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
         | 
| 61 | 
            +
                    .gsub('"one"', '1')
         | 
| 62 | 
            +
                    .gsub('"two"', '2')
         | 
| 63 | 
            +
                    .gsub('"three"', '3')
         | 
| 64 | 
            +
                    .gsub('"four"', '4')
         | 
| 65 | 
            +
                    .gsub('"five"', '5')
         | 
| 66 | 
            +
                    .gsub('@method', 'method')
         | 
| 67 | 
            +
                    .gsub(/be_(false|true)/, 'be \1')
         | 
| 68 | 
            +
                    .gsub('mock', 'double')
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                  File.open(spec, 'w') { |f| f.puts adapted_content }
         | 
| 71 | 
            +
                end
         | 
| 72 | 
            +
              end
         | 
| 73 | 
            +
            end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            desc 'Download unicode casefold data and write new C header file'
         | 
| 76 | 
            +
            task :sync_casefold_data do
         | 
| 77 | 
            +
              src_path = './CaseFolding.txt'
         | 
| 78 | 
            +
              dst_path = './ext/character_set/unicode_casefold_table.h'
         | 
| 79 | 
            +
             | 
| 80 | 
            +
              `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
         | 
| 81 | 
            +
             | 
| 82 | 
            +
              mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
         | 
| 83 | 
            +
                from, type, to = line.split(/\s*;\s*/).first(3)
         | 
| 84 | 
            +
                # type 'C' stands for 'common', excludes mappings to multiple chars
         | 
| 85 | 
            +
                hash[from] = to if type == 'C'
         | 
| 86 | 
            +
              end.sort
         | 
| 87 | 
            +
             | 
| 88 | 
            +
              File.open(dst_path, 'w') do |f|
         | 
| 89 | 
            +
                f.puts <<-C
         | 
| 90 | 
            +
            // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            typedef struct casefold_mapping {
         | 
| 93 | 
            +
              unsigned long from;
         | 
| 94 | 
            +
              unsigned long to;
         | 
| 95 | 
            +
            } casefold_mapping;
         | 
| 96 | 
            +
             | 
| 97 | 
            +
            #define CASEFOLD_COUNT #{mapping.size}
         | 
| 98 | 
            +
             | 
| 99 | 
            +
            static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
         | 
| 100 | 
            +
                C
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                f.puts '};'
         | 
| 105 | 
            +
              end
         | 
| 106 | 
            +
             | 
| 107 | 
            +
              File.unlink(src_path)
         | 
| 108 | 
            +
            end
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            desc 'Run all IPS benchmarks'
         | 
| 111 | 
            +
            task :benchmark do
         | 
| 112 | 
            +
              Dir['./benchmarks/*.rb'].sort.each { |file| require file }
         | 
| 113 | 
            +
            end
         | 
| 114 | 
            +
             | 
| 115 | 
            +
            namespace :benchmark do
              desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
              task :write_to_file do
                # Initialise the global result store (caption => comparison text)
                # before the benchmarks run; it is read back below.
                $store_comparison_results = {}

                Rake.application[:benchmark].invoke

                File.open('BENCHMARK.md', 'w') do |out|
                  out.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''

                  $store_comparison_results.each do |caption, result|
                    # Cut everything after "same-ish" on each line and drop the
                    # first line of the captured comparison text.
                    trimmed = result.strip.gsub(/(same-ish).*$/, '\1')
                    body_lines = trimmed.lines[1..-1]

                    out.puts '```', caption, '', body_lines, '```'
                  end
                end
              end
            end
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            # Recompile before benchmarking or running specs, except on platforms
            # whose RUBY_PLATFORM string contains "java".
            if RUBY_PLATFORM !~ /java/
              %i[benchmark spec].each do |task_name|
                task(task_name).enhance([:compile])
              end
            end
         | 
    
        data/benchmarks/cover.rb
    ADDED
    
    | @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Case 1: Regexp#match? with /\S/ versus CharacterSet#cover? on the
            # inverted whitespace set, both applied to a short ASCII sentence.
            sentence = 'Lorem ipsum et dolorem'
            non_whitespace_rx = /\S/
            non_whitespace_set = CharacterSet.whitespace.inversion

            benchmark(
              caption: 'Detecting non-whitespace',
              cases: {
                'Regexp#match?'       => lambda { non_whitespace_rx.match?(sentence) },
                'CharacterSet#cover?' => lambda { non_whitespace_set.cover?(sentence) }
              }
            )

            # Case 2: a case-insensitive negated letter class versus a set built
            # from the A-Z and a-z ranges, applied to the same sentence.
            non_letter_rx = /[^a-z]/i
            letter_set = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z')

            benchmark(
              caption: 'Detecting non-letters',
              cases: {
                'Regexp#match?'       => lambda { non_letter_rx.match?(sentence) },
                'CharacterSet#cover?' => lambda { letter_set.cover?(sentence) }
              }
            )
         | 
| @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Case 1: strip whitespace from a sentence containing long runs of
            # spaces, via String#gsub and via CharacterSet#delete_in.
            str = 'Lorem     ipsum       et      dolorem'
            rx = /\s/
            cs = CharacterSet.whitespace

            benchmark(
              caption: 'Removing whitespace',
              cases: {
                'String#gsub'            => -> { str.gsub(rx, '') },
                'CharacterSet#delete_in' => -> { cs.delete_in(str) },
              }
            )

            # Case 2: strip whitespace, emoji and three umlauts from a non-ASCII
            # sentence.
            str = 'Lörem ipsüm ⛷ et dölörem'
            rx = /[\s\p{emoji}äüö]/
            # Use the full class name here: the `CS` shorthand is only assigned in
            # bin/console, so it is not available when this script is run through
            # the benchmark rake task.
            cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ü', 'ö']

            benchmark(
              caption: 'Removing whitespace, emoji and umlauts',
              cases: {
                'String#gsub'            => -> { str.gsub(rx, '') },
                'CharacterSet#delete_in' => -> { cs.delete_in(str) },
              }
            )
         | 
| @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Case 1: drop every non-whitespace character from a plain sentence,
            # via String#gsub with /\S/ and via CharacterSet#keep_in.
            plain_text = 'Lorem ipsum et dolorem'
            non_whitespace_rx = /\S/
            whitespace_set = CharacterSet.whitespace

            benchmark(
              caption: 'Removing non-whitespace',
              cases: {
                'String#gsub'          => lambda { plain_text.gsub(non_whitespace_rx, '') },
                'CharacterSet#keep_in' => lambda { whitespace_set.keep_in(plain_text) }
              }
            )

            # Case 2: same comparison, this time with the emoji property/set on a
            # sentence that contains a single emoji.
            emoji_text = 'Lorem ipsum ⛷ et dolorem'
            non_emoji_rx = /\p{^emoji}/
            emoji_set = CharacterSet.emoji

            benchmark(
              caption: 'Extracting emoji',
              cases: {
                'String#gsub'          => lambda { emoji_text.gsub(non_emoji_rx, '') },
                'CharacterSet#keep_in' => lambda { emoji_set.keep_in(emoji_text) }
              }
            )
         | 
| @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            lib = File.expand_path('../lib', __dir__)
         | 
| 2 | 
            +
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            require 'benchmark/ips'
         | 
| 5 | 
            +
            require 'character_set'
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Runs one IPS benchmark: prints the caption, reports every case and
            # triggers the comparison.
            #
            # When the global `$store_comparison_results` is set, the comparison is
            # run a second time with `$stdout` redirected into a StringIO and the
            # captured text is stored under `caption` in that global.
            #
            # @param caption [String, nil] heading printed before the benchmark and
            #   used as the key for the stored comparison text
            # @param cases [Hash] label => callable pairs; each callable is handed to
            #   `x.report` as its block
            def benchmark(caption: nil, cases: {})
              puts caption

              report = Benchmark.ips do |x|
                cases.each do |label, callable|
                  x.report(label, &callable)
                end
                x.compare!
              end

              return unless $store_comparison_results

              # Keep a reference to the very same IO object (no clone), so exactly
              # that object is put back afterwards.
              original_stdout = $stdout
              captured_stdout = StringIO.new
              begin
                $stdout = captured_stdout
                report.run_comparison
              ensure
                # Always restore $stdout, even if the comparison raises; otherwise
                # all later output would silently end up in the StringIO.
                $stdout = original_stdout
              end

              $store_comparison_results[caption] = captured_stdout.string
            end
         | 
| @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Case 1: Regexp#match? with /\s/ versus CharacterSet#used_by? with the
            # whitespace set on a short sentence.
            short_text = 'Lorem ipsum et dolorem'
            whitespace_rx = /\s/
            whitespace_set = CharacterSet.whitespace

            benchmark(
              caption: 'Detecting whitespace',
              cases: {
                'Regexp#match?'         => lambda { whitespace_rx.match?(short_text) },
                'CharacterSet#used_by?' => lambda { whitespace_set.used_by?(short_text) }
              }
            )

            # Case 2: a single emoji placed between two 20-fold repetitions of the
            # sentence.
            filler = 'Lorem ipsum et dolorem' * 20
            long_text = filler + '⛷' + filler
            emoji_rx = /\p{emoji}/
            emoji_set = CharacterSet.emoji

            benchmark(
              caption: 'Detecting emoji in a large string',
              cases: {
                'Regexp#match?'         => lambda { emoji_rx.match?(long_text) },
                'CharacterSet#used_by?' => lambda { emoji_set.used_by?(long_text) }
              }
            )
         | 
    
        data/bin/console
    ADDED
    
    | @@ -0,0 +1,19 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'bundler/setup'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            require 'character_set'
         | 
| 6 | 
            +
            require 'character_set/core_ext'
         | 
| 7 | 
            +
            require 'character_set/pure'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            require 'regexp_property_values'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            # Short constant aliases for use inside the console session.
            CS = CharacterSet            # main class
            CP = CharacterSet::Pure      # loaded via 'character_set/pure'
            PV = RegexpPropertyValues    # loaded via 'regexp_property_values'
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            require 'benchmark'
         | 
| 16 | 
            +
            # Console shortcut: passes the given block to Benchmark.measure and
            # returns its result.
            def m(&block)
              Benchmark.measure(&block)
            end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            require "irb"
         | 
| 19 | 
            +
            IRB.start(__FILE__)
         | 
    
        data/bin/setup
    ADDED
    
    
| @@ -0,0 +1,34 @@ | |
| 1 | 
            +
            lib = File.expand_path('../lib', __FILE__)
         | 
| 2 | 
            +
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            require 'character_set/version'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Gem specification: metadata, packaged files, the C extension entry
            # point and runtime/development dependencies.
            Gem::Specification.new do |spec|
              spec.name     = 'character_set'
              spec.version  = CharacterSet::VERSION
              spec.authors  = ['Janosch Müller']
              spec.email    = ['janosch84@gmail.com']

              spec.summary  = 'Build, read, write and compare sets of Unicode codepoints.'
              spec.homepage = 'https://github.com/janosch-x/character_set'
              spec.license  = 'MIT'

              # Package every git-tracked file except those under test/, spec/ or
              # features/.
              tracked_files = `git ls-files -z`.split("\x0")
              spec.files = tracked_files.reject do |path|
                path.match(%r{^(test|spec|features)/})
              end
              spec.require_paths = ['lib']

              spec.extensions = %w[ext/character_set/extconf.rb]

              spec.required_ruby_version = '>= 2.1.0'

              spec.add_dependency 'range_compressor', '~> 1.0'

              # Development-only dependencies, registered in the listed order.
              {
                'benchmark-ips'          => '~> 2.7',
                'bundler'                => '~> 1.16',
                'rake'                   => '~> 10.0',
                'rake-compiler'          => '~> 1.0',
                'regexp_parser'          => '~> 1.0',
                'regexp_property_values' => '~> 0.3.2',
                'rspec'                  => '~> 3.0'
              }.each do |gem_name, requirement|
                spec.add_development_dependency gem_name, requirement
              end
            end
         |