character_set 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.travis.yml +1 -0
- data/BENCHMARK.md +51 -15
- data/CHANGELOG.md +20 -0
- data/README.md +24 -8
- data/Rakefile +20 -18
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +1 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +2 -0
- data/ext/character_set/character_set.c +963 -413
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/core_ext/string_ext.rb +2 -0
- data/lib/character_set/expression_converter.rb +21 -24
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +0 -2
- data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
- data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
- data/lib/character_set/shared_methods.rb +51 -40
- data/lib/character_set/version.rb +1 -1
- metadata +54 -3
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: ae7ec84b0727a804bf4d82564e6609fdd0bf070fd0e20c0a5688b579e320bc30
         | 
| 4 | 
            +
              data.tar.gz: b73dec9fbd4abf83fae5881de89e4e1876e48bcefc3ef935401d5adbeb9c6c8e
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 2b84916c89dcd6a234cc5acedfc604f664a9e285c92b3bae6bade748ad3d9c275fb3307fb5721142e52dbedc9b16da65285a8ebd87cd686b55391f222ef1b4f8
         | 
| 7 | 
            +
              data.tar.gz: 25147010da0adfd869891d50d51e265c2b4f28e1b0cb70727d9784b11c3944b9a06a9844a2068f529e487028c214f44e2ab60271a9a5730cdd40bb04dd989aaf
         | 
    
        data/.gitattributes
    ADDED
    
    
    
        data/.travis.yml
    CHANGED
    
    
    
        data/BENCHMARK.md
    CHANGED
    
    | @@ -1,46 +1,58 @@ | |
| 1 | 
            -
            Results of `rake:benchmark` on ruby 2.6. | 
| 1 | 
            +
            Results of `rake:benchmark` on ruby 2.6.2p47 (2019-03-13 revision 67232) [x86_64-darwin18]
         | 
| 2 2 |  | 
| 3 | 
            +
            ```
         | 
| 4 | 
            +
            Counting non-letters
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            CharacterSet#count_in: 12253693.8 i/s
         | 
| 7 | 
            +
                    String#count:  1737741.7 i/s - 7.05x  slower
         | 
| 8 | 
            +
            ```
         | 
| 3 9 | 
             
            ```
         | 
| 4 10 | 
             
            Detecting non-whitespace
         | 
| 5 11 |  | 
| 6 | 
            -
             CharacterSet#cover?:  | 
| 7 | 
            -
                   Regexp#match?:   | 
| 12 | 
            +
             CharacterSet#cover?: 14058351.9 i/s
         | 
| 13 | 
            +
                   Regexp#match?:  7907608.1 i/s - 1.78x  slower
         | 
| 8 14 | 
             
            ```
         | 
| 9 15 | 
             
            ```
         | 
| 10 16 | 
             
            Detecting non-letters
         | 
| 11 17 |  | 
| 12 | 
            -
             CharacterSet#cover?:  | 
| 13 | 
            -
                   Regexp#match?:   | 
| 18 | 
            +
             CharacterSet#cover?: 13341301.6 i/s
         | 
| 19 | 
            +
                   Regexp#match?:  5187453.3 i/s - 2.57x  slower
         | 
| 14 20 | 
             
            ```
         | 
| 15 21 | 
             
            ```
         | 
| 16 22 | 
             
            Removing whitespace
         | 
| 17 23 |  | 
| 18 | 
            -
            CharacterSet#delete_in: | 
| 19 | 
            -
                     String#gsub:    | 
| 24 | 
            +
            CharacterSet#delete_in:  2523184.0 i/s
         | 
| 25 | 
            +
                     String#gsub:   225804.7 i/s - 11.17x  slower
         | 
| 20 26 | 
             
            ```
         | 
| 21 27 | 
             
            ```
         | 
| 22 28 | 
             
            Removing whitespace, emoji and umlauts
         | 
| 23 29 |  | 
| 24 | 
            -
            CharacterSet#delete_in: | 
| 25 | 
            -
                     String#gsub:    | 
| 30 | 
            +
            CharacterSet#delete_in:  1712208.6 i/s
         | 
| 31 | 
            +
                     String#gsub:   278508.8 i/s - 6.15x  slower
         | 
| 26 32 | 
             
            ```
         | 
| 27 33 | 
             
            ```
         | 
| 28 34 | 
             
            Removing non-whitespace
         | 
| 29 35 |  | 
| 30 | 
            -
            CharacterSet#keep_in:   | 
| 31 | 
            -
                     String#gsub:    | 
| 36 | 
            +
            CharacterSet#keep_in:  2760158.1 i/s
         | 
| 37 | 
            +
                     String#gsub:   232797.7 i/s - 11.86x  slower
         | 
| 32 38 | 
             
            ```
         | 
| 33 39 | 
             
            ```
         | 
| 34 40 | 
             
            Extracting emoji
         | 
| 35 41 |  | 
| 36 | 
            -
            CharacterSet#keep_in:   | 
| 37 | 
            -
                     String#gsub:    | 
| 42 | 
            +
            CharacterSet#keep_in:  1775758.8 i/s
         | 
| 43 | 
            +
                     String#gsub:   217649.9 i/s - 8.16x  slower
         | 
| 44 | 
            +
            ```
         | 
| 45 | 
            +
            ```
         | 
| 46 | 
            +
            Extracting emoji to an Array
         | 
| 47 | 
            +
             | 
| 48 | 
            +
               CharacterSet#scan:  2579030.8 i/s
         | 
| 49 | 
            +
                     String#scan:   545107.0 i/s - 4.73x  slower
         | 
| 38 50 | 
             
            ```
         | 
| 39 51 | 
             
            ```
         | 
| 40 52 | 
             
            Detecting whitespace
         | 
| 41 53 |  | 
| 42 | 
            -
            CharacterSet#used_by?:  | 
| 43 | 
            -
                   Regexp#match?:   | 
| 54 | 
            +
            CharacterSet#used_by?: 13847689.0 i/s
         | 
| 55 | 
            +
                   Regexp#match?:  7533275.2 i/s - 1.84x  slower
         | 
| 44 56 | 
             
            ```
         | 
| 45 57 | 
             
            ```
         | 
| 46 58 | 
             
            Detecting emoji in a large string
         | 
| @@ -48,3 +60,27 @@ Detecting emoji in a large string | |
| 48 60 | 
             
            CharacterSet#used_by?:   246527.7 i/s
         | 
| 49 61 | 
             
                   Regexp#match?:    92956.5 i/s - 2.65x  slower
         | 
| 50 62 | 
             
            ```
         | 
| 63 | 
            +
            ```
         | 
| 64 | 
            +
            Adding entries
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                CharacterSet#add:  3102081.7 i/s
         | 
| 67 | 
            +
                   SortedSet#add:  1897464.8 i/s - 1.63x  slower
         | 
| 68 | 
            +
            ```
         | 
| 69 | 
            +
            ```
         | 
| 70 | 
            +
            Removing entries
         | 
| 71 | 
            +
             | 
| 72 | 
            +
             CharacterSet#delete:  3240924.1 i/s
         | 
| 73 | 
            +
                SortedSet#delete:  2887493.9 i/s - 1.12x  slower
         | 
| 74 | 
            +
            ```
         | 
| 75 | 
            +
            ```
         | 
| 76 | 
            +
            Merging entries
         | 
| 77 | 
            +
             | 
| 78 | 
            +
              CharacterSet#merge:      536.8 i/s
         | 
| 79 | 
            +
                 SortedSet#merge:       12.5 i/s - 42.78x  slower
         | 
| 80 | 
            +
            ```
         | 
| 81 | 
            +
            ```
         | 
| 82 | 
            +
            Getting the min and max
         | 
| 83 | 
            +
             | 
| 84 | 
            +
             CharacterSet#minmax:  4111960.8 i/s
         | 
| 85 | 
            +
                SortedSet#minmax:      756.4 i/s - 5436.39x  slower
         | 
| 86 | 
            +
            ```
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    | @@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file. | |
| 4 4 | 
             
            The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
         | 
| 5 5 | 
             
            and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
         | 
| 6 6 |  | 
| 7 | 
            +
            ## UNRELEASED
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            ## [1.3.0] - 2019-04-26
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            ### Added
         | 
| 12 | 
            +
            - improved `String` manipulation speed
         | 
| 13 | 
            +
            - improved initialization and `#merge` speed when passing a large `Range`
         | 
| 14 | 
            +
            - reduced memory consumption by > 90% for most use cases via dynamic resizing
         | 
| 15 | 
            +
              - before, every set instance required 136 KB for codepoints
         | 
| 16 | 
            +
              - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
         | 
| 17 | 
            +
            - `#count_in` and `#scan_in` methods for `String` interaction
         | 
| 18 | 
            +
            - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
         | 
| 19 | 
            +
            - conversion methods `#assigned_part`, `#valid_part`
         | 
| 20 | 
            +
            - sectioning methods `#ascii_part`, `#plane(n)`
         | 
| 21 | 
            +
            - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ### Fixed
         | 
| 24 | 
            +
            - `#count` now supports passing an argument or block as usual
         | 
| 25 | 
            +
            - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
         | 
| 26 | 
            +
             | 
| 7 27 | 
             
            ## [1.2.0] - 2019-04-02
         | 
| 8 28 |  | 
| 9 29 | 
             
            ### Added
         | 
    
        data/README.md
    CHANGED
    
    | @@ -2,8 +2,11 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            [Gem Version](http://badge.fury.io/rb/character_set)
         | 
| 4 4 | 
             
            [Build Status](https://travis-ci.org/jaynetics/character_set)
         | 
| 5 | 
            +
            [codecov](https://codecov.io/gh/jaynetics/character_set)
         | 
| 5 6 |  | 
| 6 | 
            -
             | 
| 7 | 
            +
            This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
         | 
| 7 10 |  | 
| 8 11 | 
             
            Many parts can be used independently, e.g.:
         | 
| 9 12 | 
             
            - `CharacterSet::Character`
         | 
| @@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext' | |
| 49 52 |  | 
| 50 53 | 
             
            ### Predefined utility sets
         | 
| 51 54 |  | 
| 52 | 
            -
            `ascii`, `ascii_alnum`, ` | 
| 55 | 
            +
            `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
         | 
| 53 56 |  | 
| 54 57 | 
             
            ```ruby
         | 
| 55 58 | 
             
            CharacterSet.ascii # => #<CharacterSet (size: 128)>
         | 
| @@ -60,7 +63,7 @@ CharacterSet.non_ascii | |
| 60 63 |  | 
| 61 64 | 
             
            ### Interact with Strings
         | 
| 62 65 |  | 
| 63 | 
            -
            CharacterSet can replace some ` | 
| 66 | 
            +
            `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
         | 
| 64 67 |  | 
| 65 68 | 
             
            `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
         | 
| 66 69 |  | 
| @@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true | |
| 71 74 | 
             
            ```
         | 
| 72 75 |  | 
| 73 76 | 
             
            `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
         | 
| 77 | 
            +
             | 
| 74 78 | 
             
            ```ruby
         | 
| 75 79 | 
             
            string = 'Tüür'
         | 
| 76 80 |  | 
| @@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => '' | |
| 84 88 | 
             
            string # => ''
         | 
| 85 89 | 
             
            ```
         | 
| 86 90 |  | 
| 91 | 
            +
            `#count_in` and `#scan` can replace `String#count` and `String#scan`:
         | 
| 92 | 
            +
             | 
| 93 | 
            +
            ```ruby
         | 
| 94 | 
            +
            CharacterSet.non_ascii.count_in('Tüür') # => 2
         | 
| 95 | 
            +
            CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
         | 
| 96 | 
            +
            ```
         | 
| 97 | 
            +
             | 
| 87 98 | 
             
            There is also a core extension for String interaction.
         | 
| 88 99 | 
             
            ```ruby
         | 
| 89 100 | 
             
            require 'character_set/core_ext/string_ext'
         | 
| @@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext' | |
| 100 111 |  | 
| 101 112 | 
             
            ### Manipulate
         | 
| 102 113 |  | 
| 103 | 
            -
            Use any  | 
| 114 | 
            +
            Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
         | 
| 104 115 |  | 
| 105 116 | 
             
            Where appropriate, methods take both chars and codepoints, e.g.:
         | 
| 106 117 |  | 
| @@ -122,13 +133,13 @@ non_a.include?('ü') # => true | |
| 122 133 |  | 
| 123 134 | 
             
            # surrogate pair halves are not included by default
         | 
| 124 135 | 
             
            CharacterSet['a'].inversion(include_surrogates: true)
         | 
| 125 | 
            -
            # => #<CharacterSet (size:  | 
| 136 | 
            +
            # => #<CharacterSet (size: 1114112)>
         | 
| 126 137 | 
             
            ```
         | 
| 127 138 |  | 
| 128 139 | 
             
            `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
         | 
| 129 140 |  | 
| 130 141 | 
             
            ```ruby
         | 
| 131 | 
            -
            CharacterSet['1', ' | 
| 142 | 
            +
            CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
         | 
| 132 143 | 
             
            ```
         | 
| 133 144 |  | 
| 134 145 | 
             
            ### Write
         | 
| @@ -157,17 +168,22 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>" | |
| 157 168 | 
             
            set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
         | 
| 158 169 |  | 
| 159 170 | 
             
            # for full js regex compatibility in case of astral members:
         | 
| 160 | 
            -
            set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
         | 
| 171 | 
            +
            set.to_s_with_surrogate_alternation # => '(?:[a-c\u0258]|\ud83e\udd29)'
         | 
| 161 172 | 
             
            ```
         | 
| 162 173 |  | 
| 163 174 | 
             
            ### Unicode plane methods
         | 
| 164 175 |  | 
| 165 | 
            -
            There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
         | 
| 176 | 
            +
            There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
         | 
| 166 177 | 
             
            ```Ruby
         | 
| 178 | 
            +
            CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
         | 
| 179 | 
            +
            CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
         | 
| 180 | 
            +
            CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
         | 
| 181 | 
            +
            CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
         | 
| 167 182 | 
             
            CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
         | 
| 168 183 | 
             
            CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
         | 
| 169 184 | 
             
            CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
         | 
| 170 185 | 
             
            CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
         | 
| 186 | 
            +
            CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
         | 
| 171 187 | 
             
            CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
         | 
| 172 188 | 
             
            CharacterSet::Character.new('a').plane # => 0
         | 
| 173 189 | 
             
            ```
         | 
    
        data/Rakefile
    CHANGED
    
    | @@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec) | |
| 7 7 |  | 
| 8 8 | 
             
            task default: :spec
         | 
| 9 9 |  | 
| 10 | 
            +
            namespace :spec do
         | 
| 11 | 
            +
              task :quick do
         | 
| 12 | 
            +
                ENV['SKIP_MEMSAFETY_SPECS'] = '1'
         | 
| 13 | 
            +
                Rake::Task[:spec].invoke
         | 
| 14 | 
            +
              end
         | 
| 15 | 
            +
            end
         | 
| 16 | 
            +
             | 
| 10 17 | 
             
            Rake::ExtensionTask.new('character_set') do |ext|
         | 
| 11 18 | 
             
              ext.lib_dir = 'lib/character_set'
         | 
| 12 19 | 
             
            end
         | 
| @@ -106,27 +113,22 @@ task :sync_casefold_data do | |
| 106 113 | 
             
                hash[from] = to if type == 'C'
         | 
| 107 114 | 
             
              end.sort
         | 
| 108 115 |  | 
| 109 | 
            -
              File. | 
| 110 | 
            -
                 | 
| 111 | 
            -
             | 
| 112 | 
            -
            // -*-C-*-
         | 
| 113 | 
            -
             | 
| 114 | 
            -
            typedef struct casefold_mapping {
         | 
| 115 | 
            -
              unsigned long from;
         | 
| 116 | 
            -
              unsigned long to;
         | 
| 117 | 
            -
            } casefold_mapping;
         | 
| 118 | 
            -
             | 
| 119 | 
            -
            #define CASEFOLD_COUNT #{mapping.size}
         | 
| 116 | 
            +
              content = File.read(dst_path + '.tmpl')
         | 
| 117 | 
            +
                .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
         | 
| 118 | 
            +
                .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
         | 
| 120 119 |  | 
| 121 | 
            -
             | 
| 122 | 
            -
             | 
| 123 | 
            -
             | 
| 124 | 
            -
                mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
         | 
| 120 | 
            +
              File.write(dst_path, content)
         | 
| 121 | 
            +
              File.unlink(src_path)
         | 
| 122 | 
            +
            end
         | 
| 125 123 |  | 
| 126 | 
            -
             | 
| 124 | 
            +
            desc 'Update codepoint data for predefined sets, based on Onigmo'
         | 
| 125 | 
            +
            task :sync_predefined_sets do
         | 
| 126 | 
            +
              %w[assigned emoji whitespace].each do |prop|
         | 
| 127 | 
            +
                require 'regexp_property_values'
         | 
| 128 | 
            +
                ranges = RegexpPropertyValues[prop].matched_ranges
         | 
| 129 | 
            +
                str = ranges.map { |r| r.minmax.map { |n| n.to_s(16) }.join(',').upcase + "\n" }.join
         | 
| 130 | 
            +
                File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
         | 
| 127 131 | 
             
              end
         | 
| 128 | 
            -
             | 
| 129 | 
            -
              File.unlink(src_path)
         | 
| 130 132 | 
             
            end
         | 
| 131 133 |  | 
| 132 134 | 
             
            desc 'Run all IPS benchmarks'
         | 
    
        data/benchmarks/count_in.rb
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            str = 'Lorem ipsum et dolorem'
         | 
| 4 | 
            +
            tr = '^A-Za-z'
         | 
| 5 | 
            +
            cs = CharacterSet.non_ascii_letter
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            benchmark(
         | 
| 8 | 
            +
              caption: 'Counting non-letters',
         | 
| 9 | 
            +
              cases: {
         | 
| 10 | 
            +
                'String#count'          => -> { str.count(tr) },
         | 
| 11 | 
            +
                'CharacterSet#count_in' => -> { cs.count_in(str) },
         | 
| 12 | 
            +
              }
         | 
| 13 | 
            +
            )
         | 
    
        data/benchmarks/delete_in.rb
    CHANGED
    
    | @@ -14,7 +14,7 @@ benchmark( | |
| 14 14 |  | 
| 15 15 | 
             
            str = 'Lörem ipsüm ⛷ et dölörem'
         | 
| 16 16 | 
             
            rx = /[\s\p{emoji}äüö]/
         | 
| 17 | 
            -
            cs = CharacterSet.whitespace + CharacterSet.emoji +  | 
| 17 | 
            +
            cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
         | 
| 18 18 |  | 
| 19 19 | 
             
            benchmark(
         | 
| 20 20 | 
             
              caption: 'Removing whitespace, emoji and umlauts',
         | 
    
        data/benchmarks/scan.rb
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            str = 'Lorem ipsum ⛷ et dolorem'
         | 
| 4 | 
            +
            rx = /\p{emoji}/
         | 
| 5 | 
            +
            cs = CharacterSet.emoji
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            benchmark(
         | 
| 8 | 
            +
              caption: 'Extracting emoji to an Array',
         | 
| 9 | 
            +
              cases: {
         | 
| 10 | 
            +
                'String#scan'       => -> { str.scan(rx) },
         | 
| 11 | 
            +
                'CharacterSet#scan' => -> { cs.scan(str) },
         | 
| 12 | 
            +
              }
         | 
| 13 | 
            +
            )
         | 
    
        data/benchmarks/shared.rb
    CHANGED
    
    
    
        data/benchmarks/z_add.rb
    ADDED
    
    
        data/benchmarks/z_delete.rb
    ADDED
    
    | @@ -0,0 +1,12 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            cs = CharacterSet.new(0..0x10FFFF)
         | 
| 4 | 
            +
            ss = SortedSet.new(0..0x10FFFF)
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            benchmark(
         | 
| 7 | 
            +
              caption: 'Removing entries',
         | 
| 8 | 
            +
              cases: {
         | 
| 9 | 
            +
                'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
         | 
| 10 | 
            +
                'SortedSet#delete'    => -> { ss.delete(rand(0x10FFFF)) },
         | 
| 11 | 
            +
              }
         | 
| 12 | 
            +
            )
         | 
    
        data/benchmarks/z_merge.rb
    ADDED
    
    | @@ -0,0 +1,15 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            cs1 = CharacterSet.new(0...0x88000)
         | 
| 4 | 
            +
            cs2 = CharacterSet.new(0x88000..0x10FFFF)
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            ss1 = SortedSet.new(0...0x88000)
         | 
| 7 | 
            +
            ss2 = SortedSet.new(0x88000..0x10FFFF)
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            benchmark(
         | 
| 10 | 
            +
              caption: 'Merging entries',
         | 
| 11 | 
            +
              cases: {
         | 
| 12 | 
            +
                'CharacterSet#merge' => -> { cs1.merge(cs2) },
         | 
| 13 | 
            +
                'SortedSet#merge'    => -> { ss1.merge(ss2) },
         | 
| 14 | 
            +
              }
         | 
| 15 | 
            +
            )
         | 
    
        data/benchmarks/z_minmax.rb
    ADDED
    
    | @@ -0,0 +1,12 @@ | |
| 1 | 
            +
            require_relative './shared'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            cs = CharacterSet.new(0..0xFFFF)
         | 
| 4 | 
            +
            ss = SortedSet.new(0..0xFFFF)
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            benchmark(
         | 
| 7 | 
            +
              caption: 'Getting the min and max',
         | 
| 8 | 
            +
              cases: {
         | 
| 9 | 
            +
                'CharacterSet#minmax' => -> { cs.minmax },
         | 
| 10 | 
            +
                'SortedSet#minmax'    => -> { ss.minmax },
         | 
| 11 | 
            +
              }
         | 
| 12 | 
            +
            )
         | 
    
        data/bin/console
    CHANGED
    
    
    
        data/character_set.gemspec
    CHANGED
    
    | @@ -23,6 +23,8 @@ Gem::Specification.new do |s| | |
| 23 23 | 
             
              s.required_ruby_version = '>= 2.1.0'
         | 
| 24 24 |  | 
| 25 25 | 
             
              s.add_development_dependency 'benchmark-ips', '~> 2.7'
         | 
| 26 | 
            +
              s.add_development_dependency 'codecov', '~> 0.1'
         | 
| 27 | 
            +
              s.add_development_dependency 'get_process_mem', '~> 0.2.3'
         | 
| 26 28 | 
             
              s.add_development_dependency 'rake', '~> 12.0'
         | 
| 27 29 | 
             
              s.add_development_dependency 'rake-compiler', '~> 1.0'
         | 
| 28 30 | 
             
              s.add_development_dependency 'range_compressor', '~> 1.0'
         | 
    
        data/ext/character_set/character_set.c
    CHANGED
    
    | @@ -2,81 +2,180 @@ | |
| 2 2 | 
             
            #include "ruby/encoding.h"
         | 
| 3 3 | 
             
            #include "unicode_casefold_table.h"
         | 
| 4 4 |  | 
| 5 | 
            -
            #define  | 
| 6 | 
            -
            #define  | 
| 7 | 
            -
            #define  | 
| 5 | 
            +
            #define UNICODE_PLANE_SIZE 0x10000
         | 
| 6 | 
            +
            #define UNICODE_PLANE_COUNT 17
         | 
| 7 | 
            +
            #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
         | 
| 8 8 |  | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 9 | 
            +
            // start at ascii size
         | 
| 10 | 
            +
            #define CS_DEFAULT_INITIAL_LEN 128
         | 
| 11 11 |  | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 12 | 
            +
            typedef char cs_ar;
         | 
| 13 | 
            +
            typedef unsigned long cs_cp;
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            struct cs_data
         | 
| 16 | 
            +
            {
         | 
| 17 | 
            +
              cs_ar *cps;
         | 
| 18 | 
            +
              cs_cp len;
         | 
| 19 | 
            +
            };
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            static inline void
         | 
| 24 | 
            +
            add_memspace_for_another_plane(struct cs_data *data)
         | 
| 25 | 
            +
            {
         | 
| 26 | 
            +
              data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
         | 
| 27 | 
            +
              memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
         | 
| 28 | 
            +
              data->len += UNICODE_PLANE_SIZE;
         | 
| 29 | 
            +
            }
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            static inline void
         | 
| 32 | 
            +
            ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
         | 
| 33 | 
            +
            {
         | 
| 34 | 
            +
              while (target_cp >= data->len)
         | 
| 35 | 
            +
              {
         | 
| 36 | 
            +
                add_memspace_for_another_plane(data);
         | 
| 37 | 
            +
              }
         | 
| 38 | 
            +
            }
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            static inline void
         | 
| 41 | 
            +
            set_cp(struct cs_data *data, cs_cp cp)
         | 
| 42 | 
            +
            {
         | 
| 43 | 
            +
              ensure_memsize_fits(data, cp);
         | 
| 44 | 
            +
              data->cps[cp >> 3] |= (1 << (cp & 0x07));
         | 
| 45 | 
            +
            }
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            static inline int
         | 
| 48 | 
            +
            tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
         | 
| 49 | 
            +
            {
         | 
| 50 | 
            +
              return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
         | 
| 51 | 
            +
            }
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            static inline void
         | 
| 54 | 
            +
            clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
         | 
| 55 | 
            +
            {
         | 
| 56 | 
            +
              if (cp < len)
         | 
| 57 | 
            +
              {
         | 
| 58 | 
            +
                cps[cp >> 3] &= ~(1 << (cp & 0x07));
         | 
| 59 | 
            +
              }
         | 
| 60 | 
            +
            }
         | 
| 16 61 |  | 
| 17 62 | 
             
            static void
         | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 63 | 
            +
            cs_free(void *ptr)
         | 
| 64 | 
            +
            {
         | 
| 65 | 
            +
              struct cs_data *data = ptr;
         | 
| 66 | 
            +
              ruby_xfree(data->cps);
         | 
| 67 | 
            +
              ruby_xfree(data);
         | 
| 20 68 | 
             
            }
         | 
| 21 69 |  | 
| 22 70 | 
             
            static size_t
         | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
                . | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 71 | 
            +
            cs_memsize(const void *ptr)
         | 
| 72 | 
            +
            {
         | 
| 73 | 
            +
              const struct cs_data *data = ptr;
         | 
| 74 | 
            +
              return sizeof(*data) + CS_MSIZE(data->len);
         | 
| 75 | 
            +
            }
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            static const rb_data_type_t cs_type = {
         | 
| 78 | 
            +
                .wrap_struct_name = "character_set",
         | 
| 79 | 
            +
                .function = {
         | 
| 80 | 
            +
                    .dmark = NULL,
         | 
| 81 | 
            +
                    .dfree = cs_free,
         | 
| 82 | 
            +
                    .dsize = cs_memsize,
         | 
| 83 | 
            +
                },
         | 
| 84 | 
            +
                .data = NULL,
         | 
| 85 | 
            +
                .flags = RUBY_TYPED_FREE_IMMEDIATELY,
         | 
| 37 86 | 
             
            };
         | 
| 38 87 |  | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 88 | 
            +
            static inline VALUE
         | 
| 89 | 
            +
            cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
         | 
| 90 | 
            +
            {
         | 
| 91 | 
            +
              VALUE cs;
         | 
| 92 | 
            +
              struct cs_data *data;
         | 
| 93 | 
            +
              cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
         | 
| 94 | 
            +
              data->cps = ruby_xmalloc(CS_MSIZE(len));
         | 
| 95 | 
            +
              memset(data->cps, 0, CS_MSIZE(len));
         | 
| 96 | 
            +
              data->len = len;
         | 
| 97 | 
            +
             | 
| 98 | 
            +
              if (data_ptr)
         | 
| 99 | 
            +
              {
         | 
| 100 | 
            +
                *data_ptr = data;
         | 
| 101 | 
            +
              }
         | 
| 41 102 |  | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 103 | 
            +
              return cs;
         | 
| 104 | 
            +
            }
         | 
| 44 105 |  | 
| 45 | 
            -
            static VALUE
         | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
               | 
| 49 | 
            -
              return NEW_CHARACTER_SET(self, cp_arr);
         | 
| 106 | 
            +
            static inline VALUE
         | 
| 107 | 
            +
            cs_alloc(VALUE klass, struct cs_data **data_ptr)
         | 
| 108 | 
            +
            {
         | 
| 109 | 
            +
              return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
         | 
| 50 110 | 
             
            }
         | 
| 51 111 |  | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
               | 
| 56 | 
            -
               | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 112 | 
            +
            static inline struct cs_data *
         | 
| 113 | 
            +
            cs_fetch_data(VALUE cs)
         | 
| 114 | 
            +
            {
         | 
| 115 | 
            +
              struct cs_data *data;
         | 
| 116 | 
            +
              TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
         | 
| 117 | 
            +
              return data;
         | 
| 118 | 
            +
            }
         | 
| 119 | 
            +
             | 
| 120 | 
            +
            static inline cs_ar *
         | 
| 121 | 
            +
            cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
         | 
| 122 | 
            +
            {
         | 
| 123 | 
            +
              struct cs_data *data;
         | 
| 124 | 
            +
              data = cs_fetch_data(cs);
         | 
| 125 | 
            +
              *len_ptr = data->len;
         | 
| 126 | 
            +
              return data->cps;
         | 
| 127 | 
            +
            }
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            static VALUE
         | 
| 130 | 
            +
            cs_method_allocate(VALUE self)
         | 
| 131 | 
            +
            {
         | 
| 132 | 
            +
              return cs_alloc(self, 0);
         | 
| 133 | 
            +
            }
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            // Runs `action` once for every active codepoint of `self`, in
            // ascending order. Inside `action` the names `cp` (current
            // codepoint), `cps` (bit array) and `len` (its length) are in scope.
            //
            // `cps` and `len` are reloaded from the set's data on every
            // iteration: an `action` that calls back into Ruby may add
            // codepoints to `self`, which can reallocate the bit array and
            // would leave a pointer cached before the loop dangling.
            #define FOR_EACH_ACTIVE_CODEPOINT(action)                   \
              do                                                        \
              {                                                         \
                cs_cp cp, len;                                          \
                cs_ar *cps;                                             \
                struct cs_data *cs_each_data;                           \
                cs_each_data = cs_fetch_data(self);                     \
                for (cp = 0; cp < cs_each_data->len; cp++)              \
                {                                                       \
                  cps = cs_each_data->cps;                              \
                  len = cs_each_data->len;                              \
                  if (tst_cp(cps, len, cp))                             \
                  {                                                     \
                    action;                                             \
                  }                                                     \
                }                                                       \
              } while (0)
         | 
| 59 149 |  | 
| 60 150 | 
             
            // ***************************
         | 
| 61 151 | 
             
            // `Set` compatibility methods
         | 
| 62 152 | 
             
            // ***************************
         | 
| 63 153 |  | 
| 64 | 
            -
            static inline  | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 154 | 
            +
            static inline cs_cp
         | 
| 155 | 
            +
            cs_active_cp_count(VALUE self)
         | 
| 156 | 
            +
            {
         | 
| 157 | 
            +
              cs_cp count;
         | 
| 67 158 | 
             
              count = 0;
         | 
| 68 159 | 
             
              FOR_EACH_ACTIVE_CODEPOINT(count++);
         | 
| 69 | 
            -
              return  | 
| 160 | 
            +
              return count;
         | 
| 70 161 | 
             
            }
         | 
| 71 162 |  | 
| 72 163 | 
             
            static VALUE
         | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 164 | 
            +
            cs_method_length(VALUE self)
         | 
| 165 | 
            +
            {
         | 
| 166 | 
            +
              return LONG2FIX(cs_active_cp_count(self));
         | 
| 167 | 
            +
            }
         | 
| 168 | 
            +
             | 
| 169 | 
            +
            static inline VALUE
         | 
| 170 | 
            +
            cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
         | 
| 171 | 
            +
            {
         | 
| 172 | 
            +
              return LONG2FIX(cs_active_cp_count(self));
         | 
| 75 173 | 
             
            }
         | 
| 76 174 |  | 
| 77 175 | 
             
            static VALUE
         | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 176 | 
            +
            cs_method_each(VALUE self)
         | 
| 177 | 
            +
            {
         | 
| 178 | 
            +
              RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
         | 
| 80 179 | 
             
              FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
         | 
| 81 180 | 
             
              return self;
         | 
| 82 181 | 
             
            }
         | 
| @@ -84,16 +183,19 @@ method_each(VALUE self) { | |
| 84 183 | 
             
            // returns an Array of codepoint Integers by default.
         | 
| 85 184 | 
             
            // returns an Array of Strings of length 1 if passed `true`.
         | 
| 86 185 | 
             
            static VALUE
         | 
| 87 | 
            -
             | 
| 186 | 
            +
            cs_method_to_a(int argc, VALUE *argv, VALUE self)
         | 
| 187 | 
            +
            {
         | 
| 88 188 | 
             
              VALUE arr;
         | 
| 89 189 | 
             
              rb_encoding *enc;
         | 
| 90 190 | 
             
              rb_check_arity(argc, 0, 1);
         | 
| 91 191 |  | 
| 92 192 | 
             
              arr = rb_ary_new();
         | 
| 93 | 
            -
              if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) | 
| 193 | 
            +
              if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
         | 
| 194 | 
            +
              {
         | 
| 94 195 | 
             
                FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
         | 
| 95 196 | 
             
              }
         | 
| 96 | 
            -
              else | 
| 197 | 
            +
              else
         | 
| 198 | 
            +
              {
         | 
| 97 199 | 
             
                enc = rb_utf8_encoding();
         | 
| 98 200 | 
             
                FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
         | 
| 99 201 | 
             
              }
         | 
| @@ -102,302 +204,472 @@ method_to_a(int argc, VALUE *argv, VALUE self) { | |
| 102 204 | 
             
            }
         | 
| 103 205 |  | 
| 104 206 | 
             
            static VALUE
         | 
| 105 | 
            -
             | 
| 207 | 
            +
            cs_method_empty_p(VALUE self)
         | 
| 208 | 
            +
            {
         | 
| 106 209 | 
             
              FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
         | 
| 107 210 | 
             
              return Qtrue;
         | 
| 108 211 | 
             
            }
         | 
| 109 212 |  | 
| 110 213 | 
             
            static VALUE
         | 
| 111 | 
            -
             | 
| 112 | 
            -
             | 
| 113 | 
            -
               | 
| 114 | 
            -
               | 
| 214 | 
            +
            cs_method_hash(VALUE self)
         | 
| 215 | 
            +
            {
         | 
| 216 | 
            +
              cs_cp cp, len, hash, four_byte_value;
         | 
| 217 | 
            +
              cs_ar *cps;
         | 
| 218 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 115 219 |  | 
| 116 220 | 
             
              hash = 17;
         | 
| 117 | 
            -
              for (cp = 0; cp <  | 
| 118 | 
            -
             | 
| 119 | 
            -
             | 
| 221 | 
            +
              for (cp = 0; cp < len; cp++)
         | 
| 222 | 
            +
              {
         | 
| 223 | 
            +
                if (cp % 32 == 0)
         | 
| 224 | 
            +
                {
         | 
| 225 | 
            +
                  if (cp != 0)
         | 
| 226 | 
            +
                  {
         | 
| 227 | 
            +
                    hash = hash * 23 + four_byte_value;
         | 
| 228 | 
            +
                  }
         | 
| 120 229 | 
             
                  four_byte_value = 0;
         | 
| 121 230 | 
             
                }
         | 
| 122 | 
            -
                if ( | 
| 231 | 
            +
                if (tst_cp(cps, len, cp))
         | 
| 232 | 
            +
                {
         | 
| 233 | 
            +
                  four_byte_value++;
         | 
| 234 | 
            +
                }
         | 
| 123 235 | 
             
              }
         | 
| 124 236 |  | 
| 125 237 | 
             
              return LONG2FIX(hash);
         | 
| 126 238 | 
             
            }
         | 
| 127 239 |  | 
| 128 240 | 
             
            static inline VALUE
         | 
| 129 | 
            -
             | 
| 241 | 
            +
            cs_delete_if_block_result(VALUE self, int truthy)
         | 
| 242 | 
            +
            {
         | 
| 130 243 | 
             
              VALUE result;
         | 
| 131 244 | 
             
              rb_need_block();
         | 
| 132 245 | 
             
              rb_check_frozen(self);
         | 
| 133 246 | 
             
              FOR_EACH_ACTIVE_CODEPOINT(
         | 
| 134 | 
            -
             | 
| 135 | 
            -
             | 
| 136 | 
            -
              );
         | 
| 247 | 
            +
                  result = rb_yield(LONG2FIX(cp));
         | 
| 248 | 
            +
                  if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
         | 
| 137 249 | 
             
              return self;
         | 
| 138 250 | 
             
            }
         | 
| 139 251 |  | 
| 140 252 | 
             
            static VALUE
         | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 143 | 
            -
               | 
| 253 | 
            +
            cs_method_delete_if(VALUE self)
         | 
| 254 | 
            +
            {
         | 
| 255 | 
            +
              RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
         | 
| 256 | 
            +
              return cs_delete_if_block_result(self, 1);
         | 
| 144 257 | 
             
            }
         | 
| 145 258 |  | 
| 146 259 | 
             
            static VALUE
         | 
| 147 | 
            -
             | 
| 148 | 
            -
             | 
| 149 | 
            -
               | 
| 260 | 
            +
            cs_method_keep_if(VALUE self)
         | 
| 261 | 
            +
            {
         | 
| 262 | 
            +
              RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
         | 
| 263 | 
            +
              return cs_delete_if_block_result(self, 0);
         | 
| 150 264 | 
             
            }
         | 
| 151 265 |  | 
| 152 266 | 
             
            static VALUE
         | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
               | 
| 267 | 
            +
            cs_method_clear(VALUE self)
         | 
| 268 | 
            +
            {
         | 
| 269 | 
            +
              struct cs_data *data;
         | 
| 156 270 | 
             
              rb_check_frozen(self);
         | 
| 157 | 
            -
               | 
| 158 | 
            -
               | 
| 159 | 
            -
                CLRBIT(cps, cp);
         | 
| 160 | 
            -
              }
         | 
| 271 | 
            +
              data = cs_fetch_data(self);
         | 
| 272 | 
            +
              memset(data->cps, 0, CS_MSIZE(data->len));
         | 
| 161 273 | 
             
              return self;
         | 
| 162 274 | 
             
            }
         | 
| 163 275 |  | 
| 164 | 
            -
             | 
| 165 | 
            -
             | 
| 166 | 
            -
             | 
| 167 | 
            -
               | 
| 168 | 
            -
               | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| 171 | 
            -
             | 
| 172 | 
            -
             | 
| 173 | 
            -
             | 
| 276 | 
            +
            static VALUE
         | 
| 277 | 
            +
            cs_method_min(VALUE self)
         | 
| 278 | 
            +
            {
         | 
| 279 | 
            +
              FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
         | 
| 280 | 
            +
              return Qnil;
         | 
| 281 | 
            +
            }
         | 
| 282 | 
            +
             | 
| 283 | 
            +
            static VALUE
         | 
| 284 | 
            +
            cs_method_max(VALUE self)
         | 
| 285 | 
            +
            {
         | 
| 286 | 
            +
              cs_cp len;
         | 
| 287 | 
            +
              long reverse_idx;
         | 
| 288 | 
            +
              cs_ar *cps;
         | 
| 289 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 290 | 
            +
              for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
         | 
| 291 | 
            +
              {
         | 
| 292 | 
            +
                if (tst_cp(cps, len, reverse_idx))
         | 
| 293 | 
            +
                {
         | 
| 294 | 
            +
                  return LONG2FIX(reverse_idx);
         | 
| 295 | 
            +
                }
         | 
| 296 | 
            +
              }
         | 
| 297 | 
            +
              return Qnil;
         | 
| 298 | 
            +
            }
         | 
| 299 | 
            +
             | 
| 300 | 
            +
            static VALUE
         | 
| 301 | 
            +
            cs_method_minmax(VALUE self)
         | 
| 302 | 
            +
            {
         | 
| 303 | 
            +
              VALUE arr;
         | 
| 304 | 
            +
              arr = rb_ary_new2(2);
         | 
| 305 | 
            +
              rb_ary_push(arr, cs_method_min(self));
         | 
| 306 | 
            +
              rb_ary_push(arr, cs_method_max(self));
         | 
| 307 | 
            +
              return arr;
         | 
| 308 | 
            +
            }
         | 
| 309 | 
            +
             | 
| 310 | 
            +
            // Builds a new set of self's class and returns it from the
            // enclosing function. A codepoint is added to the result when
            // `tst_cp(cs_a) comp_op tst_cp(cs_b)` is non-zero; all
            // UNICODE_CP_COUNT codepoints are examined.
            #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op)                   \
              do                                                              \
              {                                                               \
                VALUE new_cs;                                                 \
                struct cs_data *new_data;                                     \
                cs_ar *acps, *bcps;                                           \
                cs_cp alen, blen, cp;                                         \
                new_cs = cs_alloc(RBASIC(self)->klass, &new_data);            \
                acps = cs_fetch_cps(cs_a, &alen);                             \
                bcps = cs_fetch_cps(cs_b, &blen);                             \
                cp = 0;                                                       \
                while (cp < UNICODE_CP_COUNT)                                 \
                {                                                             \
                  if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp))  \
                  {                                                           \
                    set_cp(new_data, cp);                                     \
                  }                                                           \
                  cp++;                                                       \
                }                                                             \
                return new_cs;                                                \
              } while (0)
         | 
| 174 329 |  | 
| 175 330 | 
             
            static VALUE
         | 
| 176 | 
            -
             | 
| 177 | 
            -
             | 
| 331 | 
            +
            cs_method_intersection(VALUE self, VALUE other)
         | 
| 332 | 
            +
            {
         | 
| 333 | 
            +
              RETURN_COMBINED_CS(self, other, &&);
         | 
| 178 334 | 
             
            }
         | 
| 179 335 |  | 
| 180 336 | 
             
            static VALUE
         | 
| 181 | 
            -
             | 
| 182 | 
            -
             | 
| 337 | 
            +
            cs_method_exclusion(VALUE self, VALUE other)
         | 
| 338 | 
            +
            {
         | 
| 339 | 
            +
              RETURN_COMBINED_CS(self, other, ^);
         | 
| 183 340 | 
             
            }
         | 
| 184 341 |  | 
| 185 342 | 
             
            static VALUE
         | 
| 186 | 
            -
             | 
| 187 | 
            -
             | 
| 343 | 
            +
            cs_method_union(VALUE self, VALUE other)
         | 
| 344 | 
            +
            {
         | 
| 345 | 
            +
              RETURN_COMBINED_CS(self, other, ||);
         | 
| 188 346 | 
             
            }
         | 
| 189 347 |  | 
| 190 348 | 
             
            static VALUE
         | 
| 191 | 
            -
             | 
| 192 | 
            -
             | 
| 349 | 
            +
            cs_method_difference(VALUE self, VALUE other)
         | 
| 350 | 
            +
            {
         | 
| 351 | 
            +
              RETURN_COMBINED_CS(self, other, >);
         | 
| 193 352 | 
             
            }
         | 
| 194 353 |  | 
| 195 354 | 
             
            static VALUE
         | 
| 196 | 
            -
             | 
| 197 | 
            -
             | 
| 198 | 
            -
               | 
| 199 | 
            -
               | 
| 355 | 
            +
            cs_method_include_p(VALUE self, VALUE num)
         | 
| 356 | 
            +
            {
         | 
| 357 | 
            +
              cs_ar *cps;
         | 
| 358 | 
            +
              cs_cp len;
         | 
| 359 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 360 | 
            +
              return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
         | 
| 200 361 | 
             
            }
         | 
| 201 362 |  | 
| 202 | 
            -
            static inline  | 
| 203 | 
            -
             | 
| 204 | 
            -
             | 
| 205 | 
            -
               | 
| 206 | 
            -
               | 
| 207 | 
            -
               | 
| 363 | 
            +
            static inline VALUE
         | 
| 364 | 
            +
            cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
         | 
| 365 | 
            +
            {
         | 
| 366 | 
            +
              cs_cp cp, len;
         | 
| 367 | 
            +
              cs_ar *cps;
         | 
| 368 | 
            +
              struct cs_data *data;
         | 
| 369 | 
            +
              rb_check_frozen(cs);
         | 
| 370 | 
            +
              data = cs_fetch_data(cs);
         | 
| 371 | 
            +
              cps = data->cps;
         | 
| 372 | 
            +
              len = data->len;
         | 
| 208 373 | 
             
              cp = FIX2ULONG(cp_num);
         | 
| 209 | 
            -
              if ( | 
| 210 | 
            -
             | 
| 374 | 
            +
              if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
         | 
| 375 | 
            +
              {
         | 
| 376 | 
            +
                return Qnil;
         | 
| 211 377 | 
             
              }
         | 
| 212 | 
            -
              else | 
| 213 | 
            -
             | 
| 214 | 
            -
                 | 
| 215 | 
            -
                 | 
| 378 | 
            +
              else
         | 
| 379 | 
            +
              {
         | 
| 380 | 
            +
                if (on)
         | 
| 381 | 
            +
                {
         | 
| 382 | 
            +
                  set_cp(data, cp);
         | 
| 383 | 
            +
                }
         | 
| 384 | 
            +
                else
         | 
| 385 | 
            +
                {
         | 
| 386 | 
            +
                  clr_cp(cps, len, cp);
         | 
| 387 | 
            +
                }
         | 
| 388 | 
            +
                return cs;
         | 
| 216 389 | 
             
              }
         | 
| 217 390 | 
             
            }
         | 
| 218 391 |  | 
| 219 392 | 
             
            static VALUE
         | 
| 220 | 
            -
             | 
| 221 | 
            -
             | 
| 393 | 
            +
            cs_method_add(VALUE self, VALUE cp_num)
         | 
| 394 | 
            +
            {
         | 
| 395 | 
            +
              return cs_toggle_codepoint(self, cp_num, 1, 0);
         | 
| 222 396 | 
             
            }
         | 
| 223 397 |  | 
| 224 398 | 
             
            static VALUE
         | 
| 225 | 
            -
             | 
| 226 | 
            -
             | 
| 399 | 
            +
            cs_method_add_p(VALUE self, VALUE cp_num)
         | 
| 400 | 
            +
            {
         | 
| 401 | 
            +
              return cs_toggle_codepoint(self, cp_num, 1, 1);
         | 
| 227 402 | 
             
            }
         | 
| 228 403 |  | 
| 229 404 | 
             
            static VALUE
         | 
| 230 | 
            -
             | 
| 231 | 
            -
             | 
| 405 | 
            +
            cs_method_delete(VALUE self, VALUE cp_num)
         | 
| 406 | 
            +
            {
         | 
| 407 | 
            +
              return cs_toggle_codepoint(self, cp_num, 0, 0);
         | 
| 232 408 | 
             
            }
         | 
| 233 409 |  | 
| 234 410 | 
             
            static VALUE
         | 
| 235 | 
            -
             | 
| 236 | 
            -
             | 
| 411 | 
            +
            cs_method_delete_p(VALUE self, VALUE cp_num)
         | 
| 412 | 
            +
            {
         | 
| 413 | 
            +
              return cs_toggle_codepoint(self, cp_num, 0, 1);
         | 
| 237 414 | 
             
            }
         | 
| 238 415 |  | 
| 239 | 
            -
            #define COMPARE_SETS(action)\
         | 
| 240 | 
            -
              cp_index cp;\
         | 
| 241 | 
            -
              cp_byte *cps, *other_cps;\
         | 
| 242 | 
            -
              FETCH_CODEPOINTS(self, cps);\
         | 
| 243 | 
            -
              FETCH_CODEPOINTS(other, other_cps);\
         | 
| 244 | 
            -
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
         | 
| 245 | 
            -
             | 
| 246 416 | 
             
            static VALUE
         | 
| 247 | 
            -
             | 
| 248 | 
            -
             | 
| 417 | 
            +
            cs_method_intersect_p(VALUE self, VALUE other)
         | 
| 418 | 
            +
            {
         | 
| 419 | 
            +
              cs_cp cp, alen, blen;
         | 
| 420 | 
            +
              cs_ar *acps, *bcps;
         | 
| 421 | 
            +
              acps = cs_fetch_cps(self, &alen);
         | 
| 422 | 
            +
              bcps = cs_fetch_cps(other, &blen);
         | 
| 423 | 
            +
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
         | 
| 424 | 
            +
              {
         | 
| 425 | 
            +
                if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
         | 
| 426 | 
            +
                {
         | 
| 427 | 
            +
                  return Qtrue;
         | 
| 428 | 
            +
                }
         | 
| 429 | 
            +
              }
         | 
| 249 430 | 
             
              return Qfalse;
         | 
| 250 431 | 
             
            }
         | 
| 251 432 |  | 
| 252 433 | 
             
            static VALUE
         | 
| 253 | 
            -
             | 
| 254 | 
            -
             | 
| 434 | 
            +
            cs_method_disjoint_p(VALUE self, VALUE other)
         | 
| 435 | 
            +
            {
         | 
| 436 | 
            +
              return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
         | 
| 255 437 | 
             
            }
         | 
| 256 438 |  | 
| 257 439 | 
             
            static inline int
         | 
| 258 | 
            -
             | 
| 259 | 
            -
             | 
| 440 | 
            +
            cs_check_type(VALUE obj)
         | 
| 441 | 
            +
            {
         | 
| 442 | 
            +
              return rb_typeddata_is_kind_of(obj, &cs_type);
         | 
| 260 443 | 
             
            }
         | 
| 261 444 |  | 
| 262 445 | 
             
            static VALUE
         | 
| 263 | 
            -
             | 
| 264 | 
            -
             | 
| 265 | 
            -
               | 
| 266 | 
            -
             | 
| 267 | 
            -
               | 
| 268 | 
            -
             | 
| 446 | 
            +
            cs_cps_eql(VALUE cs_a, VALUE cs_b)
         | 
| 447 | 
            +
            {
         | 
| 448 | 
            +
              cs_cp cp, alen, blen;
         | 
| 449 | 
            +
              cs_ar *acps, *bcps;
         | 
| 450 | 
            +
              acps = cs_fetch_cps(cs_a, &alen);
         | 
| 451 | 
            +
              bcps = cs_fetch_cps(cs_b, &blen);
         | 
| 452 | 
            +
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
         | 
| 453 | 
            +
              {
         | 
| 454 | 
            +
                if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
         | 
| 455 | 
            +
                {
         | 
| 456 | 
            +
                  return Qfalse;
         | 
| 457 | 
            +
                }
         | 
| 458 | 
            +
              }
         | 
| 269 459 | 
             
              return Qtrue;
         | 
| 270 460 | 
             
            }
         | 
| 271 461 |  | 
| 462 | 
            +
            static VALUE
         | 
| 463 | 
            +
            cs_method_eql_p(VALUE self, VALUE other)
         | 
| 464 | 
            +
            {
         | 
| 465 | 
            +
              if (!cs_check_type(other))
         | 
| 466 | 
            +
              {
         | 
| 467 | 
            +
                return Qfalse;
         | 
| 468 | 
            +
              }
         | 
| 469 | 
            +
              if (self == other) // same object_id
         | 
| 470 | 
            +
              {
         | 
| 471 | 
            +
                return Qtrue;
         | 
| 472 | 
            +
              }
         | 
| 473 | 
            +
              return cs_cps_eql(self, other);
         | 
| 474 | 
            +
            }
         | 
| 475 | 
            +
             | 
| 272 476 | 
             
            static inline VALUE
         | 
| 273 | 
            -
             | 
| 274 | 
            -
             | 
| 275 | 
            -
               | 
| 477 | 
            +
            cs_merge_cs(VALUE recipient, VALUE source)
         | 
| 478 | 
            +
            {
         | 
| 479 | 
            +
              cs_cp cp, source_len;
         | 
| 480 | 
            +
              struct cs_data *data;
         | 
| 481 | 
            +
              cs_ar *source_cps;
         | 
| 482 | 
            +
              data = cs_fetch_data(recipient);
         | 
| 483 | 
            +
              source_cps = cs_fetch_cps(source, &source_len);
         | 
| 484 | 
            +
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
         | 
| 485 | 
            +
              {
         | 
| 486 | 
            +
                if (tst_cp(source_cps, source_len, cp))
         | 
| 487 | 
            +
                {
         | 
| 488 | 
            +
                  set_cp(data, cp);
         | 
| 489 | 
            +
                }
         | 
| 490 | 
            +
              }
         | 
| 491 | 
            +
              return recipient;
         | 
| 276 492 | 
             
            }
         | 
| 277 493 |  | 
| 278 | 
            -
            static inline  | 
| 279 | 
            -
             | 
| 280 | 
            -
             | 
| 494 | 
            +
            static inline cs_cp
         | 
| 495 | 
            +
            cs_checked_cp(VALUE object_id)
         | 
| 496 | 
            +
            {
         | 
| 497 | 
            +
              if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
         | 
| 498 | 
            +
              {
         | 
| 499 | 
            +
                return FIX2ULONG(object_id);
         | 
| 500 | 
            +
              }
         | 
| 281 501 | 
             
              rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
         | 
| 282 502 | 
             
            }
         | 
| 283 503 |  | 
| 284 504 | 
             
            static inline VALUE
         | 
| 285 | 
            -
             | 
| 505 | 
            +
            cs_merge_rb_range(VALUE self, VALUE rb_range)
         | 
| 506 | 
            +
            {
         | 
| 286 507 | 
             
              VALUE from_id, upto_id;
         | 
| 508 | 
            +
              cs_cp from_cp, upto_cp, cont_len, rem;
         | 
| 287 509 | 
             
              int excl;
         | 
| 288 | 
            -
               | 
| 289 | 
            -
               | 
| 290 | 
            -
              FETCH_CODEPOINTS(self, cps);
         | 
| 510 | 
            +
              struct cs_data *data;
         | 
| 511 | 
            +
              data = cs_fetch_data(self);
         | 
| 291 512 |  | 
| 292 | 
            -
              if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) | 
| 513 | 
            +
              if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
         | 
| 514 | 
            +
              {
         | 
| 293 515 | 
             
                rb_raise(rb_eArgError, "pass a Range");
         | 
| 294 516 | 
             
              }
         | 
| 295 | 
            -
              if (excl) | 
| 517 | 
            +
              if (excl)
         | 
| 518 | 
            +
              {
         | 
| 519 | 
            +
                upto_id -= 2;
         | 
| 520 | 
            +
              }
         | 
| 521 | 
            +
             | 
| 522 | 
            +
              from_cp = cs_checked_cp(from_id);
         | 
| 523 | 
            +
              upto_cp = cs_checked_cp(upto_id);
         | 
| 296 524 |  | 
| 297 | 
            -
               | 
| 298 | 
            -
               | 
| 525 | 
            +
              if (upto_cp > from_cp && (upto_cp - from_cp > 6))
         | 
| 526 | 
            +
              {
         | 
| 527 | 
            +
                // set bits in preceding partially toggled bytes individually
         | 
| 528 | 
            +
                for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
         | 
| 529 | 
            +
                {
         | 
| 530 | 
            +
                  set_cp(data, from_cp);
         | 
| 531 | 
            +
                }
         | 
| 532 | 
            +
                // memset contiguous bits directly
         | 
| 533 | 
            +
                cont_len = upto_cp - from_cp + 1;
         | 
| 534 | 
            +
                rem = cont_len % 8;
         | 
| 535 | 
            +
                ensure_memsize_fits(data, upto_cp);
         | 
| 536 | 
            +
                memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
         | 
| 537 | 
            +
                from_cp = upto_cp - rem + 1;
         | 
| 538 | 
            +
              }
         | 
| 299 539 |  | 
| 300 | 
            -
               | 
| 301 | 
            -
             | 
| 302 | 
            -
             | 
| 540 | 
            +
              // set bits in partially toggled bytes individually
         | 
| 541 | 
            +
              for (/* */; from_cp <= upto_cp; from_cp++)
         | 
| 542 | 
            +
              {
         | 
| 543 | 
            +
                set_cp(data, from_cp);
         | 
| 303 544 | 
             
              }
         | 
| 545 | 
            +
             | 
| 304 546 | 
             
              return self;
         | 
| 305 547 | 
             
            }
         | 
| 306 548 |  | 
| 307 549 | 
             
            static inline VALUE
         | 
| 308 | 
            -
             | 
| 309 | 
            -
             | 
| 310 | 
            -
               | 
| 311 | 
            -
               | 
| 312 | 
            -
              FETCH_CODEPOINTS(self, cps);
         | 
| 550 | 
            +
            cs_merge_rb_array(VALUE self, VALUE rb_array)
         | 
| 551 | 
            +
            {
         | 
| 552 | 
            +
              VALUE el, array_length, i;
         | 
| 553 | 
            +
              struct cs_data *data;
         | 
| 313 554 | 
             
              Check_Type(rb_array, T_ARRAY);
         | 
| 555 | 
            +
              data = cs_fetch_data(self);
         | 
| 314 556 | 
             
              array_length = RARRAY_LEN(rb_array);
         | 
| 315 | 
            -
              for (i = 0; i < array_length; i++) | 
| 557 | 
            +
              for (i = 0; i < array_length; i++)
         | 
| 558 | 
            +
              {
         | 
| 316 559 | 
             
                el = RARRAY_AREF(rb_array, i);
         | 
| 317 | 
            -
                 | 
| 318 | 
            -
                SETBIT(cps, FIX2ULONG(el));
         | 
| 560 | 
            +
                set_cp(data, cs_checked_cp(el));
         | 
| 319 561 | 
             
              }
         | 
| 320 562 | 
             
              return self;
         | 
| 321 563 | 
             
            }
         | 
| 322 564 |  | 
| 323 565 | 
             
            static VALUE
         | 
| 324 | 
            -
             | 
| 566 | 
            +
            cs_method_merge(VALUE self, VALUE other)
         | 
| 567 | 
            +
            {
         | 
| 325 568 | 
             
              rb_check_frozen(self);
         | 
| 326 | 
            -
              if ( | 
| 327 | 
            -
             | 
| 569 | 
            +
              if (cs_check_type(other))
         | 
| 570 | 
            +
              {
         | 
| 571 | 
            +
                return cs_merge_cs(self, other);
         | 
| 328 572 | 
             
              }
         | 
| 329 | 
            -
              else if (TYPE(other) == T_ARRAY) | 
| 330 | 
            -
             | 
| 573 | 
            +
              else if (TYPE(other) == T_ARRAY)
         | 
| 574 | 
            +
              {
         | 
| 575 | 
            +
                return cs_merge_rb_array(self, other);
         | 
| 331 576 | 
             
              }
         | 
| 332 | 
            -
              return  | 
| 577 | 
            +
              return cs_merge_rb_range(self, other);
         | 
| 333 578 | 
             
            }
         | 
| 334 579 |  | 
| 335 580 | 
             
            static VALUE
         | 
| 336 | 
            -
             | 
| 337 | 
            -
             | 
| 338 | 
            -
               | 
| 581 | 
            +
            cs_method_initialize_copy(VALUE self, VALUE orig)
         | 
| 582 | 
            +
            {
         | 
| 583 | 
            +
              cs_merge_cs(self, orig);
         | 
| 584 | 
            +
              return self;
         | 
| 339 585 | 
             
            }
         | 
| 340 586 |  | 
| 341 587 | 
             
            static VALUE
         | 
| 342 | 
            -
             | 
| 588 | 
            +
            cs_method_subtract(VALUE self, VALUE other)
         | 
| 589 | 
            +
            {
         | 
| 590 | 
            +
              cs_cp cp, len, other_len;
         | 
| 591 | 
            +
              cs_ar *cps, *other_cps;
         | 
| 343 592 | 
             
              rb_check_frozen(self);
         | 
| 344 | 
            -
               | 
| 593 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 594 | 
            +
              other_cps = cs_fetch_cps(other, &other_len);
         | 
| 595 | 
            +
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
         | 
| 596 | 
            +
              {
         | 
| 597 | 
            +
                if (tst_cp(other_cps, other_len, cp))
         | 
| 598 | 
            +
                {
         | 
| 599 | 
            +
                  clr_cp(cps, len, cp);
         | 
| 600 | 
            +
                }
         | 
| 601 | 
            +
              }
         | 
| 345 602 | 
             
              return self;
         | 
| 346 603 | 
             
            }
         | 
| 347 604 |  | 
| 348 605 | 
             
            static inline int
         | 
| 349 | 
            -
             | 
| 350 | 
            -
             | 
| 351 | 
            -
               | 
| 606 | 
            +
            cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
         | 
| 607 | 
            +
            {
         | 
| 608 | 
            +
              cs_ar *a, *b;
         | 
| 609 | 
            +
              cs_cp cp, alen, blen, count_a, count_b;
         | 
| 352 610 |  | 
| 353 | 
            -
              if (! | 
| 611 | 
            +
              if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
         | 
| 612 | 
            +
              {
         | 
| 354 613 | 
             
                rb_raise(rb_eArgError, "pass a CharacterSet");
         | 
| 355 614 | 
             
              }
         | 
| 356 615 |  | 
| 357 | 
            -
               | 
| 358 | 
            -
               | 
| 359 | 
            -
             | 
| 360 | 
            -
               | 
| 361 | 
            -
               | 
| 362 | 
            -
             | 
| 363 | 
            -
             | 
| 364 | 
            -
               | 
| 365 | 
            -
                if ( | 
| 366 | 
            -
             | 
| 367 | 
            -
                   | 
| 368 | 
            -
                   | 
| 616 | 
            +
              a = cs_fetch_cps(cs_a, &alen);
         | 
| 617 | 
            +
              b = cs_fetch_cps(cs_b, &blen);
         | 
| 618 | 
            +
             | 
| 619 | 
            +
              count_a = 0;
         | 
| 620 | 
            +
              count_b = 0;
         | 
| 621 | 
            +
             | 
| 622 | 
            +
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
         | 
| 623 | 
            +
              {
         | 
| 624 | 
            +
                if (tst_cp(a, alen, cp))
         | 
| 625 | 
            +
                {
         | 
| 626 | 
            +
                  if (!tst_cp(b, blen, cp))
         | 
| 627 | 
            +
                  {
         | 
| 628 | 
            +
                    return 0;
         | 
| 629 | 
            +
                  }
         | 
| 630 | 
            +
                  count_a++;
         | 
| 631 | 
            +
                  count_b++;
         | 
| 632 | 
            +
                }
         | 
| 633 | 
            +
                else if (tst_cp(b, blen, cp))
         | 
| 634 | 
            +
                {
         | 
| 635 | 
            +
                  count_b++;
         | 
| 369 636 | 
             
                }
         | 
| 370 | 
            -
                else if (TSTBIT(cps_b, cp)) size_b++;
         | 
| 371 637 | 
             
              }
         | 
| 372 638 |  | 
| 373 | 
            -
              if ( | 
| 639 | 
            +
              if (is_proper_ptr)
         | 
| 640 | 
            +
              {
         | 
| 641 | 
            +
                *is_proper_ptr = count_b > count_a;
         | 
| 642 | 
            +
              }
         | 
| 643 | 
            +
             | 
| 374 644 | 
             
              return 1;
         | 
| 375 645 | 
             
            }
         | 
| 376 646 |  | 
| 377 647 | 
             
            static VALUE
         | 
| 378 | 
            -
             | 
| 379 | 
            -
             | 
| 380 | 
            -
              return  | 
| 648 | 
            +
            cs_method_subset_p(VALUE self, VALUE other)
         | 
| 649 | 
            +
            {
         | 
| 650 | 
            +
              return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
         | 
| 381 651 | 
             
            }
         | 
| 382 652 |  | 
| 383 653 | 
             
            static VALUE
         | 
| 384 | 
            -
             | 
| 385 | 
            -
             | 
| 386 | 
            -
               | 
| 387 | 
            -
               | 
| 654 | 
            +
            cs_method_proper_subset_p(VALUE self, VALUE other)
         | 
| 655 | 
            +
            {
         | 
| 656 | 
            +
              int is_subset, is_proper;
         | 
| 657 | 
            +
              is_subset = cs_a_subset_of_b(self, other, &is_proper);
         | 
| 658 | 
            +
              return (is_subset && is_proper) ? Qtrue : Qfalse;
         | 
| 388 659 | 
             
            }
         | 
| 389 660 |  | 
| 390 661 | 
             
            static VALUE
         | 
| 391 | 
            -
             | 
| 392 | 
            -
             | 
| 393 | 
            -
              return  | 
| 662 | 
            +
            cs_method_superset_p(VALUE self, VALUE other)
         | 
| 663 | 
            +
            {
         | 
| 664 | 
            +
              return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
         | 
| 394 665 | 
             
            }
         | 
| 395 666 |  | 
| 396 667 | 
             
            static VALUE
         | 
| 397 | 
            -
             | 
| 398 | 
            -
             | 
| 399 | 
            -
               | 
| 400 | 
            -
               | 
| 668 | 
            +
            cs_method_proper_superset_p(VALUE self, VALUE other)
         | 
| 669 | 
            +
            {
         | 
| 670 | 
            +
              int is_superset, is_proper;
         | 
| 671 | 
            +
              is_superset = cs_a_subset_of_b(other, self, &is_proper);
         | 
| 672 | 
            +
              return (is_superset && is_proper) ? Qtrue : Qfalse;
         | 
| 401 673 | 
             
            }
         | 
| 402 674 |  | 
| 403 675 | 
             
            // *******************************
         | 
| @@ -405,42 +677,43 @@ method_proper_superset_p(VALUE self, VALUE other) { | |
| 405 677 | 
             
            // *******************************
         | 
| 406 678 |  | 
| 407 679 | 
             
            static VALUE
         | 
| 408 | 
            -
             | 
| 409 | 
            -
             | 
| 410 | 
            -
               | 
| 680 | 
            +
            cs_class_method_from_ranges(VALUE self, VALUE ranges)
         | 
| 681 | 
            +
            {
         | 
| 682 | 
            +
              VALUE new_cs, range_count, i;
         | 
| 683 | 
            +
              new_cs = rb_class_new_instance(0, 0, self);
         | 
| 411 684 | 
             
              range_count = RARRAY_LEN(ranges);
         | 
| 412 | 
            -
              for (i = 0; i < range_count; i++) | 
| 413 | 
            -
             | 
| 685 | 
            +
              for (i = 0; i < range_count; i++)
         | 
| 686 | 
            +
              {
         | 
| 687 | 
            +
                cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
         | 
| 414 688 | 
             
              }
         | 
| 415 | 
            -
              return  | 
| 689 | 
            +
              return new_cs;
         | 
| 416 690 | 
             
            }
         | 
| 417 691 |  | 
| 418 692 | 
             
            static VALUE
         | 
| 419 | 
            -
             | 
| 420 | 
            -
             | 
| 693 | 
            +
            cs_method_ranges(VALUE self)
         | 
| 694 | 
            +
            {
         | 
| 695 | 
            +
              VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
         | 
| 421 696 |  | 
| 422 697 | 
             
              ranges = rb_ary_new();
         | 
| 423 | 
            -
               | 
| 698 | 
            +
              previous_cp_num = 0;
         | 
| 424 699 | 
             
              current_start = 0;
         | 
| 425 700 | 
             
              current_end = 0;
         | 
| 426 701 |  | 
| 427 702 | 
             
              FOR_EACH_ACTIVE_CODEPOINT(
         | 
| 428 | 
            -
             | 
| 703 | 
            +
                  cp_num = LONG2FIX(cp);
         | 
| 429 704 |  | 
| 430 | 
            -
             | 
| 431 | 
            -
             | 
| 432 | 
            -
             | 
| 433 | 
            -
             | 
| 434 | 
            -
             | 
| 435 | 
            -
             | 
| 436 | 
            -
                   | 
| 437 | 
            -
             | 
| 438 | 
            -
                current_end = codepoint;
         | 
| 439 | 
            -
                previous_codepoint = codepoint;
         | 
| 440 | 
            -
              );
         | 
| 705 | 
            +
                  if (!previous_cp_num) {
         | 
| 706 | 
            +
                    current_start = cp_num;
         | 
| 707 | 
            +
                  } else if (previous_cp_num + 2 != cp_num) {
         | 
| 708 | 
            +
                    // gap found, finalize previous range
         | 
| 709 | 
            +
                    rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
         | 
| 710 | 
            +
                    current_start = cp_num;
         | 
| 711 | 
            +
                  } current_end = cp_num;
         | 
| 712 | 
            +
                  previous_cp_num = cp_num;);
         | 
| 441 713 |  | 
| 442 714 | 
             
              // add final range
         | 
| 443 | 
            -
              if (current_start) | 
| 715 | 
            +
              if (current_start)
         | 
| 716 | 
            +
              {
         | 
| 444 717 | 
             
                rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
         | 
| 445 718 | 
             
              }
         | 
| 446 719 |  | 
| @@ -448,117 +721,233 @@ method_ranges(VALUE self) { | |
| 448 721 | 
             
            }
         | 
| 449 722 |  | 
| 450 723 | 
             
            static VALUE
         | 
| 451 | 
            -
             | 
| 452 | 
            -
             | 
| 724 | 
            +
            cs_method_sample(int argc, VALUE *argv, VALUE self)
         | 
| 725 | 
            +
            {
         | 
| 726 | 
            +
              VALUE array, to_a_args[1] = {Qtrue};
         | 
| 453 727 | 
             
              rb_check_arity(argc, 0, 1);
         | 
| 454 | 
            -
               | 
| 455 | 
            -
              array = method_to_a(1, to_a_args, self);
         | 
| 728 | 
            +
              array = cs_method_to_a(1, to_a_args, self);
         | 
| 456 729 | 
             
              return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
         | 
| 457 730 | 
             
            }
         | 
| 458 731 |  | 
| 459 732 | 
             
            static inline VALUE
         | 
| 460 | 
            -
             | 
| 461 | 
            -
             | 
| 462 | 
            -
               | 
| 463 | 
            -
               | 
| 464 | 
            -
               | 
| 465 | 
            -
               | 
| 466 | 
            -
             | 
| 733 | 
            +
            cs_from_section(VALUE set, cs_cp from, cs_cp upto)
         | 
| 734 | 
            +
            {
         | 
| 735 | 
            +
              VALUE new_cs;
         | 
| 736 | 
            +
              cs_ar *cps;
         | 
| 737 | 
            +
              cs_cp cp, len;
         | 
| 738 | 
            +
              struct cs_data *new_data;
         | 
| 739 | 
            +
              new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
         | 
| 740 | 
            +
              cps = cs_fetch_cps(set, &len);
         | 
| 741 | 
            +
              for (cp = from; cp <= upto; cp++)
         | 
| 742 | 
            +
              {
         | 
| 743 | 
            +
                if (tst_cp(cps, len, cp))
         | 
| 744 | 
            +
                {
         | 
| 745 | 
            +
                  set_cp(new_data, cp);
         | 
| 746 | 
            +
                }
         | 
| 467 747 | 
             
              }
         | 
| 468 | 
            -
              return  | 
| 748 | 
            +
              return new_cs;
         | 
| 469 749 | 
             
            }
         | 
| 470 750 |  | 
| 471 751 | 
             
            static VALUE
         | 
| 472 | 
            -
             | 
| 473 | 
            -
             | 
| 752 | 
            +
            cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
         | 
| 753 | 
            +
            {
         | 
| 754 | 
            +
              return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
         | 
| 755 | 
            +
            }
         | 
| 756 | 
            +
             | 
| 757 | 
            +
            static inline cs_cp
         | 
| 758 | 
            +
            cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
         | 
| 759 | 
            +
            {
         | 
| 760 | 
            +
              cs_ar *cps;
         | 
| 761 | 
            +
              cs_cp cp, count, len;
         | 
| 762 | 
            +
              cps = cs_fetch_cps(set, &len);
         | 
| 763 | 
            +
              for (count = 0, cp = from; cp <= upto; cp++)
         | 
| 764 | 
            +
              {
         | 
| 765 | 
            +
                if (tst_cp(cps, len, cp))
         | 
| 766 | 
            +
                {
         | 
| 767 | 
            +
                  count++;
         | 
| 768 | 
            +
                }
         | 
| 769 | 
            +
              }
         | 
| 770 | 
            +
              return count;
         | 
| 474 771 | 
             
            }
         | 
| 475 772 |  | 
| 476 773 | 
             
            static VALUE
         | 
| 477 | 
            -
             | 
| 478 | 
            -
             | 
| 774 | 
            +
            cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
         | 
| 775 | 
            +
            {
         | 
| 776 | 
            +
              cs_cp count;
         | 
| 777 | 
            +
              count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
         | 
| 778 | 
            +
              return LONG2FIX(count);
         | 
| 479 779 | 
             
            }
         | 
| 480 780 |  | 
| 481 781 | 
             
            static inline VALUE
         | 
| 482 | 
            -
             | 
| 483 | 
            -
             | 
| 484 | 
            -
               | 
| 485 | 
            -
               | 
| 486 | 
            -
               | 
| 487 | 
            -
             | 
| 488 | 
            -
             | 
| 489 | 
            -
             | 
| 782 | 
            +
            cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
         | 
| 783 | 
            +
            {
         | 
| 784 | 
            +
              cs_cp cp;
         | 
| 785 | 
            +
              for (cp = from; cp <= upto; cp++)
         | 
| 786 | 
            +
              {
         | 
| 787 | 
            +
                if (tst_cp(cps, len, cp))
         | 
| 788 | 
            +
                {
         | 
| 789 | 
            +
                  return Qtrue;
         | 
| 790 | 
            +
                }
         | 
| 490 791 | 
             
              }
         | 
| 491 792 | 
             
              return Qfalse;
         | 
| 492 793 | 
             
            }
         | 
| 493 794 |  | 
| 494 795 | 
             
            static VALUE
         | 
| 495 | 
            -
             | 
| 796 | 
            +
            cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
         | 
| 797 | 
            +
            {
         | 
| 798 | 
            +
              cs_ar *cps;
         | 
| 799 | 
            +
              cs_cp len;
         | 
| 800 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 801 | 
            +
              return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
         | 
| 802 | 
            +
            }
         | 
| 803 | 
            +
             | 
| 804 | 
            +
            static inline VALUE
         | 
| 805 | 
            +
            cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
         | 
| 806 | 
            +
            {
         | 
| 807 | 
            +
              double section_count, total_count;
         | 
| 808 | 
            +
              section_count = (double)cs_active_cp_count_in_section(set, from, upto);
         | 
| 809 | 
            +
              total_count = (double)cs_active_cp_count(set);
         | 
| 810 | 
            +
              return DBL2NUM(section_count / total_count);
         | 
| 811 | 
            +
            }
         | 
| 812 | 
            +
             | 
| 813 | 
            +
            static VALUE
         | 
| 814 | 
            +
            cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
         | 
| 815 | 
            +
            {
         | 
| 816 | 
            +
              return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
         | 
| 817 | 
            +
            }
         | 
| 818 | 
            +
             | 
| 819 | 
            +
            #define MAX_CP 0x10FFFF
         | 
| 820 | 
            +
            #define MAX_ASCII_CP 0x7F
         | 
| 821 | 
            +
            #define MAX_BMP_CP 0xFFFF
         | 
| 822 | 
            +
            #define MIN_ASTRAL_CP 0x10000
         | 
| 823 | 
            +
             | 
| 824 | 
            +
            static inline VALUE
         | 
| 825 | 
            +
            cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
         | 
| 826 | 
            +
            {
         | 
| 827 | 
            +
              cs_cp plane_beg, plane_end;
         | 
| 828 | 
            +
              plane_beg = plane * UNICODE_PLANE_SIZE;
         | 
| 829 | 
            +
              plane_end = (plane + 1) * MAX_BMP_CP;
         | 
| 830 | 
            +
              return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
         | 
| 831 | 
            +
            }
         | 
| 832 | 
            +
             | 
| 833 | 
            +
            static VALUE
         | 
| 834 | 
            +
            cs_method_planes(VALUE self)
         | 
| 835 | 
            +
            {
         | 
| 836 | 
            +
              cs_ar *cps;
         | 
| 837 | 
            +
              cs_cp len;
         | 
| 496 838 | 
             
              unsigned int i;
         | 
| 497 839 | 
             
              VALUE planes;
         | 
| 840 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 498 841 | 
             
              planes = rb_ary_new();
         | 
| 499 | 
            -
              for (i = 0; i < UNICODE_PLANE_COUNT; i++) | 
| 500 | 
            -
             | 
| 842 | 
            +
              for (i = 0; i < UNICODE_PLANE_COUNT; i++)
         | 
| 843 | 
            +
              {
         | 
| 844 | 
            +
                if (cs_has_cp_in_plane(cps, len, i))
         | 
| 845 | 
            +
                {
         | 
| 846 | 
            +
                  rb_ary_push(planes, INT2FIX(i));
         | 
| 847 | 
            +
                }
         | 
| 501 848 | 
             
              }
         | 
| 502 849 | 
             
              return planes;
         | 
| 503 850 | 
             
            }
         | 
| 504 851 |  | 
| 505 | 
            -
            static  | 
| 506 | 
            -
             | 
| 852 | 
            +
            static inline int
         | 
| 853 | 
            +
            cs_valid_plane_num(VALUE num)
         | 
| 854 | 
            +
            {
         | 
| 507 855 | 
             
              int plane;
         | 
| 508 | 
            -
              Check_Type( | 
| 509 | 
            -
              plane = FIX2INT( | 
| 510 | 
            -
              if (plane < 0 || plane >= UNICODE_PLANE_COUNT) | 
| 511 | 
            -
             | 
| 856 | 
            +
              Check_Type(num, T_FIXNUM);
         | 
| 857 | 
            +
              plane = FIX2INT(num);
         | 
| 858 | 
            +
              if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
         | 
| 859 | 
            +
              {
         | 
| 860 | 
            +
                rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
         | 
| 512 861 | 
             
              }
         | 
| 513 | 
            -
              return  | 
| 862 | 
            +
              return plane;
         | 
| 863 | 
            +
            }
         | 
| 864 | 
            +
             | 
| 865 | 
            +
            static VALUE
         | 
| 866 | 
            +
            cs_method_plane(VALUE self, VALUE plane_num)
         | 
| 867 | 
            +
            {
         | 
| 868 | 
            +
              cs_cp plane, plane_beg, plane_end;
         | 
| 869 | 
            +
              plane = cs_valid_plane_num(plane_num);
         | 
| 870 | 
            +
              plane_beg = plane * UNICODE_PLANE_SIZE;
         | 
| 871 | 
            +
              plane_end = (plane + 1) * MAX_BMP_CP;
         | 
| 872 | 
            +
              return cs_from_section(self, plane_beg, plane_end);
         | 
| 873 | 
            +
            }
         | 
| 874 | 
            +
             | 
| 875 | 
            +
            static VALUE
         | 
| 876 | 
            +
            cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
         | 
| 877 | 
            +
            {
         | 
| 878 | 
            +
              cs_ar *cps;
         | 
| 879 | 
            +
              cs_cp len;
         | 
| 880 | 
            +
              unsigned int plane;
         | 
| 881 | 
            +
              plane = cs_valid_plane_num(plane_num);
         | 
| 882 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 883 | 
            +
              return cs_has_cp_in_plane(cps, len, plane);
         | 
| 514 884 | 
             
            }
         | 
| 515 885 |  | 
| 516 886 | 
             
            #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
         | 
| 517 887 |  | 
| 518 888 | 
             
            static VALUE
         | 
| 519 | 
            -
             | 
| 520 | 
            -
             | 
| 521 | 
            -
               | 
| 522 | 
            -
               | 
| 523 | 
            -
               | 
| 889 | 
            +
            cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
         | 
| 890 | 
            +
            {
         | 
| 891 | 
            +
              int inc_surr;
         | 
| 892 | 
            +
              cs_cp upto, cp, len;
         | 
| 893 | 
            +
              cs_ar *cps;
         | 
| 894 | 
            +
              VALUE new_cs;
         | 
| 895 | 
            +
              struct cs_data *new_data;
         | 
| 896 | 
            +
             | 
| 524 897 | 
             
              rb_check_arity(argc, 0, 2);
         | 
| 525 | 
            -
             | 
| 526 | 
            -
               | 
| 527 | 
            -
             | 
| 528 | 
            -
             | 
| 529 | 
            -
             | 
| 530 | 
            -
             | 
| 898 | 
            +
             | 
| 899 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 900 | 
            +
              inc_surr = argc && argv[0] == Qtrue;
         | 
| 901 | 
            +
              new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
         | 
| 902 | 
            +
              upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
         | 
| 903 | 
            +
             | 
| 904 | 
            +
              for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
         | 
| 905 | 
            +
              {
         | 
| 906 | 
            +
                if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
         | 
| 907 | 
            +
                {
         | 
| 908 | 
            +
                  set_cp(new_data, cp);
         | 
| 909 | 
            +
                }
         | 
| 531 910 | 
             
              }
         | 
| 532 | 
            -
             | 
| 533 | 
            -
             | 
| 534 | 
            -
              );
         | 
| 911 | 
            +
             | 
| 912 | 
            +
              return new_cs;
         | 
| 535 913 | 
             
            }
         | 
| 536 914 |  | 
| 537 | 
            -
            typedef int(*str_cp_handler)(unsigned int,  | 
| 915 | 
            +
            typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
         | 
| 538 916 |  | 
| 539 917 | 
             
            static inline int
         | 
| 540 | 
            -
            add_str_cp_to_arr(unsigned int str_cp,  | 
| 541 | 
            -
             | 
| 918 | 
            +
            add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 919 | 
            +
            {
         | 
| 920 | 
            +
              set_cp(data, str_cp);
         | 
| 542 921 | 
             
              return 1;
         | 
| 543 922 | 
             
            }
         | 
| 544 923 |  | 
| 545 924 | 
             
            static VALUE
         | 
| 546 | 
            -
             | 
| 547 | 
            -
             | 
| 548 | 
            -
               | 
| 549 | 
            -
             | 
| 550 | 
            -
               | 
| 925 | 
            +
            cs_method_case_insensitive(VALUE self)
         | 
| 926 | 
            +
            {
         | 
| 927 | 
            +
              cs_cp i, len;
         | 
| 928 | 
            +
              cs_ar *cps;
         | 
| 929 | 
            +
              VALUE new_cs;
         | 
| 930 | 
            +
              struct cs_data *new_data;
         | 
| 551 931 |  | 
| 552 | 
            -
               | 
| 932 | 
            +
              cps = cs_fetch_cps(self, &len);
         | 
| 933 | 
            +
              new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
         | 
| 934 | 
            +
              cs_merge_cs(new_cs, self);
         | 
| 553 935 |  | 
| 554 | 
            -
              for (i = 0; i < CASEFOLD_COUNT; i++) | 
| 936 | 
            +
              for (i = 0; i < CASEFOLD_COUNT; i++)
         | 
| 937 | 
            +
              {
         | 
| 555 938 | 
             
                casefold_mapping m = unicode_casefold_table[i];
         | 
| 556 939 |  | 
| 557 | 
            -
                if | 
| 558 | 
            -
                 | 
| 940 | 
            +
                if (tst_cp(cps, len, m.from))
         | 
| 941 | 
            +
                {
         | 
| 942 | 
            +
                  set_cp(new_data, m.to);
         | 
| 943 | 
            +
                }
         | 
| 944 | 
            +
                else if (tst_cp(cps, len, m.to))
         | 
| 945 | 
            +
                {
         | 
| 946 | 
            +
                  set_cp(new_data, m.from);
         | 
| 947 | 
            +
                }
         | 
| 559 948 | 
             
              }
         | 
| 560 949 |  | 
| 561 | 
            -
              return  | 
| 950 | 
            +
              return new_cs;
         | 
| 562 951 |  | 
| 563 952 | 
             
              // OnigCaseFoldType flags;
         | 
| 564 953 | 
             
              // rb_encoding *enc;
         | 
| @@ -573,20 +962,27 @@ method_case_insensitive(VALUE self) { | |
| 573 962 | 
             
            }
         | 
| 574 963 |  | 
| 575 964 | 
             
            static inline VALUE
         | 
| 576 | 
            -
            each_sb_cp(VALUE str, str_cp_handler func,  | 
| 577 | 
            -
             | 
| 965 | 
            +
            each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 966 | 
            +
            {
         | 
| 967 | 
            +
              long i, str_len;
         | 
| 578 968 | 
             
              unsigned int str_cp;
         | 
| 969 | 
            +
              str_len = RSTRING_LEN(str);
         | 
| 579 970 |  | 
| 580 | 
            -
              for (i = 0; i <  | 
| 971 | 
            +
              for (i = 0; i < str_len; i++)
         | 
| 972 | 
            +
              {
         | 
| 581 973 | 
             
                str_cp = (RSTRING_PTR(str)[i] & 0xff);
         | 
| 582 | 
            -
                if (!(*func)(str_cp, cp_arr)) | 
| 974 | 
            +
                if (!(*func)(str_cp, cp_arr, len, data, memo))
         | 
| 975 | 
            +
                {
         | 
| 976 | 
            +
                  return Qfalse;
         | 
| 977 | 
            +
                }
         | 
| 583 978 | 
             
              }
         | 
| 584 979 |  | 
| 585 980 | 
             
              return Qtrue;
         | 
| 586 981 | 
             
            }
         | 
| 587 982 |  | 
| 588 983 | 
             
            static inline VALUE
         | 
| 589 | 
            -
            each_mb_cp(VALUE str, str_cp_handler func,  | 
| 984 | 
            +
            each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 985 | 
            +
            {
         | 
| 590 986 | 
             
              int n;
         | 
| 591 987 | 
             
              unsigned int str_cp;
         | 
| 592 988 | 
             
              const char *ptr, *end;
         | 
| @@ -597,9 +993,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) { | |
| 597 993 | 
             
              end = RSTRING_END(str);
         | 
| 598 994 | 
             
              enc = rb_enc_get(str);
         | 
| 599 995 |  | 
| 600 | 
            -
              while (ptr < end) | 
| 996 | 
            +
              while (ptr < end)
         | 
| 997 | 
            +
              {
         | 
| 601 998 | 
             
                str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
         | 
| 602 | 
            -
                if (!(*func)(str_cp, cp_arr)) | 
| 999 | 
            +
                if (!(*func)(str_cp, cp_arr, len, data, memo))
         | 
| 1000 | 
            +
                {
         | 
| 1001 | 
            +
                  return Qfalse;
         | 
| 1002 | 
            +
                }
         | 
| 603 1003 | 
             
                ptr += n;
         | 
| 604 1004 | 
             
              }
         | 
| 605 1005 |  | 
| @@ -611,105 +1011,238 @@ static inline int | |
| 611 1011 | 
             
            single_byte_optimizable(VALUE str)
         | 
| 612 1012 | 
             
            {
         | 
| 613 1013 | 
             
              rb_encoding *enc;
         | 
| 614 | 
            -
              if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) | 
| 1014 | 
            +
              if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
         | 
| 1015 | 
            +
              {
         | 
| 1016 | 
            +
                return 1;
         | 
| 1017 | 
            +
              }
         | 
| 615 1018 |  | 
| 616 1019 | 
             
              enc = rb_enc_get(str);
         | 
| 617 | 
            -
              if (rb_enc_mbmaxlen(enc) == 1) | 
| 1020 | 
            +
              if (rb_enc_mbmaxlen(enc) == 1)
         | 
| 1021 | 
            +
              {
         | 
| 1022 | 
            +
                return 1;
         | 
| 1023 | 
            +
              }
         | 
| 618 1024 |  | 
| 619 1025 | 
             
              return 0;
         | 
| 620 1026 | 
             
            }
         | 
| 621 1027 |  | 
| 622 1028 | 
             
            static inline VALUE
         | 
| 623 | 
            -
            each_cp(VALUE str, str_cp_handler func,  | 
| 624 | 
            -
             | 
| 625 | 
            -
             | 
| 1029 | 
            +
            each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 1030 | 
            +
            {
         | 
| 1031 | 
            +
              if (single_byte_optimizable(str))
         | 
| 1032 | 
            +
              {
         | 
| 1033 | 
            +
                return each_sb_cp(str, func, cp_arr, len, data, memo);
         | 
| 626 1034 | 
             
              }
         | 
| 627 | 
            -
              return each_mb_cp(str, func, cp_arr);
         | 
| 1035 | 
            +
              return each_mb_cp(str, func, cp_arr, len, data, memo);
         | 
| 628 1036 | 
             
            }
         | 
| 629 1037 |  | 
| 630 1038 | 
             
            static inline void
         | 
| 631 | 
            -
            raise_arg_err_unless_string(VALUE val) | 
| 632 | 
            -
             | 
| 1039 | 
            +
            raise_arg_err_unless_string(VALUE val)
         | 
| 1040 | 
            +
            {
         | 
| 1041 | 
            +
              if (!RB_TYPE_P(val, T_STRING))
         | 
| 1042 | 
            +
              {
         | 
| 1043 | 
            +
                rb_raise(rb_eArgError, "pass a String");
         | 
| 1044 | 
            +
              }
         | 
| 633 1045 | 
             
            }
         | 
| 634 1046 |  | 
| 635 1047 | 
             
            static VALUE
         | 
| 636 | 
            -
             | 
| 637 | 
            -
             | 
| 1048 | 
            +
            cs_class_method_of(VALUE self, VALUE str)
         | 
| 1049 | 
            +
            {
         | 
| 1050 | 
            +
              VALUE new_cs;
         | 
| 1051 | 
            +
              struct cs_data *new_data;
         | 
| 1052 | 
            +
              new_cs = cs_alloc(self, &new_data);
         | 
| 638 1053 | 
             
              raise_arg_err_unless_string(str);
         | 
| 639 | 
            -
               | 
| 640 | 
            -
               | 
| 641 | 
            -
              return NEW_CHARACTER_SET(self, cp_arr);
         | 
| 1054 | 
            +
              each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
         | 
| 1055 | 
            +
              return new_cs;
         | 
| 642 1056 | 
             
            }
         | 
| 643 1057 |  | 
| 644 1058 | 
             
            static inline int
         | 
| 645 | 
            -
             | 
| 646 | 
            -
             | 
| 1059 | 
            +
            count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 1060 | 
            +
            {
         | 
| 1061 | 
            +
              if (tst_cp(cp_arr, len, str_cp))
         | 
| 1062 | 
            +
              {
         | 
| 1063 | 
            +
                *memo += 1;
         | 
| 1064 | 
            +
              }
         | 
| 1065 | 
            +
              return 1;
         | 
| 647 1066 | 
             
            }
         | 
| 648 1067 |  | 
| 649 1068 | 
             
            static VALUE
         | 
| 650 | 
            -
             | 
| 651 | 
            -
             | 
| 652 | 
            -
              VALUE  | 
| 1069 | 
            +
            cs_method_count_in(VALUE self, VALUE str)
         | 
| 1070 | 
            +
            {
         | 
| 1071 | 
            +
              VALUE count;
         | 
| 1072 | 
            +
              struct cs_data *data;
         | 
| 653 1073 | 
             
              raise_arg_err_unless_string(str);
         | 
| 654 | 
            -
               | 
| 655 | 
            -
               | 
| 656 | 
            -
               | 
| 1074 | 
            +
              data = cs_fetch_data(self);
         | 
| 1075 | 
            +
              count = 0;
         | 
| 1076 | 
            +
              each_cp(str, count_str_cp, data->cps, data->len, data, &count);
         | 
| 1077 | 
            +
              return INT2NUM(count);
         | 
| 1078 | 
            +
            }
         | 
| 1079 | 
            +
             | 
| 1080 | 
            +
            static inline int
         | 
| 1081 | 
            +
            str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 1082 | 
            +
            {
         | 
| 1083 | 
            +
              return tst_cp(cp_arr, len, str_cp);
         | 
| 1084 | 
            +
            }
         | 
| 1085 | 
            +
             | 
| 1086 | 
            +
            static VALUE
         | 
| 1087 | 
            +
            cs_method_cover_p(VALUE self, VALUE str)
         | 
| 1088 | 
            +
            {
         | 
| 1089 | 
            +
              struct cs_data *data;
         | 
| 1090 | 
            +
              raise_arg_err_unless_string(str);
         | 
| 1091 | 
            +
              data = cs_fetch_data(self);
         | 
| 1092 | 
            +
              return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
         | 
| 1093 | 
            +
            }
         | 
| 1094 | 
            +
             | 
| 1095 | 
            +
            static inline int
         | 
| 1096 | 
            +
            add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 1097 | 
            +
            {
         | 
| 1098 | 
            +
              if (tst_cp(cp_arr, len, str_cp))
         | 
| 1099 | 
            +
              {
         | 
| 1100 | 
            +
                rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
         | 
| 1101 | 
            +
              }
         | 
| 1102 | 
            +
              return 1;
         | 
| 1103 | 
            +
            }
         | 
| 1104 | 
            +
             | 
| 1105 | 
            +
            static VALUE
         | 
| 1106 | 
            +
            cs_method_scan(VALUE self, VALUE str)
         | 
| 1107 | 
            +
            {
         | 
| 1108 | 
            +
              VALUE memo[2];
         | 
| 1109 | 
            +
              struct cs_data *data;
         | 
| 1110 | 
            +
              raise_arg_err_unless_string(str);
         | 
| 1111 | 
            +
              data = cs_fetch_data(self);
         | 
| 1112 | 
            +
              memo[0] = rb_ary_new();
         | 
| 1113 | 
            +
              memo[1] = (VALUE)rb_enc_get(str);
         | 
| 1114 | 
            +
              each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
         | 
| 1115 | 
            +
              return memo[0];
         | 
| 657 1116 | 
             
            }
         | 
| 658 1117 |  | 
| 659 1118 | 
             
            static inline int
         | 
| 660 | 
            -
             | 
| 661 | 
            -
             | 
| 1119 | 
            +
            str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
         | 
| 1120 | 
            +
            {
         | 
| 1121 | 
            +
              return !tst_cp(cp_arr, len, str_cp);
         | 
| 662 1122 | 
             
            }
         | 
| 663 1123 |  | 
| 664 1124 | 
             
            static VALUE
         | 
| 665 | 
            -
             | 
| 666 | 
            -
             | 
| 1125 | 
            +
            cs_method_used_by_p(VALUE self, VALUE str)
         | 
| 1126 | 
            +
            {
         | 
| 1127 | 
            +
              VALUE only_uses_other_cps;
         | 
| 1128 | 
            +
              struct cs_data *data;
         | 
| 667 1129 | 
             
              raise_arg_err_unless_string(str);
         | 
| 668 | 
            -
               | 
| 669 | 
            -
               | 
| 1130 | 
            +
              data = cs_fetch_data(self);
         | 
| 1131 | 
            +
              only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
         | 
| 1132 | 
            +
              return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
         | 
| 1133 | 
            +
            }
         | 
| 1134 | 
            +
             | 
| 1135 | 
            +
            static void
         | 
| 1136 | 
            +
            cs_str_buf_cat(VALUE str, const char *ptr, long len)
         | 
| 1137 | 
            +
            {
         | 
| 1138 | 
            +
              long total, olen;
         | 
| 1139 | 
            +
              char *sptr;
         | 
| 1140 | 
            +
             | 
| 1141 | 
            +
              RSTRING_GETMEM(str, sptr, olen);
         | 
| 1142 | 
            +
              sptr = RSTRING(str)->as.heap.ptr;
         | 
| 1143 | 
            +
              olen = RSTRING(str)->as.heap.len;
         | 
| 1144 | 
            +
              total = olen + len;
         | 
| 1145 | 
            +
              memcpy(sptr + olen, ptr, len);
         | 
| 1146 | 
            +
              RSTRING(str)->as.heap.len = total;
         | 
| 1147 | 
            +
            }
         | 
| 1148 | 
            +
             | 
| 1149 | 
            +
            #ifndef TERM_FILL
         | 
| 1150 | 
            +
            #define TERM_FILL(ptr, termlen)                     \
         | 
| 1151 | 
            +
              do                                                \
         | 
| 1152 | 
            +
              {                                                 \
         | 
| 1153 | 
            +
                char *const term_fill_ptr = (ptr);              \
         | 
| 1154 | 
            +
                const int term_fill_len = (termlen);            \
         | 
| 1155 | 
            +
                *term_fill_ptr = '\0';                          \
         | 
| 1156 | 
            +
                if (__builtin_expect(!!(term_fill_len > 1), 0)) \
         | 
| 1157 | 
            +
                  memset(term_fill_ptr, 0, term_fill_len);      \
         | 
| 1158 | 
            +
              } while (0)
         | 
| 1159 | 
            +
            #endif
         | 
| 1160 | 
            +
             | 
| 1161 | 
            +
            static void
         | 
| 1162 | 
            +
            cs_str_buf_terminate(VALUE str, rb_encoding *enc)
         | 
| 1163 | 
            +
            {
         | 
| 1164 | 
            +
              char *ptr;
         | 
| 1165 | 
            +
              long len;
         | 
| 1166 | 
            +
             | 
| 1167 | 
            +
              ptr = RSTRING(str)->as.heap.ptr;
         | 
| 1168 | 
            +
              len = RSTRING(str)->as.heap.len;
         | 
| 1169 | 
            +
              TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
         | 
| 670 1170 | 
             
            }
         | 
| 671 1171 |  | 
| 672 1172 | 
             
            static inline VALUE
         | 
| 673 | 
            -
             | 
| 674 | 
            -
             | 
| 1173 | 
            +
            cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
         | 
| 1174 | 
            +
            {
         | 
| 1175 | 
            +
              cs_ar *cps;
         | 
| 1176 | 
            +
              cs_cp len;
         | 
| 675 1177 | 
             
              rb_encoding *str_enc;
         | 
| 676 | 
            -
              VALUE orig_len,  | 
| 677 | 
            -
              int  | 
| 1178 | 
            +
              VALUE orig_len, new_str_buf;
         | 
| 1179 | 
            +
              int cp_len;
         | 
| 678 1180 | 
             
              unsigned int str_cp;
         | 
| 679 1181 | 
             
              const char *ptr, *end;
         | 
| 680 1182 |  | 
| 681 1183 | 
             
              raise_arg_err_unless_string(str);
         | 
| 682 1184 |  | 
| 683 | 
            -
               | 
| 1185 | 
            +
              cps = cs_fetch_cps(set, &len);
         | 
| 684 1186 |  | 
| 685 1187 | 
             
              orig_len = RSTRING_LEN(str);
         | 
| 686 | 
            -
               | 
| 687 | 
            -
               | 
| 1188 | 
            +
              if (orig_len < 1) // empty string, will never change
         | 
| 1189 | 
            +
              {
         | 
| 1190 | 
            +
                if (bang)
         | 
| 1191 | 
            +
                {
         | 
| 1192 | 
            +
                  return Qnil;
         | 
| 1193 | 
            +
                }
         | 
| 1194 | 
            +
                return rb_str_dup(str);
         | 
| 1195 | 
            +
              }
         | 
| 1196 | 
            +
             | 
| 1197 | 
            +
              new_str_buf = rb_str_buf_new(orig_len);
         | 
| 688 1198 | 
             
              str_enc = rb_enc_get(str);
         | 
| 689 1199 | 
             
              rb_enc_associate(new_str_buf, str_enc);
         | 
| 690 | 
            -
               | 
| 691 | 
            -
             | 
| 1200 | 
            +
              rb_str_modify(new_str_buf);
         | 
| 1201 | 
            +
              ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
         | 
| 692 1202 |  | 
| 693 1203 | 
             
              ptr = RSTRING_PTR(str);
         | 
| 694 1204 | 
             
              end = RSTRING_END(str);
         | 
| 695 1205 |  | 
| 696 | 
            -
               | 
| 697 | 
            -
             | 
| 698 | 
            -
                 | 
| 699 | 
            -
             | 
| 700 | 
            -
                   | 
| 1206 | 
            +
              if (single_byte_optimizable(str))
         | 
| 1207 | 
            +
              {
         | 
| 1208 | 
            +
                while (ptr < end)
         | 
| 1209 | 
            +
                {
         | 
| 1210 | 
            +
                  str_cp = *ptr & 0xff;
         | 
| 1211 | 
            +
                  if ((!tst_cp(cps, len, str_cp)) == delete)
         | 
| 1212 | 
            +
                  {
         | 
| 1213 | 
            +
                    cs_str_buf_cat(new_str_buf, ptr, 1);
         | 
| 1214 | 
            +
                  }
         | 
| 1215 | 
            +
                  ptr++;
         | 
| 1216 | 
            +
                }
         | 
| 1217 | 
            +
              }
         | 
| 1218 | 
            +
              else // likely to be multibyte string
         | 
| 1219 | 
            +
              {
         | 
| 1220 | 
            +
                while (ptr < end)
         | 
| 1221 | 
            +
                {
         | 
| 1222 | 
            +
                  str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
         | 
| 1223 | 
            +
                  if ((!tst_cp(cps, len, str_cp)) == delete)
         | 
| 1224 | 
            +
                  {
         | 
| 1225 | 
            +
                    cs_str_buf_cat(new_str_buf, ptr, cp_len);
         | 
| 1226 | 
            +
                  }
         | 
| 1227 | 
            +
                  ptr += cp_len;
         | 
| 701 1228 | 
             
                }
         | 
| 702 | 
            -
                ptr += n;
         | 
| 703 1229 | 
             
              }
         | 
| 704 1230 |  | 
| 705 | 
            -
               | 
| 706 | 
            -
             | 
| 1231 | 
            +
              cs_str_buf_terminate(new_str_buf, str_enc);
         | 
| 1232 | 
            +
             | 
| 1233 | 
            +
              if (bang)
         | 
| 1234 | 
            +
              {
         | 
| 1235 | 
            +
                if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
         | 
| 1236 | 
            +
                {
         | 
| 1237 | 
            +
                  return Qnil;
         | 
| 1238 | 
            +
                }
         | 
| 707 1239 | 
             
                rb_str_shared_replace(str, new_str_buf);
         | 
| 708 1240 | 
             
              }
         | 
| 709 | 
            -
              else | 
| 1241 | 
            +
              else
         | 
| 1242 | 
            +
              {
         | 
| 710 1243 | 
             
                RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
         | 
| 711 1244 | 
             
                // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
         | 
| 712 | 
            -
                RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
         | 
| 1245 | 
            +
                RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
         | 
| 713 1246 | 
             
                str = new_str_buf;
         | 
| 714 1247 | 
             
              }
         | 
| 715 1248 |  | 
| @@ -717,98 +1250,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) { | |
| 717 1250 | 
             
            }
         | 
| 718 1251 |  | 
| 719 1252 | 
             
            static VALUE
         | 
| 720 | 
            -
             | 
| 721 | 
            -
             | 
| 1253 | 
            +
            cs_method_delete_in(VALUE self, VALUE str)
         | 
| 1254 | 
            +
            {
         | 
| 1255 | 
            +
              return cs_apply_to_str(self, str, 1, 0);
         | 
| 1256 | 
            +
            }
         | 
| 1257 | 
            +
             | 
| 1258 | 
            +
            static VALUE
         | 
| 1259 | 
            +
            cs_method_delete_in_bang(VALUE self, VALUE str)
         | 
| 1260 | 
            +
            {
         | 
| 1261 | 
            +
              return cs_apply_to_str(self, str, 1, 1);
         | 
| 722 1262 | 
             
            }
         | 
| 723 1263 |  | 
| 724 1264 | 
             
            static VALUE
         | 
| 725 | 
            -
             | 
| 726 | 
            -
             | 
| 1265 | 
            +
            cs_method_keep_in(VALUE self, VALUE str)
         | 
| 1266 | 
            +
            {
         | 
| 1267 | 
            +
              return cs_apply_to_str(self, str, 0, 0);
         | 
| 727 1268 | 
             
            }
         | 
| 728 1269 |  | 
| 729 1270 | 
             
            static VALUE
         | 
| 730 | 
            -
             | 
| 731 | 
            -
             | 
| 1271 | 
            +
            cs_method_keep_in_bang(VALUE self, VALUE str)
         | 
| 1272 | 
            +
            {
         | 
| 1273 | 
            +
              return cs_apply_to_str(self, str, 0, 1);
         | 
| 732 1274 | 
             
            }
         | 
| 733 1275 |  | 
| 734 1276 | 
             
            static VALUE
         | 
| 735 | 
            -
             | 
| 736 | 
            -
             | 
| 1277 | 
            +
            cs_method_allocated_length(VALUE self)
         | 
| 1278 | 
            +
            {
         | 
| 1279 | 
            +
              return LONG2FIX(cs_fetch_data(self)->len);
         | 
| 737 1280 | 
             
            }
         | 
| 738 1281 |  | 
| 739 1282 | 
             
            // ****
         | 
| 740 1283 | 
             
            // init
         | 
| 741 1284 | 
             
            // ****
         | 
| 742 1285 |  | 
| 743 | 
            -
            void
         | 
| 744 | 
            -
            Init_character_set()
         | 
| 1286 | 
            +
            void Init_character_set()
         | 
| 745 1287 | 
             
            {
         | 
| 746 1288 | 
             
              VALUE cs = rb_define_class("CharacterSet", rb_cObject);
         | 
| 747 1289 |  | 
| 748 | 
            -
              rb_define_alloc_func(cs,  | 
| 1290 | 
            +
              rb_define_alloc_func(cs, cs_method_allocate);
         | 
| 749 1291 |  | 
| 750 1292 | 
             
              // `Set` compatibility methods
         | 
| 751 1293 |  | 
| 752 | 
            -
              rb_define_method(cs, "each", | 
| 753 | 
            -
              rb_define_method(cs, "to_a", | 
| 754 | 
            -
              rb_define_method(cs, "length", | 
| 755 | 
            -
              rb_define_method(cs, "size", | 
| 756 | 
            -
              rb_define_method(cs, " | 
| 757 | 
            -
              rb_define_method(cs, " | 
| 758 | 
            -
              rb_define_method(cs, " | 
| 759 | 
            -
              rb_define_method(cs, " | 
| 760 | 
            -
              rb_define_method(cs, " | 
| 761 | 
            -
              rb_define_method(cs, " | 
| 762 | 
            -
              rb_define_method(cs, " | 
| 763 | 
            -
              rb_define_method(cs, " | 
| 764 | 
            -
              rb_define_method(cs, " | 
| 765 | 
            -
              rb_define_method(cs, " | 
| 766 | 
            -
              rb_define_method(cs, " | 
| 767 | 
            -
              rb_define_method(cs, " | 
| 768 | 
            -
              rb_define_method(cs, " | 
| 769 | 
            -
              rb_define_method(cs, " | 
| 770 | 
            -
              rb_define_method(cs, " | 
| 771 | 
            -
              rb_define_method(cs, " | 
| 772 | 
            -
              rb_define_method(cs, " | 
| 773 | 
            -
              rb_define_method(cs, " | 
| 774 | 
            -
              rb_define_method(cs, " | 
| 775 | 
            -
              rb_define_method(cs, "add | 
| 776 | 
            -
              rb_define_method(cs, " | 
| 777 | 
            -
              rb_define_method(cs, " | 
| 778 | 
            -
              rb_define_method(cs, " | 
| 779 | 
            -
              rb_define_method(cs, " | 
| 780 | 
            -
              rb_define_method(cs, " | 
| 781 | 
            -
              rb_define_method(cs, " | 
| 782 | 
            -
              rb_define_method(cs, " | 
| 783 | 
            -
              rb_define_method(cs, " | 
| 784 | 
            -
              rb_define_method(cs, " | 
| 785 | 
            -
              rb_define_method(cs, " | 
| 786 | 
            -
              rb_define_method(cs, " | 
| 787 | 
            -
              rb_define_method(cs, " | 
| 788 | 
            -
              rb_define_method(cs, " | 
| 789 | 
            -
              rb_define_method(cs, " | 
| 790 | 
            -
              rb_define_method(cs, " | 
| 791 | 
            -
              rb_define_method(cs, " | 
| 792 | 
            -
              rb_define_method(cs, " | 
| 793 | 
            -
              rb_define_method(cs, " | 
| 1294 | 
            +
              rb_define_method(cs, "each", cs_method_each, 0);
         | 
| 1295 | 
            +
              rb_define_method(cs, "to_a", cs_method_to_a, -1);
         | 
| 1296 | 
            +
              rb_define_method(cs, "length", cs_method_length, 0);
         | 
| 1297 | 
            +
              rb_define_method(cs, "size", cs_method_length, 0);
         | 
| 1298 | 
            +
              rb_define_method(cs, "empty?", cs_method_empty_p, 0);
         | 
| 1299 | 
            +
              rb_define_method(cs, "hash", cs_method_hash, 0);
         | 
| 1300 | 
            +
              rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
         | 
| 1301 | 
            +
              rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
         | 
| 1302 | 
            +
              rb_define_method(cs, "clear", cs_method_clear, 0);
         | 
| 1303 | 
            +
              rb_define_method(cs, "min", cs_method_min, 0);
         | 
| 1304 | 
            +
              rb_define_method(cs, "max", cs_method_max, 0);
         | 
| 1305 | 
            +
              rb_define_method(cs, "minmax", cs_method_minmax, 0);
         | 
| 1306 | 
            +
              rb_define_method(cs, "intersection", cs_method_intersection, 1);
         | 
| 1307 | 
            +
              rb_define_method(cs, "&", cs_method_intersection, 1);
         | 
| 1308 | 
            +
              rb_define_method(cs, "union", cs_method_union, 1);
         | 
| 1309 | 
            +
              rb_define_method(cs, "+", cs_method_union, 1);
         | 
| 1310 | 
            +
              rb_define_method(cs, "|", cs_method_union, 1);
         | 
| 1311 | 
            +
              rb_define_method(cs, "difference", cs_method_difference, 1);
         | 
| 1312 | 
            +
              rb_define_method(cs, "-", cs_method_difference, 1);
         | 
| 1313 | 
            +
              rb_define_method(cs, "^", cs_method_exclusion, 1);
         | 
| 1314 | 
            +
              rb_define_method(cs, "include?", cs_method_include_p, 1);
         | 
| 1315 | 
            +
              rb_define_method(cs, "member?", cs_method_include_p, 1);
         | 
| 1316 | 
            +
              rb_define_method(cs, "===", cs_method_include_p, 1);
         | 
| 1317 | 
            +
              rb_define_method(cs, "add", cs_method_add, 1);
         | 
| 1318 | 
            +
              rb_define_method(cs, "<<", cs_method_add, 1);
         | 
| 1319 | 
            +
              rb_define_method(cs, "add?", cs_method_add_p, 1);
         | 
| 1320 | 
            +
              rb_define_method(cs, "delete", cs_method_delete, 1);
         | 
| 1321 | 
            +
              rb_define_method(cs, "delete?", cs_method_delete_p, 1);
         | 
| 1322 | 
            +
              rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
         | 
| 1323 | 
            +
              rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
         | 
| 1324 | 
            +
              rb_define_method(cs, "eql?", cs_method_eql_p, 1);
         | 
| 1325 | 
            +
              rb_define_method(cs, "==", cs_method_eql_p, 1);
         | 
| 1326 | 
            +
              rb_define_method(cs, "merge", cs_method_merge, 1);
         | 
| 1327 | 
            +
              rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
         | 
| 1328 | 
            +
              rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
         | 
| 1329 | 
            +
              rb_define_method(cs, "subtract", cs_method_subtract, 1);
         | 
| 1330 | 
            +
              rb_define_method(cs, "subset?", cs_method_subset_p, 1);
         | 
| 1331 | 
            +
              rb_define_method(cs, "<=", cs_method_subset_p, 1);
         | 
| 1332 | 
            +
              rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
         | 
| 1333 | 
            +
              rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
         | 
| 1334 | 
            +
              rb_define_method(cs, "superset?", cs_method_superset_p, 1);
         | 
| 1335 | 
            +
              rb_define_method(cs, ">=", cs_method_superset_p, 1);
         | 
| 1336 | 
            +
              rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
         | 
| 1337 | 
            +
              rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
         | 
| 794 1338 |  | 
| 795 1339 | 
             
              // `CharacterSet`-specific methods
         | 
| 796 1340 |  | 
| 797 | 
            -
              rb_define_singleton_method(cs, "from_ranges",  | 
| 798 | 
            -
              rb_define_singleton_method(cs, "of", | 
| 799 | 
            -
             | 
| 800 | 
            -
              rb_define_method(cs, "ranges", | 
| 801 | 
            -
              rb_define_method(cs, "sample", | 
| 802 | 
            -
              rb_define_method(cs, " | 
| 803 | 
            -
              rb_define_method(cs, " | 
| 804 | 
            -
              rb_define_method(cs, " | 
| 805 | 
            -
              rb_define_method(cs, " | 
| 806 | 
            -
              rb_define_method(cs, " | 
| 807 | 
            -
              rb_define_method(cs, " | 
| 808 | 
            -
              rb_define_method(cs, " | 
| 809 | 
            -
              rb_define_method(cs, " | 
| 810 | 
            -
              rb_define_method(cs, " | 
| 811 | 
            -
              rb_define_method(cs, " | 
| 812 | 
            -
              rb_define_method(cs, " | 
| 813 | 
            -
              rb_define_method(cs, " | 
| 1341 | 
            +
              rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
         | 
| 1342 | 
            +
              rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
         | 
| 1343 | 
            +
             | 
| 1344 | 
            +
              rb_define_method(cs, "ranges", cs_method_ranges, 0);
         | 
| 1345 | 
            +
              rb_define_method(cs, "sample", cs_method_sample, -1);
         | 
| 1346 | 
            +
              rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
         | 
| 1347 | 
            +
              rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
         | 
| 1348 | 
            +
              rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
         | 
| 1349 | 
            +
              rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
         | 
| 1350 | 
            +
              rb_define_method(cs, "planes", cs_method_planes, 0);
         | 
| 1351 | 
            +
              rb_define_method(cs, "plane", cs_method_plane, 1);
         | 
| 1352 | 
            +
              rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
         | 
| 1353 | 
            +
              rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
         | 
| 1354 | 
            +
              rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
         | 
| 1355 | 
            +
              rb_define_method(cs, "count_in", cs_method_count_in, 1);
         | 
| 1356 | 
            +
              rb_define_method(cs, "cover?", cs_method_cover_p, 1);
         | 
| 1357 | 
            +
              rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
         | 
| 1358 | 
            +
              rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
         | 
| 1359 | 
            +
              rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
         | 
| 1360 | 
            +
              rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
         | 
| 1361 | 
            +
              rb_define_method(cs, "scan", cs_method_scan, 1);
         | 
| 1362 | 
            +
              rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
         | 
| 1363 | 
            +
              rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
         | 
| 814 1364 | 
             
            }
         |