character_set 1.6.0-java → 1.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/BENCHMARK.md +32 -32
- data/CHANGELOG.md +15 -1
- data/README.md +1 -1
- data/Rakefile +2 -123
- data/character_set.gemspec +0 -7
- data/ext/character_set/character_set.c +64 -43
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +73 -52
- data/lib/character_set/predefined_sets/emoji.cps +10 -9
- data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
- data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
- data/lib/character_set/ruby_fallback.rb +2 -6
- data/lib/character_set/shared_methods.rb +2 -2
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +19 -28
- data/benchmarks/shared.rb +0 -30
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36050dd00f44b6efc26567bfd867ff21535fe1e35c9a8018d00f2145b27bfd37
|
4
|
+
data.tar.gz: d88d01cae2f5650271d73c654877b6cf62cf87acba9b8e699677b569c514b0e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 646450cc07172ffdbceaf6cf215c03a60487ced2fdf578c4467f08374b77f6a8d4e043cbb92c89ea2ebc39c5b5adf38f8d74502632e033dbc8982928d6002f99
|
7
|
+
data.tar.gz: 1d77ccb0abef9c591189a77ed862657a04020f6bf9b3f31b7760ced20cdbee962411e29375b7d3883e95d8c97cac992afc89b17dbdaafbe99f0af02cfa22a0e1
|
data/BENCHMARK.md
CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
|
|
3
3
|
```
|
4
4
|
Counting non-letters
|
5
5
|
|
6
|
-
CharacterSet#count_in:
|
7
|
-
String#count:
|
6
|
+
CharacterSet#count_in: 14627506.2 i/s
|
7
|
+
String#count: 3859777.0 i/s - 3.79x slower
|
8
8
|
```
|
9
9
|
```
|
10
10
|
Detecting non-whitespace
|
11
11
|
|
12
|
-
CharacterSet#cover?:
|
13
|
-
Regexp#match?:
|
12
|
+
CharacterSet#cover?: 17241902.8 i/s
|
13
|
+
Regexp#match?: 12971122.6 i/s - 1.33x slower
|
14
14
|
```
|
15
15
|
```
|
16
16
|
Detecting non-letters
|
17
17
|
|
18
|
-
CharacterSet#cover?:
|
19
|
-
Regexp#match?:
|
18
|
+
CharacterSet#cover?: 17243472.3 i/s
|
19
|
+
Regexp#match?: 7957626.9 i/s - 2.17x slower
|
20
20
|
```
|
21
21
|
```
|
22
22
|
Removing ASCII whitespace
|
23
23
|
|
24
|
-
CharacterSet#delete_in:
|
25
|
-
String#tr:
|
26
|
-
String#gsub:
|
24
|
+
CharacterSet#delete_in: 6190975.7 i/s
|
25
|
+
String#tr: 4722716.6 i/s - 1.31x slower
|
26
|
+
String#gsub: 214239.5 i/s - 28.90x slower
|
27
27
|
```
|
28
28
|
```
|
29
29
|
Removing whitespace, emoji and umlauts
|
30
30
|
|
31
|
-
CharacterSet#delete_in:
|
32
|
-
String#tr:
|
33
|
-
String#gsub:
|
31
|
+
CharacterSet#delete_in: 5890471.8 i/s
|
32
|
+
String#tr: 348506.8 i/s - 16.90x slower
|
33
|
+
String#gsub: 318268.3 i/s - 18.51x slower
|
34
34
|
```
|
35
35
|
```
|
36
36
|
Removing non-whitespace
|
37
37
|
|
38
|
-
CharacterSet#keep_in:
|
39
|
-
String#gsub:
|
40
|
-
String#tr:
|
38
|
+
CharacterSet#keep_in: 7396898.0 i/s
|
39
|
+
String#gsub: 208809.7 i/s - 35.42x slower
|
40
|
+
String#tr: 13.1 i/s - 564682.50x slower
|
41
41
|
```
|
42
42
|
```
|
43
43
|
Keeping only emoji
|
44
44
|
|
45
|
-
CharacterSet#keep_in:
|
46
|
-
String#gsub:
|
47
|
-
String#tr:
|
45
|
+
CharacterSet#keep_in: 7022741.1 i/s
|
46
|
+
String#gsub: 180939.6 i/s - 38.81x slower
|
47
|
+
String#tr: 13.1 i/s - 536724.50x slower
|
48
48
|
```
|
49
49
|
```
|
50
50
|
Extracting emoji to an Array
|
51
51
|
|
52
|
-
CharacterSet#scan:
|
53
|
-
String#scan:
|
52
|
+
CharacterSet#scan: 3023176.8 i/s
|
53
|
+
String#scan: 893225.8 i/s - 3.38x slower
|
54
54
|
```
|
55
55
|
```
|
56
56
|
Detecting whitespace
|
57
57
|
|
58
|
-
CharacterSet#used_by?:
|
59
|
-
Regexp#match?:
|
58
|
+
CharacterSet#used_by?: 17284025.9 i/s
|
59
|
+
Regexp#match?: 11847064.5 i/s - 1.46x slower
|
60
60
|
```
|
61
61
|
```
|
62
62
|
Detecting emoji in a large string
|
63
63
|
|
64
|
-
CharacterSet#used_by?:
|
65
|
-
Regexp#match?:
|
64
|
+
CharacterSet#used_by?: 341386.1 i/s
|
65
|
+
Regexp#match?: 183121.6 i/s - 1.86x slower
|
66
66
|
```
|
67
67
|
```
|
68
68
|
Adding entries
|
69
69
|
|
70
|
-
CharacterSet#add:
|
71
|
-
SortedSet#add:
|
70
|
+
CharacterSet#add: 4989762.3 i/s
|
71
|
+
SortedSet#add: 1157911.7 i/s - 4.31x slower
|
72
72
|
```
|
73
73
|
```
|
74
74
|
Removing entries
|
75
75
|
|
76
|
-
CharacterSet#delete:
|
77
|
-
SortedSet#delete:
|
76
|
+
CharacterSet#delete: 4996703.6 i/s
|
77
|
+
SortedSet#delete: 4177401.5 i/s - same-ish
|
78
78
|
```
|
79
79
|
```
|
80
80
|
Merging entries
|
81
81
|
|
82
|
-
CharacterSet#merge:
|
83
|
-
SortedSet#merge:
|
82
|
+
CharacterSet#merge: 666.7 i/s
|
83
|
+
SortedSet#merge: 4.0 i/s - 167.84x slower
|
84
84
|
```
|
85
85
|
```
|
86
86
|
Getting the min and max
|
87
87
|
|
88
|
-
CharacterSet#minmax:
|
89
|
-
SortedSet#minmax:
|
88
|
+
CharacterSet#minmax: 1596470.9 i/s
|
89
|
+
SortedSet#minmax: 866.4 i/s - 1842.74x slower
|
90
90
|
```
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [1.7.0] - 2023-05-12
|
10
|
+
|
11
|
+
### Added
|
12
|
+
|
13
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
|
14
|
+
|
15
|
+
### Fixed
|
16
|
+
|
17
|
+
- fixed processing of Strings that are not ASCII- or UTF8-encoded
|
18
|
+
- removed dependency on `set` and `sorted_set`
|
19
|
+
- thanks to https://github.com/mikebaldry for reporting a related issue (#2)
|
20
|
+
|
7
21
|
## [1.6.0] - 2022-02-16
|
8
22
|
|
9
23
|
### Added
|
@@ -63,7 +77,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
63
77
|
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
64
78
|
- before, every set instance required 136 KB for codepoints
|
65
79
|
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
66
|
-
- `#count_in` and `#
|
80
|
+
- `#count_in` and `#scan` methods for `String` interaction
|
67
81
|
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
68
82
|
- conversion methods `#assigned_part`, `#valid_part`
|
69
83
|
- sectioning methods `#ascii_part`, `#plane(n)`
|
data/README.md
CHANGED
@@ -96,7 +96,7 @@ string # => ''
|
|
96
96
|
|
97
97
|
```ruby
|
98
98
|
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
99
|
-
CharacterSet.non_ascii.
|
99
|
+
CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
|
100
100
|
```
|
101
101
|
|
102
102
|
There is also a core extension for String interaction.
|
data/Rakefile
CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
|
|
3
3
|
require 'rubygems/package_task'
|
4
4
|
require 'rake/extensiontask'
|
5
5
|
|
6
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
7
|
+
|
6
8
|
RSpec::Core::RakeTask.new(:spec)
|
7
9
|
|
8
10
|
task default: :spec
|
@@ -34,129 +36,6 @@ end
|
|
34
36
|
|
35
37
|
task package: 'java:gem'
|
36
38
|
|
37
|
-
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
|
38
|
-
task :sync_ruby_spec do
|
39
|
-
require 'fileutils'
|
40
|
-
|
41
|
-
variants = {
|
42
|
-
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
43
|
-
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
44
|
-
}
|
45
|
-
|
46
|
-
# download fresh specs from ruby/spec repository
|
47
|
-
variants.each do |_, dir|
|
48
|
-
FileUtils.rm_rf(dir) if File.exist?(dir)
|
49
|
-
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
50
|
-
end
|
51
|
-
|
52
|
-
# make copies for each CharacterSet variant
|
53
|
-
base = variants.first[1]
|
54
|
-
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
55
|
-
|
56
|
-
# adapt specs to work with CharacterSet
|
57
|
-
variants.each do |class_name, dir|
|
58
|
-
Dir["#{dir}/**/*.rb"].each do |spec|
|
59
|
-
# ignore some tests that do not apply or are covered otherwise
|
60
|
-
if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
|
61
|
-
File.delete(spec)
|
62
|
-
next
|
63
|
-
end
|
64
|
-
|
65
|
-
adapted_content =
|
66
|
-
File.read(spec).
|
67
|
-
# adapt class name
|
68
|
-
gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
|
69
|
-
gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
|
70
|
-
# get shared specs from a single shared dir at the parent level
|
71
|
-
gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
|
72
|
-
# make 'mspec' syntax rspec-compatible
|
73
|
-
gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
|
74
|
-
gsub(/be_(false|true)/, 'be \1').
|
75
|
-
gsub('stub!', 'stub').
|
76
|
-
gsub('mock', 'double').
|
77
|
-
gsub('@method', 'method').
|
78
|
-
# remove unneeded requires
|
79
|
-
gsub(/require 'set'\n/, '').
|
80
|
-
gsub(/require.*spec_helper.*\n/, '').
|
81
|
-
gsub(/\A\n+/, '').
|
82
|
-
# make examples use Integers/codepoints
|
83
|
-
gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
|
84
|
-
gsub('"one"', '1').
|
85
|
-
gsub('"two"', '2').
|
86
|
-
gsub('"three"', '3').
|
87
|
-
gsub('"four"', '4').
|
88
|
-
gsub('"five"', '5').
|
89
|
-
gsub(/x.(size|length) == 3/, 'x != 3').
|
90
|
-
gsub(/x.(size|length) != 3/, 'x == 3').
|
91
|
-
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
|
92
|
-
|
93
|
-
File.open(spec, 'w') { |f| f.puts adapted_content }
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# keep only one copy of the shared specs, at the parent level
|
98
|
-
FileUtils.rm_rf(base + '/../shared')
|
99
|
-
FileUtils.mv(base + '/shared', base + '/../')
|
100
|
-
variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
|
101
|
-
end
|
102
|
-
|
103
|
-
desc 'Download unicode casefold data and write new C header file'
|
104
|
-
task :sync_casefold_data do
|
105
|
-
src_path = './CaseFolding.txt'
|
106
|
-
dst_path = './ext/character_set/unicode_casefold_table.h'
|
107
|
-
|
108
|
-
`wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
|
109
|
-
|
110
|
-
mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
|
111
|
-
from, type, to = line.split(/\s*;\s*/).first(3)
|
112
|
-
# type 'C' stands for 'common', excludes mappings to multiple chars
|
113
|
-
hash[from] = to if type == 'C'
|
114
|
-
end.sort
|
115
|
-
|
116
|
-
content = File.read(dst_path + '.tmpl')
|
117
|
-
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
118
|
-
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
119
|
-
|
120
|
-
File.write(dst_path, content)
|
121
|
-
File.unlink(src_path)
|
122
|
-
end
|
123
|
-
|
124
|
-
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
125
|
-
task :sync_predefined_sets do
|
126
|
-
%w[assigned emoji whitespace].each do |prop|
|
127
|
-
require 'regexp_property_values'
|
128
|
-
ranges = RegexpPropertyValues[prop].matched_ranges
|
129
|
-
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
130
|
-
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
desc 'Run all IPS benchmarks'
|
135
|
-
task :benchmark do
|
136
|
-
Dir['./benchmarks/*.rb'].sort.each { |file| require file }
|
137
|
-
end
|
138
|
-
|
139
|
-
namespace :benchmark do
|
140
|
-
desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
|
141
|
-
task :write_to_file do
|
142
|
-
$store_comparison_results = {}
|
143
|
-
|
144
|
-
Rake.application[:benchmark].invoke
|
145
|
-
|
146
|
-
File.open('BENCHMARK.md', 'w') do |f|
|
147
|
-
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
148
|
-
|
149
|
-
$store_comparison_results.each do |caption, result|
|
150
|
-
f.puts '```',
|
151
|
-
caption,
|
152
|
-
'',
|
153
|
-
result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
|
154
|
-
'```'
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
39
|
unless RUBY_PLATFORM =~ /java/
|
161
40
|
# recompile before benchmarking or running specs
|
162
41
|
task(:benchmark).enhance([:compile])
|
data/character_set.gemspec
CHANGED
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
|
|
21
21
|
s.extensions = %w[ext/character_set/extconf.rb]
|
22
22
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
24
|
-
|
25
|
-
# SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
|
26
|
-
# This dependency is only used if the C extension is unavailable.
|
27
|
-
# JRuby has it in the stdlib.
|
28
|
-
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
29
|
-
s.add_dependency 'sorted_set', '~> 1.0'
|
30
|
-
end
|
31
24
|
end
|
@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
|
|
376
376
|
cps = data->cps;
|
377
377
|
len = data->len;
|
378
378
|
cp = FIX2ULONG(cp_num);
|
379
|
-
if (return_nil_if_noop &&
|
379
|
+
if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
|
380
380
|
{
|
381
381
|
return Qnil;
|
382
382
|
}
|
383
|
+
|
384
|
+
if (on)
|
385
|
+
{
|
386
|
+
set_cp(data, cp);
|
387
|
+
}
|
383
388
|
else
|
384
389
|
{
|
385
|
-
|
386
|
-
{
|
387
|
-
set_cp(data, cp);
|
388
|
-
}
|
389
|
-
else
|
390
|
-
{
|
391
|
-
clr_cp(cps, len, cp);
|
392
|
-
}
|
393
|
-
return cs;
|
390
|
+
clr_cp(cps, len, cp);
|
394
391
|
}
|
392
|
+
return cs;
|
395
393
|
}
|
396
394
|
|
397
395
|
static VALUE
|
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
|
|
575
573
|
{
|
576
574
|
return cs_merge_cs(self, other);
|
577
575
|
}
|
578
|
-
|
576
|
+
if (TYPE(other) == T_ARRAY)
|
579
577
|
{
|
580
578
|
return cs_merge_rb_array(self, other);
|
581
579
|
}
|
@@ -917,10 +915,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
|
|
917
915
|
return new_cs;
|
918
916
|
}
|
919
917
|
|
920
|
-
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE
|
918
|
+
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
|
921
919
|
|
922
920
|
static inline int
|
923
|
-
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
921
|
+
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
924
922
|
{
|
925
923
|
set_cp(data, str_cp);
|
926
924
|
return 1;
|
@@ -967,7 +965,7 @@ cs_method_case_insensitive(VALUE self)
|
|
967
965
|
}
|
968
966
|
|
969
967
|
static inline VALUE
|
970
|
-
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
968
|
+
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
971
969
|
{
|
972
970
|
long i, str_len;
|
973
971
|
unsigned int str_cp;
|
@@ -986,21 +984,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
986
984
|
}
|
987
985
|
|
988
986
|
static inline VALUE
|
989
|
-
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
987
|
+
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
990
988
|
{
|
991
989
|
int n;
|
992
990
|
unsigned int str_cp;
|
993
991
|
const char *ptr, *end;
|
994
|
-
rb_encoding *
|
992
|
+
rb_encoding *utf8;
|
993
|
+
|
994
|
+
utf8 = rb_utf8_encoding();
|
995
|
+
if (rb_enc_get(str) == utf8)
|
996
|
+
{
|
997
|
+
str = rb_str_new_frozen(str);
|
998
|
+
}
|
999
|
+
else
|
1000
|
+
{
|
1001
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1002
|
+
}
|
995
1003
|
|
996
|
-
str = rb_str_new_frozen(str);
|
997
1004
|
ptr = RSTRING_PTR(str);
|
998
1005
|
end = RSTRING_END(str);
|
999
|
-
enc = rb_enc_get(str);
|
1000
1006
|
|
1001
1007
|
while (ptr < end)
|
1002
1008
|
{
|
1003
|
-
str_cp = rb_enc_codepoint_len(ptr, end, &n,
|
1009
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
|
1004
1010
|
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
1005
1011
|
{
|
1006
1012
|
return Qfalse;
|
@@ -1031,12 +1037,13 @@ single_byte_optimizable(VALUE str)
|
|
1031
1037
|
}
|
1032
1038
|
|
1033
1039
|
static inline VALUE
|
1034
|
-
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1040
|
+
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1035
1041
|
{
|
1036
1042
|
if (single_byte_optimizable(str))
|
1037
1043
|
{
|
1038
1044
|
return each_sb_cp(str, func, cp_arr, len, data, memo);
|
1039
1045
|
}
|
1046
|
+
|
1040
1047
|
return each_mb_cp(str, func, cp_arr, len, data, memo);
|
1041
1048
|
}
|
1042
1049
|
|
@@ -1062,11 +1069,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
|
|
1062
1069
|
}
|
1063
1070
|
|
1064
1071
|
static inline int
|
1065
|
-
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1072
|
+
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1066
1073
|
{
|
1067
1074
|
if (tst_cp(cp_arr, len, str_cp))
|
1068
1075
|
{
|
1069
|
-
*memo += 1;
|
1076
|
+
*((VALUE *)memo) += 1;
|
1070
1077
|
}
|
1071
1078
|
return 1;
|
1072
1079
|
}
|
@@ -1074,17 +1081,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
|
|
1074
1081
|
static VALUE
|
1075
1082
|
cs_method_count_in(VALUE self, VALUE str)
|
1076
1083
|
{
|
1077
|
-
|
1084
|
+
long count;
|
1078
1085
|
struct cs_data *data;
|
1079
1086
|
raise_arg_err_unless_string(str);
|
1080
1087
|
data = cs_fetch_data(self);
|
1081
1088
|
count = 0;
|
1082
|
-
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
1083
|
-
return
|
1089
|
+
each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
|
1090
|
+
return LONG2FIX(count);
|
1084
1091
|
}
|
1085
1092
|
|
1086
1093
|
static inline int
|
1087
|
-
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1094
|
+
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1088
1095
|
{
|
1089
1096
|
return tst_cp(cp_arr, len, str_cp);
|
1090
1097
|
}
|
@@ -1099,11 +1106,11 @@ cs_method_cover_p(VALUE self, VALUE str)
|
|
1099
1106
|
}
|
1100
1107
|
|
1101
1108
|
static inline int
|
1102
|
-
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1109
|
+
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1103
1110
|
{
|
1104
1111
|
if (tst_cp(cp_arr, len, str_cp))
|
1105
1112
|
{
|
1106
|
-
rb_ary_push(memo
|
1113
|
+
rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
|
1107
1114
|
}
|
1108
1115
|
return 1;
|
1109
1116
|
}
|
@@ -1111,18 +1118,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
1111
1118
|
static VALUE
|
1112
1119
|
cs_method_scan(VALUE self, VALUE str)
|
1113
1120
|
{
|
1114
|
-
VALUE memo
|
1121
|
+
VALUE memo;
|
1115
1122
|
struct cs_data *data;
|
1116
1123
|
raise_arg_err_unless_string(str);
|
1117
1124
|
data = cs_fetch_data(self);
|
1118
|
-
memo
|
1119
|
-
memo[1] = (VALUE)rb_enc_get(str);
|
1125
|
+
memo = rb_ary_new();
|
1120
1126
|
each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
|
1121
|
-
return memo
|
1127
|
+
return memo;
|
1122
1128
|
}
|
1123
1129
|
|
1124
1130
|
static inline int
|
1125
|
-
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1131
|
+
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1126
1132
|
{
|
1127
1133
|
return !tst_cp(cp_arr, len, str_cp);
|
1128
1134
|
}
|
@@ -1146,9 +1152,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1146
1152
|
cs_cp cs_len;
|
1147
1153
|
VALUE orig_str_len;
|
1148
1154
|
|
1149
|
-
rb_encoding *
|
1155
|
+
rb_encoding *orig_enc, *utf8;
|
1150
1156
|
char *s, *send, *t;
|
1151
|
-
int
|
1157
|
+
int orig_was_utf8, cr;
|
1152
1158
|
|
1153
1159
|
raise_arg_err_unless_string(str);
|
1154
1160
|
|
@@ -1159,24 +1165,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1159
1165
|
return bang ? Qnil : str;
|
1160
1166
|
}
|
1161
1167
|
|
1162
|
-
|
1168
|
+
orig_enc = rb_enc_get(str);
|
1169
|
+
utf8 = rb_utf8_encoding();
|
1170
|
+
orig_was_utf8 = orig_enc == utf8;
|
1171
|
+
|
1172
|
+
if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
|
1173
|
+
{
|
1174
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1175
|
+
}
|
1176
|
+
else
|
1163
1177
|
{
|
1164
|
-
|
1178
|
+
if (!bang)
|
1179
|
+
{
|
1180
|
+
str = rb_str_dup(str);
|
1181
|
+
}
|
1165
1182
|
}
|
1166
1183
|
|
1167
1184
|
cps = cs_fetch_cps(set, &cs_len);
|
1168
1185
|
rb_str_modify(str);
|
1169
|
-
enc = rb_enc_get(str);
|
1170
|
-
ascompat = rb_enc_asciicompat(enc);
|
1171
1186
|
s = t = RSTRING_PTR(str);
|
1172
1187
|
send = RSTRING_END(str);
|
1173
|
-
cr =
|
1188
|
+
cr = ENC_CODERANGE_7BIT;
|
1189
|
+
|
1174
1190
|
while (s < send)
|
1175
1191
|
{
|
1176
1192
|
unsigned int c;
|
1177
1193
|
int clen;
|
1178
1194
|
|
1179
|
-
if (
|
1195
|
+
if ((c = *(unsigned char *)s) < 0x80)
|
1180
1196
|
{
|
1181
1197
|
if (tst_cp(cps, cs_len, c) != delete)
|
1182
1198
|
{
|
@@ -1188,12 +1204,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1188
1204
|
}
|
1189
1205
|
else
|
1190
1206
|
{
|
1191
|
-
c = rb_enc_codepoint_len(s, send, &clen,
|
1207
|
+
c = rb_enc_codepoint_len(s, send, &clen, utf8);
|
1192
1208
|
|
1193
1209
|
if (tst_cp(cps, cs_len, c) != delete)
|
1194
1210
|
{
|
1195
1211
|
if (t != s)
|
1196
|
-
rb_enc_mbcput(c, t,
|
1212
|
+
rb_enc_mbcput(c, t, utf8);
|
1197
1213
|
t += clen;
|
1198
1214
|
if (cr == ENC_CODERANGE_7BIT)
|
1199
1215
|
cr = ENC_CODERANGE_VALID;
|
@@ -1210,6 +1226,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1210
1226
|
return Qnil;
|
1211
1227
|
}
|
1212
1228
|
|
1229
|
+
if (!orig_was_utf8)
|
1230
|
+
{
|
1231
|
+
return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
|
1232
|
+
}
|
1233
|
+
|
1213
1234
|
return str;
|
1214
1235
|
}
|
1215
1236
|
|
data/lib/character_set/parser.rb
CHANGED
@@ -4,11 +4,15 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
def codepoints_from_enumerable(object)
|
6
6
|
raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
|
7
|
+
|
7
8
|
# Use #each to check first element (only this works for all Enumerables)
|
8
|
-
object.each do |
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
object.each do |el| # rubocop:disable Lint/UnreachableLoop
|
10
|
+
if el.is_a?(Integer) && el >= 0 && el < 0x110000
|
11
|
+
return object
|
12
|
+
elsif el.is_a?(String) && el.length == 1
|
13
|
+
return object.to_a.join.encode('utf-8').codepoints
|
14
|
+
end
|
15
|
+
raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
|
12
16
|
end
|
13
17
|
end
|
14
18
|
|