character_set 1.6.0-java → 1.7.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/BENCHMARK.md +32 -32
- data/CHANGELOG.md +15 -1
- data/README.md +1 -1
- data/Rakefile +2 -123
- data/character_set.gemspec +0 -7
- data/ext/character_set/character_set.c +64 -43
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +73 -52
- data/lib/character_set/predefined_sets/emoji.cps +10 -9
- data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
- data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
- data/lib/character_set/ruby_fallback.rb +2 -6
- data/lib/character_set/shared_methods.rb +2 -2
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +19 -28
- data/benchmarks/shared.rb +0 -30
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36050dd00f44b6efc26567bfd867ff21535fe1e35c9a8018d00f2145b27bfd37
|
4
|
+
data.tar.gz: d88d01cae2f5650271d73c654877b6cf62cf87acba9b8e699677b569c514b0e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 646450cc07172ffdbceaf6cf215c03a60487ced2fdf578c4467f08374b77f6a8d4e043cbb92c89ea2ebc39c5b5adf38f8d74502632e033dbc8982928d6002f99
|
7
|
+
data.tar.gz: 1d77ccb0abef9c591189a77ed862657a04020f6bf9b3f31b7760ced20cdbee962411e29375b7d3883e95d8c97cac992afc89b17dbdaafbe99f0af02cfa22a0e1
|
data/BENCHMARK.md
CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
|
|
3
3
|
```
|
4
4
|
Counting non-letters
|
5
5
|
|
6
|
-
CharacterSet#count_in:
|
7
|
-
String#count:
|
6
|
+
CharacterSet#count_in: 14627506.2 i/s
|
7
|
+
String#count: 3859777.0 i/s - 3.79x slower
|
8
8
|
```
|
9
9
|
```
|
10
10
|
Detecting non-whitespace
|
11
11
|
|
12
|
-
CharacterSet#cover?:
|
13
|
-
Regexp#match?:
|
12
|
+
CharacterSet#cover?: 17241902.8 i/s
|
13
|
+
Regexp#match?: 12971122.6 i/s - 1.33x slower
|
14
14
|
```
|
15
15
|
```
|
16
16
|
Detecting non-letters
|
17
17
|
|
18
|
-
CharacterSet#cover?:
|
19
|
-
Regexp#match?:
|
18
|
+
CharacterSet#cover?: 17243472.3 i/s
|
19
|
+
Regexp#match?: 7957626.9 i/s - 2.17x slower
|
20
20
|
```
|
21
21
|
```
|
22
22
|
Removing ASCII whitespace
|
23
23
|
|
24
|
-
CharacterSet#delete_in:
|
25
|
-
String#tr:
|
26
|
-
String#gsub:
|
24
|
+
CharacterSet#delete_in: 6190975.7 i/s
|
25
|
+
String#tr: 4722716.6 i/s - 1.31x slower
|
26
|
+
String#gsub: 214239.5 i/s - 28.90x slower
|
27
27
|
```
|
28
28
|
```
|
29
29
|
Removing whitespace, emoji and umlauts
|
30
30
|
|
31
|
-
CharacterSet#delete_in:
|
32
|
-
String#tr:
|
33
|
-
String#gsub:
|
31
|
+
CharacterSet#delete_in: 5890471.8 i/s
|
32
|
+
String#tr: 348506.8 i/s - 16.90x slower
|
33
|
+
String#gsub: 318268.3 i/s - 18.51x slower
|
34
34
|
```
|
35
35
|
```
|
36
36
|
Removing non-whitespace
|
37
37
|
|
38
|
-
CharacterSet#keep_in:
|
39
|
-
String#gsub:
|
40
|
-
String#tr:
|
38
|
+
CharacterSet#keep_in: 7396898.0 i/s
|
39
|
+
String#gsub: 208809.7 i/s - 35.42x slower
|
40
|
+
String#tr: 13.1 i/s - 564682.50x slower
|
41
41
|
```
|
42
42
|
```
|
43
43
|
Keeping only emoji
|
44
44
|
|
45
|
-
CharacterSet#keep_in:
|
46
|
-
String#gsub:
|
47
|
-
String#tr:
|
45
|
+
CharacterSet#keep_in: 7022741.1 i/s
|
46
|
+
String#gsub: 180939.6 i/s - 38.81x slower
|
47
|
+
String#tr: 13.1 i/s - 536724.50x slower
|
48
48
|
```
|
49
49
|
```
|
50
50
|
Extracting emoji to an Array
|
51
51
|
|
52
|
-
CharacterSet#scan:
|
53
|
-
String#scan:
|
52
|
+
CharacterSet#scan: 3023176.8 i/s
|
53
|
+
String#scan: 893225.8 i/s - 3.38x slower
|
54
54
|
```
|
55
55
|
```
|
56
56
|
Detecting whitespace
|
57
57
|
|
58
|
-
CharacterSet#used_by?:
|
59
|
-
Regexp#match?:
|
58
|
+
CharacterSet#used_by?: 17284025.9 i/s
|
59
|
+
Regexp#match?: 11847064.5 i/s - 1.46x slower
|
60
60
|
```
|
61
61
|
```
|
62
62
|
Detecting emoji in a large string
|
63
63
|
|
64
|
-
CharacterSet#used_by?:
|
65
|
-
Regexp#match?:
|
64
|
+
CharacterSet#used_by?: 341386.1 i/s
|
65
|
+
Regexp#match?: 183121.6 i/s - 1.86x slower
|
66
66
|
```
|
67
67
|
```
|
68
68
|
Adding entries
|
69
69
|
|
70
|
-
CharacterSet#add:
|
71
|
-
SortedSet#add:
|
70
|
+
CharacterSet#add: 4989762.3 i/s
|
71
|
+
SortedSet#add: 1157911.7 i/s - 4.31x slower
|
72
72
|
```
|
73
73
|
```
|
74
74
|
Removing entries
|
75
75
|
|
76
|
-
CharacterSet#delete:
|
77
|
-
SortedSet#delete:
|
76
|
+
CharacterSet#delete: 4996703.6 i/s
|
77
|
+
SortedSet#delete: 4177401.5 i/s - same-ish
|
78
78
|
```
|
79
79
|
```
|
80
80
|
Merging entries
|
81
81
|
|
82
|
-
CharacterSet#merge:
|
83
|
-
SortedSet#merge:
|
82
|
+
CharacterSet#merge: 666.7 i/s
|
83
|
+
SortedSet#merge: 4.0 i/s - 167.84x slower
|
84
84
|
```
|
85
85
|
```
|
86
86
|
Getting the min and max
|
87
87
|
|
88
|
-
CharacterSet#minmax:
|
89
|
-
SortedSet#minmax:
|
88
|
+
CharacterSet#minmax: 1596470.9 i/s
|
89
|
+
SortedSet#minmax: 866.4 i/s - 1842.74x slower
|
90
90
|
```
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [1.7.0] - 2023-05-12
|
10
|
+
|
11
|
+
### Added
|
12
|
+
|
13
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
|
14
|
+
|
15
|
+
### Fixed
|
16
|
+
|
17
|
+
- fixed processing of Strings that are not ASCII- or UTF8-encoded
|
18
|
+
- removed dependency on `set` and `sorted_set`
|
19
|
+
- thanks to https://github.com/mikebaldry for reporting a related issue (#2)
|
20
|
+
|
7
21
|
## [1.6.0] - 2022-02-16
|
8
22
|
|
9
23
|
### Added
|
@@ -63,7 +77,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
63
77
|
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
64
78
|
- before, every set instance required 136 KB for codepoints
|
65
79
|
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
66
|
-
- `#count_in` and `#
|
80
|
+
- `#count_in` and `#scan` methods for `String` interaction
|
67
81
|
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
68
82
|
- conversion methods `#assigned_part`, `#valid_part`
|
69
83
|
- sectioning methods `#ascii_part`, `#plane(n)`
|
data/README.md
CHANGED
@@ -96,7 +96,7 @@ string # => ''
|
|
96
96
|
|
97
97
|
```ruby
|
98
98
|
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
99
|
-
CharacterSet.non_ascii.
|
99
|
+
CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
|
100
100
|
```
|
101
101
|
|
102
102
|
There is also a core extension for String interaction.
|
data/Rakefile
CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
|
|
3
3
|
require 'rubygems/package_task'
|
4
4
|
require 'rake/extensiontask'
|
5
5
|
|
6
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
7
|
+
|
6
8
|
RSpec::Core::RakeTask.new(:spec)
|
7
9
|
|
8
10
|
task default: :spec
|
@@ -34,129 +36,6 @@ end
|
|
34
36
|
|
35
37
|
task package: 'java:gem'
|
36
38
|
|
37
|
-
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
|
38
|
-
task :sync_ruby_spec do
|
39
|
-
require 'fileutils'
|
40
|
-
|
41
|
-
variants = {
|
42
|
-
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
43
|
-
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
44
|
-
}
|
45
|
-
|
46
|
-
# download fresh specs from ruby/spec repository
|
47
|
-
variants.each do |_, dir|
|
48
|
-
FileUtils.rm_rf(dir) if File.exist?(dir)
|
49
|
-
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
50
|
-
end
|
51
|
-
|
52
|
-
# make copies for each CharacterSet variant
|
53
|
-
base = variants.first[1]
|
54
|
-
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
55
|
-
|
56
|
-
# adapt specs to work with CharacterSet
|
57
|
-
variants.each do |class_name, dir|
|
58
|
-
Dir["#{dir}/**/*.rb"].each do |spec|
|
59
|
-
# ignore some tests that do not apply or are covered otherwise
|
60
|
-
if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
|
61
|
-
File.delete(spec)
|
62
|
-
next
|
63
|
-
end
|
64
|
-
|
65
|
-
adapted_content =
|
66
|
-
File.read(spec).
|
67
|
-
# adapt class name
|
68
|
-
gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
|
69
|
-
gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
|
70
|
-
# get shared specs from a single shared dir at the parent level
|
71
|
-
gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
|
72
|
-
# make 'mspec' syntax rspec-compatible
|
73
|
-
gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
|
74
|
-
gsub(/be_(false|true)/, 'be \1').
|
75
|
-
gsub('stub!', 'stub').
|
76
|
-
gsub('mock', 'double').
|
77
|
-
gsub('@method', 'method').
|
78
|
-
# remove unneeded requires
|
79
|
-
gsub(/require 'set'\n/, '').
|
80
|
-
gsub(/require.*spec_helper.*\n/, '').
|
81
|
-
gsub(/\A\n+/, '').
|
82
|
-
# make examples use Integers/codepoints
|
83
|
-
gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
|
84
|
-
gsub('"one"', '1').
|
85
|
-
gsub('"two"', '2').
|
86
|
-
gsub('"three"', '3').
|
87
|
-
gsub('"four"', '4').
|
88
|
-
gsub('"five"', '5').
|
89
|
-
gsub(/x.(size|length) == 3/, 'x != 3').
|
90
|
-
gsub(/x.(size|length) != 3/, 'x == 3').
|
91
|
-
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
|
92
|
-
|
93
|
-
File.open(spec, 'w') { |f| f.puts adapted_content }
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# keep only one copy of the shared specs, at the parent level
|
98
|
-
FileUtils.rm_rf(base + '/../shared')
|
99
|
-
FileUtils.mv(base + '/shared', base + '/../')
|
100
|
-
variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
|
101
|
-
end
|
102
|
-
|
103
|
-
desc 'Download unicode casefold data and write new C header file'
|
104
|
-
task :sync_casefold_data do
|
105
|
-
src_path = './CaseFolding.txt'
|
106
|
-
dst_path = './ext/character_set/unicode_casefold_table.h'
|
107
|
-
|
108
|
-
`wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
|
109
|
-
|
110
|
-
mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
|
111
|
-
from, type, to = line.split(/\s*;\s*/).first(3)
|
112
|
-
# type 'C' stands for 'common', excludes mappings to multiple chars
|
113
|
-
hash[from] = to if type == 'C'
|
114
|
-
end.sort
|
115
|
-
|
116
|
-
content = File.read(dst_path + '.tmpl')
|
117
|
-
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
118
|
-
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
119
|
-
|
120
|
-
File.write(dst_path, content)
|
121
|
-
File.unlink(src_path)
|
122
|
-
end
|
123
|
-
|
124
|
-
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
125
|
-
task :sync_predefined_sets do
|
126
|
-
%w[assigned emoji whitespace].each do |prop|
|
127
|
-
require 'regexp_property_values'
|
128
|
-
ranges = RegexpPropertyValues[prop].matched_ranges
|
129
|
-
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
130
|
-
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
desc 'Run all IPS benchmarks'
|
135
|
-
task :benchmark do
|
136
|
-
Dir['./benchmarks/*.rb'].sort.each { |file| require file }
|
137
|
-
end
|
138
|
-
|
139
|
-
namespace :benchmark do
|
140
|
-
desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
|
141
|
-
task :write_to_file do
|
142
|
-
$store_comparison_results = {}
|
143
|
-
|
144
|
-
Rake.application[:benchmark].invoke
|
145
|
-
|
146
|
-
File.open('BENCHMARK.md', 'w') do |f|
|
147
|
-
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
148
|
-
|
149
|
-
$store_comparison_results.each do |caption, result|
|
150
|
-
f.puts '```',
|
151
|
-
caption,
|
152
|
-
'',
|
153
|
-
result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
|
154
|
-
'```'
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
39
|
unless RUBY_PLATFORM =~ /java/
|
161
40
|
# recompile before benchmarking or running specs
|
162
41
|
task(:benchmark).enhance([:compile])
|
data/character_set.gemspec
CHANGED
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
|
|
21
21
|
s.extensions = %w[ext/character_set/extconf.rb]
|
22
22
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
24
|
-
|
25
|
-
# SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
|
26
|
-
# This dependency is only used if the C extension is unavailable.
|
27
|
-
# JRuby has it in the stdlib.
|
28
|
-
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
29
|
-
s.add_dependency 'sorted_set', '~> 1.0'
|
30
|
-
end
|
31
24
|
end
|
@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
|
|
376
376
|
cps = data->cps;
|
377
377
|
len = data->len;
|
378
378
|
cp = FIX2ULONG(cp_num);
|
379
|
-
if (return_nil_if_noop &&
|
379
|
+
if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
|
380
380
|
{
|
381
381
|
return Qnil;
|
382
382
|
}
|
383
|
+
|
384
|
+
if (on)
|
385
|
+
{
|
386
|
+
set_cp(data, cp);
|
387
|
+
}
|
383
388
|
else
|
384
389
|
{
|
385
|
-
|
386
|
-
{
|
387
|
-
set_cp(data, cp);
|
388
|
-
}
|
389
|
-
else
|
390
|
-
{
|
391
|
-
clr_cp(cps, len, cp);
|
392
|
-
}
|
393
|
-
return cs;
|
390
|
+
clr_cp(cps, len, cp);
|
394
391
|
}
|
392
|
+
return cs;
|
395
393
|
}
|
396
394
|
|
397
395
|
static VALUE
|
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
|
|
575
573
|
{
|
576
574
|
return cs_merge_cs(self, other);
|
577
575
|
}
|
578
|
-
|
576
|
+
if (TYPE(other) == T_ARRAY)
|
579
577
|
{
|
580
578
|
return cs_merge_rb_array(self, other);
|
581
579
|
}
|
@@ -917,10 +915,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
|
|
917
915
|
return new_cs;
|
918
916
|
}
|
919
917
|
|
920
|
-
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE
|
918
|
+
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
|
921
919
|
|
922
920
|
static inline int
|
923
|
-
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
921
|
+
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
924
922
|
{
|
925
923
|
set_cp(data, str_cp);
|
926
924
|
return 1;
|
@@ -967,7 +965,7 @@ cs_method_case_insensitive(VALUE self)
|
|
967
965
|
}
|
968
966
|
|
969
967
|
static inline VALUE
|
970
|
-
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
968
|
+
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
971
969
|
{
|
972
970
|
long i, str_len;
|
973
971
|
unsigned int str_cp;
|
@@ -986,21 +984,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
986
984
|
}
|
987
985
|
|
988
986
|
static inline VALUE
|
989
|
-
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
987
|
+
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
990
988
|
{
|
991
989
|
int n;
|
992
990
|
unsigned int str_cp;
|
993
991
|
const char *ptr, *end;
|
994
|
-
rb_encoding *
|
992
|
+
rb_encoding *utf8;
|
993
|
+
|
994
|
+
utf8 = rb_utf8_encoding();
|
995
|
+
if (rb_enc_get(str) == utf8)
|
996
|
+
{
|
997
|
+
str = rb_str_new_frozen(str);
|
998
|
+
}
|
999
|
+
else
|
1000
|
+
{
|
1001
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1002
|
+
}
|
995
1003
|
|
996
|
-
str = rb_str_new_frozen(str);
|
997
1004
|
ptr = RSTRING_PTR(str);
|
998
1005
|
end = RSTRING_END(str);
|
999
|
-
enc = rb_enc_get(str);
|
1000
1006
|
|
1001
1007
|
while (ptr < end)
|
1002
1008
|
{
|
1003
|
-
str_cp = rb_enc_codepoint_len(ptr, end, &n,
|
1009
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
|
1004
1010
|
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
1005
1011
|
{
|
1006
1012
|
return Qfalse;
|
@@ -1031,12 +1037,13 @@ single_byte_optimizable(VALUE str)
|
|
1031
1037
|
}
|
1032
1038
|
|
1033
1039
|
static inline VALUE
|
1034
|
-
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1040
|
+
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1035
1041
|
{
|
1036
1042
|
if (single_byte_optimizable(str))
|
1037
1043
|
{
|
1038
1044
|
return each_sb_cp(str, func, cp_arr, len, data, memo);
|
1039
1045
|
}
|
1046
|
+
|
1040
1047
|
return each_mb_cp(str, func, cp_arr, len, data, memo);
|
1041
1048
|
}
|
1042
1049
|
|
@@ -1062,11 +1069,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
|
|
1062
1069
|
}
|
1063
1070
|
|
1064
1071
|
static inline int
|
1065
|
-
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1072
|
+
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1066
1073
|
{
|
1067
1074
|
if (tst_cp(cp_arr, len, str_cp))
|
1068
1075
|
{
|
1069
|
-
*memo += 1;
|
1076
|
+
*((VALUE *)memo) += 1;
|
1070
1077
|
}
|
1071
1078
|
return 1;
|
1072
1079
|
}
|
@@ -1074,17 +1081,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
|
|
1074
1081
|
static VALUE
|
1075
1082
|
cs_method_count_in(VALUE self, VALUE str)
|
1076
1083
|
{
|
1077
|
-
|
1084
|
+
long count;
|
1078
1085
|
struct cs_data *data;
|
1079
1086
|
raise_arg_err_unless_string(str);
|
1080
1087
|
data = cs_fetch_data(self);
|
1081
1088
|
count = 0;
|
1082
|
-
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
1083
|
-
return
|
1089
|
+
each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
|
1090
|
+
return LONG2FIX(count);
|
1084
1091
|
}
|
1085
1092
|
|
1086
1093
|
static inline int
|
1087
|
-
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1094
|
+
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1088
1095
|
{
|
1089
1096
|
return tst_cp(cp_arr, len, str_cp);
|
1090
1097
|
}
|
@@ -1099,11 +1106,11 @@ cs_method_cover_p(VALUE self, VALUE str)
|
|
1099
1106
|
}
|
1100
1107
|
|
1101
1108
|
static inline int
|
1102
|
-
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1109
|
+
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1103
1110
|
{
|
1104
1111
|
if (tst_cp(cp_arr, len, str_cp))
|
1105
1112
|
{
|
1106
|
-
rb_ary_push(memo
|
1113
|
+
rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
|
1107
1114
|
}
|
1108
1115
|
return 1;
|
1109
1116
|
}
|
@@ -1111,18 +1118,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
1111
1118
|
static VALUE
|
1112
1119
|
cs_method_scan(VALUE self, VALUE str)
|
1113
1120
|
{
|
1114
|
-
VALUE memo
|
1121
|
+
VALUE memo;
|
1115
1122
|
struct cs_data *data;
|
1116
1123
|
raise_arg_err_unless_string(str);
|
1117
1124
|
data = cs_fetch_data(self);
|
1118
|
-
memo
|
1119
|
-
memo[1] = (VALUE)rb_enc_get(str);
|
1125
|
+
memo = rb_ary_new();
|
1120
1126
|
each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
|
1121
|
-
return memo
|
1127
|
+
return memo;
|
1122
1128
|
}
|
1123
1129
|
|
1124
1130
|
static inline int
|
1125
|
-
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1131
|
+
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1126
1132
|
{
|
1127
1133
|
return !tst_cp(cp_arr, len, str_cp);
|
1128
1134
|
}
|
@@ -1146,9 +1152,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1146
1152
|
cs_cp cs_len;
|
1147
1153
|
VALUE orig_str_len;
|
1148
1154
|
|
1149
|
-
rb_encoding *
|
1155
|
+
rb_encoding *orig_enc, *utf8;
|
1150
1156
|
char *s, *send, *t;
|
1151
|
-
int
|
1157
|
+
int orig_was_utf8, cr;
|
1152
1158
|
|
1153
1159
|
raise_arg_err_unless_string(str);
|
1154
1160
|
|
@@ -1159,24 +1165,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1159
1165
|
return bang ? Qnil : str;
|
1160
1166
|
}
|
1161
1167
|
|
1162
|
-
|
1168
|
+
orig_enc = rb_enc_get(str);
|
1169
|
+
utf8 = rb_utf8_encoding();
|
1170
|
+
orig_was_utf8 = orig_enc == utf8;
|
1171
|
+
|
1172
|
+
if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
|
1173
|
+
{
|
1174
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1175
|
+
}
|
1176
|
+
else
|
1163
1177
|
{
|
1164
|
-
|
1178
|
+
if (!bang)
|
1179
|
+
{
|
1180
|
+
str = rb_str_dup(str);
|
1181
|
+
}
|
1165
1182
|
}
|
1166
1183
|
|
1167
1184
|
cps = cs_fetch_cps(set, &cs_len);
|
1168
1185
|
rb_str_modify(str);
|
1169
|
-
enc = rb_enc_get(str);
|
1170
|
-
ascompat = rb_enc_asciicompat(enc);
|
1171
1186
|
s = t = RSTRING_PTR(str);
|
1172
1187
|
send = RSTRING_END(str);
|
1173
|
-
cr =
|
1188
|
+
cr = ENC_CODERANGE_7BIT;
|
1189
|
+
|
1174
1190
|
while (s < send)
|
1175
1191
|
{
|
1176
1192
|
unsigned int c;
|
1177
1193
|
int clen;
|
1178
1194
|
|
1179
|
-
if (
|
1195
|
+
if ((c = *(unsigned char *)s) < 0x80)
|
1180
1196
|
{
|
1181
1197
|
if (tst_cp(cps, cs_len, c) != delete)
|
1182
1198
|
{
|
@@ -1188,12 +1204,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1188
1204
|
}
|
1189
1205
|
else
|
1190
1206
|
{
|
1191
|
-
c = rb_enc_codepoint_len(s, send, &clen,
|
1207
|
+
c = rb_enc_codepoint_len(s, send, &clen, utf8);
|
1192
1208
|
|
1193
1209
|
if (tst_cp(cps, cs_len, c) != delete)
|
1194
1210
|
{
|
1195
1211
|
if (t != s)
|
1196
|
-
rb_enc_mbcput(c, t,
|
1212
|
+
rb_enc_mbcput(c, t, utf8);
|
1197
1213
|
t += clen;
|
1198
1214
|
if (cr == ENC_CODERANGE_7BIT)
|
1199
1215
|
cr = ENC_CODERANGE_VALID;
|
@@ -1210,6 +1226,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1210
1226
|
return Qnil;
|
1211
1227
|
}
|
1212
1228
|
|
1229
|
+
if (!orig_was_utf8)
|
1230
|
+
{
|
1231
|
+
return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
|
1232
|
+
}
|
1233
|
+
|
1213
1234
|
return str;
|
1214
1235
|
}
|
1215
1236
|
|
data/lib/character_set/parser.rb
CHANGED
@@ -4,11 +4,15 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
def codepoints_from_enumerable(object)
|
6
6
|
raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
|
7
|
+
|
7
8
|
# Use #each to check first element (only this works for all Enumerables)
|
8
|
-
object.each do |
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
object.each do |el| # rubocop:disable Lint/UnreachableLoop
|
10
|
+
if el.is_a?(Integer) && el >= 0 && el < 0x110000
|
11
|
+
return object
|
12
|
+
elsif el.is_a?(String) && el.length == 1
|
13
|
+
return object.to_a.join.encode('utf-8').codepoints
|
14
|
+
end
|
15
|
+
raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
|
12
16
|
end
|
13
17
|
end
|
14
18
|
|