utf8_proc 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/ext/utf8_proc/utf8_proc.c +98 -12
- data/lib/utf8_proc.rb +10 -0
- data/lib/utf8_proc/benchmark.rb +4 -2
- data/lib/utf8_proc/core_ext/string.rb +18 -5
- data/lib/utf8_proc/core_ext/string_jruby.rb +46 -23
- data/lib/utf8_proc/jruby.rb +12 -0
- data/lib/utf8_proc/version.rb +2 -1
- data/utf8_proc.gemspec +3 -2
- metadata +23 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33591974889df9c707aed4f9cb97b04f623736c1
|
4
|
+
data.tar.gz: c7d78939a0a7b9a7f4c5c5f74047aa05a75eee61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62c02a23182fe04aae49a257b8a5d40a7b344d3ea7ffb3070cd763a8224be9e045eb427eab04d7ed5e85a20e02e906f3f610a6f46404abcdb6d4dd5e08c4d8e0
|
7
|
+
data.tar.gz: 9ec79b0312bb78e1e0e3575432c52024d4cce0456cfd009bf2a8cf8acca196e8a53d7cd1012ee54444d0887425c6f2641fa86efe91a1e85207c9e1e476cba6a4
|
data/README.md
CHANGED
data/ext/utf8_proc/utf8_proc.c
CHANGED
@@ -65,58 +65,133 @@ static inline VALUE normInternal(VALUE *string, utf8proc_option_t options) {
|
|
65
65
|
return new_str;
|
66
66
|
}
|
67
67
|
|
68
|
-
|
69
|
-
|
68
|
+
/**
|
69
|
+
* Normalizes a String using NFC (Canonical Decomposition, followed by Canonical
|
70
|
+
* Composition)
|
71
|
+
*
|
72
|
+
* @param string [String] the String to normalize
|
73
|
+
*
|
74
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
75
|
+
* @return [String] a normalized string
|
76
|
+
*/
|
70
77
|
static VALUE toNFC(VALUE self, VALUE string) {
|
71
78
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
72
79
|
}
|
73
80
|
|
81
|
+
/**
|
82
|
+
* Normalizes self using NFC (Canonical Decomposition, followed by Canonical
|
83
|
+
* Composition)
|
84
|
+
*
|
85
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
86
|
+
* @return [String] a normalized copy of the string
|
87
|
+
*/
|
74
88
|
static VALUE StoNFC(VALUE string) {
|
75
89
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
76
90
|
}
|
77
91
|
|
78
|
-
|
79
|
-
|
92
|
+
/**
|
93
|
+
* Normalizes a string using NFD (Canonical Decomposition)
|
94
|
+
*
|
95
|
+
* @param string [String] the String to normalize
|
96
|
+
*
|
97
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
98
|
+
* @return [String] a normalized string
|
99
|
+
*/
|
80
100
|
static VALUE toNFD(VALUE self, VALUE string) {
|
81
101
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
|
82
102
|
}
|
83
103
|
|
104
|
+
/**
|
105
|
+
* Normalizes self using NFD (Canonical Decomposition)
|
106
|
+
*
|
107
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
108
|
+
* @return [String] a normalized copy of the string
|
109
|
+
*/
|
84
110
|
static VALUE StoNFD(VALUE string) {
|
85
111
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
|
86
112
|
}
|
87
113
|
|
88
|
-
|
89
|
-
|
114
|
+
/**
|
115
|
+
* Normalizes a string using NFKC (Compatibility Decomposition, followed by
|
116
|
+
* Canonical Composition)
|
117
|
+
*
|
118
|
+
* @param string [String] the String to normalize
|
119
|
+
*
|
120
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
121
|
+
* @return [String] a normalized string
|
122
|
+
*/
|
90
123
|
static VALUE toNFKC(VALUE self, VALUE string) {
|
91
124
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
92
125
|
}
|
93
126
|
|
127
|
+
/**
|
128
|
+
* Normalizes self using NFKC (Compatibility Decomposition, followed by
|
129
|
+
* Canonical Composition)
|
130
|
+
*
|
131
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
132
|
+
* @return [String] a normalized copy of the string
|
133
|
+
*/
|
94
134
|
static VALUE StoNFKC(VALUE string) {
|
95
135
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
96
136
|
}
|
97
137
|
|
98
|
-
|
99
|
-
|
138
|
+
/**
|
139
|
+
* Normalizes a string using NFKD (Compatibility Decomposition)
|
140
|
+
*
|
141
|
+
* @param string [String] the String to normalize
|
142
|
+
*
|
143
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
144
|
+
* @return [String] a normalized string
|
145
|
+
*/
|
100
146
|
static VALUE toNFKD(VALUE self, VALUE string) {
|
101
147
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
102
148
|
}
|
103
149
|
|
150
|
+
/**
|
151
|
+
* Normalizes self using NFKD (Compatibility Decomposition)
|
152
|
+
*
|
153
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
154
|
+
* @return [String] a normalized copy of the string
|
155
|
+
*/
|
104
156
|
static VALUE StoNFKD(VALUE string) {
|
105
157
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
106
158
|
}
|
107
159
|
|
108
|
-
|
109
|
-
|
160
|
+
/**
|
161
|
+
* Normalizes a string using NFKC (Compatibility Decomposition, followed by
|
162
|
+
* Canonical Composition) with case-folding
|
163
|
+
*
|
164
|
+
* @param string [String] the String to normalize
|
165
|
+
*
|
166
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
167
|
+
* @return [String] a normalized string
|
168
|
+
*/
|
110
169
|
static VALUE toNFKC_CF(VALUE self, VALUE string) {
|
111
170
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
|
112
171
|
}
|
113
172
|
|
173
|
+
/**
|
174
|
+
* Normalizes self using NFKC (Compatibility Decomposition, followed by
|
175
|
+
* Canonical Composition) with case-folding
|
176
|
+
*
|
177
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
178
|
+
* @return [String] a normalized copy of the string
|
179
|
+
*/
|
114
180
|
static VALUE StoNFKC_CF(VALUE string) {
|
115
181
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
|
116
182
|
}
|
117
183
|
|
118
|
-
|
119
|
-
|
184
|
+
/**
|
185
|
+
* @overload normalize(string, form = :nfc)
|
186
|
+
* Normalizes a string according to one of the 5 possible forms
|
187
|
+
*
|
188
|
+
* @param string [String] the String to normalize
|
189
|
+
* @param form [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] the normalization form
|
190
|
+
*
|
191
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
192
|
+
* @raise [ArgumentError] if *form* is not one of the 5 valid forms
|
193
|
+
* @return [String] a normalized string
|
194
|
+
*/
|
120
195
|
static VALUE toNorm(int argc, VALUE* argv, VALUE self){
|
121
196
|
VALUE string;
|
122
197
|
VALUE form;
|
@@ -145,6 +220,16 @@ static VALUE toNorm(int argc, VALUE* argv, VALUE self){
|
|
145
220
|
}
|
146
221
|
}
|
147
222
|
|
223
|
+
/**
|
224
|
+
* @overload normalize(form = :nfc)
|
225
|
+
* Normalizes self according to one of the 5 possible forms
|
226
|
+
*
|
227
|
+
* @param form [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] the normalization form
|
228
|
+
*
|
229
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
230
|
+
* @raise [ArgumentError] if *form* is not one of the 5 valid forms
|
231
|
+
* @return [String] a normalized copy of the string
|
232
|
+
*/
|
148
233
|
static VALUE StoNorm(int argc, VALUE* argv, VALUE string){
|
149
234
|
VALUE form;
|
150
235
|
rb_scan_args(argc, argv, "01", &form);
|
@@ -186,6 +271,7 @@ void Init_utf8_proc(void) {
|
|
186
271
|
|
187
272
|
const char *libVersion;
|
188
273
|
libVersion = utf8proc_version();
|
274
|
+
// Version string reported by the utf8proc library
|
189
275
|
rb_define_const(rb_mBase, "LIBRARY_VERSION", rb_str_freeze(
|
190
276
|
rb_enc_str_new(libVersion, strlen(libVersion), enc_utf8)
|
191
277
|
));
|
data/lib/utf8_proc.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require "utf8_proc/version"
|
3
3
|
require "utf8_proc/benchmark"
|
4
4
|
|
5
|
+
# Unicode string normalization library using UTF8Proc
|
5
6
|
module UTF8Proc
|
6
7
|
if RUBY_ENGINE == "jruby"
|
7
8
|
require "utf8_proc/jruby"
|
@@ -9,4 +10,13 @@ module UTF8Proc
|
|
9
10
|
else
|
10
11
|
require "utf8_proc/utf8_proc"
|
11
12
|
end
|
13
|
+
|
14
|
+
# Add lowercase name aliases for normalization methods
|
15
|
+
class << self
|
16
|
+
alias nfc NFC
|
17
|
+
alias nfd NFD
|
18
|
+
alias nfkc NFKC
|
19
|
+
alias nfkd NFKD
|
20
|
+
alias nfkc_cf NFKC_CF
|
21
|
+
end
|
12
22
|
end
|
data/lib/utf8_proc/benchmark.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
-
|
2
|
+
|
3
3
|
module UTF8Proc
|
4
|
+
# Benchmark module for comparing the speed of *UTF8Proc* and *UNF*
|
4
5
|
module Benchmark
|
5
6
|
module_function
|
6
7
|
|
7
|
-
|
8
|
+
# Runs the benchmark and displays the results.
|
9
|
+
def run # rubocop:disable MethodLength
|
8
10
|
require "benchmark/ips"
|
9
11
|
require "unf"
|
10
12
|
# Various different normalizations of Unicode characters.
|
@@ -2,10 +2,23 @@
|
|
2
2
|
|
3
3
|
require "utf8_proc"
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
module UTF8Proc
|
6
|
+
# Module containing C core extension methods for the {::String} class.
|
7
|
+
#
|
8
|
+
# You can activate this by using:
|
9
|
+
# require "utf8_proc/core_ext/string"
|
10
|
+
#
|
11
|
+
# It will load either C or Java extensions, depending on your Ruby engine.
|
12
|
+
module StringExtension
|
13
|
+
if RUBY_ENGINE == "jruby"
|
14
|
+
require "utf8_proc/core_ext/string_jruby"
|
15
|
+
else
|
16
|
+
alias nfc NFC
|
17
|
+
alias nfd NFD
|
18
|
+
alias nfkc NFKC
|
19
|
+
alias nfkd NFKD
|
20
|
+
alias nfkc_cf NFKC_CF
|
21
|
+
String.send(:include, ::UTF8Proc::StringExtension)
|
22
|
+
end
|
10
23
|
end
|
11
24
|
end
|
@@ -6,28 +6,51 @@
|
|
6
6
|
require "java"
|
7
7
|
require "utf8_proc"
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
::
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
9
|
+
module UTF8Proc
|
10
|
+
module JRuby
|
11
|
+
# Module containing JRuby core extension methods for the {::String} class.
|
12
|
+
#
|
13
|
+
# You can activate this by using:
|
14
|
+
# require "utf8_proc/core_ext/string"
|
15
|
+
#
|
16
|
+
# It will load either C or Java extensions, depending on your Ruby engine.
|
17
|
+
module StringExtension
|
18
|
+
# @see UTF8Proc::StringExtension#NFC
|
19
|
+
def NFC
|
20
|
+
::UTF8Proc.NFC(self)
|
21
|
+
end
|
22
|
+
alias nfc NFC
|
23
|
+
|
24
|
+
# @see UTF8Proc::StringExtension#NFD
|
25
|
+
def NFD
|
26
|
+
::UTF8Proc.NFD(self)
|
27
|
+
end
|
28
|
+
alias nfd NFD
|
29
|
+
|
30
|
+
# @see UTF8Proc::StringExtension#NFKC
|
31
|
+
def NFKC
|
32
|
+
::UTF8Proc.NFKC(self)
|
33
|
+
end
|
34
|
+
alias nfkc NFKC
|
35
|
+
|
36
|
+
# @see UTF8Proc::StringExtension#NFKD
|
37
|
+
def NFKD
|
38
|
+
::UTF8Proc.NFKD(self)
|
39
|
+
end
|
40
|
+
alias nfkd NFKD
|
41
|
+
|
42
|
+
# @see UTF8Proc::StringExtension#NFKC_CF
|
43
|
+
def NFKC_CF
|
44
|
+
::UTF8Proc.NFKC_CF(self)
|
45
|
+
end
|
46
|
+
alias nfkc_cf NFKC_CF
|
47
|
+
|
48
|
+
# @see UTF8Proc::StringExtension#normalize
|
49
|
+
def normalize(form = :nfc)
|
50
|
+
::UTF8Proc.normalize(self, form)
|
51
|
+
end
|
52
|
+
end
|
32
53
|
end
|
33
54
|
end
|
55
|
+
|
56
|
+
String.send(:include, ::UTF8Proc::JRuby::StringExtension)
|
data/lib/utf8_proc/jruby.rb
CHANGED
@@ -6,37 +6,49 @@
|
|
6
6
|
require "java"
|
7
7
|
|
8
8
|
module UTF8Proc
|
9
|
+
# JRuby normalization module.
|
10
|
+
#
|
11
|
+
# This module is loaded automatically when running on JRuby.
|
9
12
|
module JRuby
|
13
|
+
# Version string of the Java VM in use
|
10
14
|
LIBRARY_VERSION = "Java #{ENV_JAVA['java.version']}".freeze
|
11
15
|
|
12
16
|
JTNORM = java.text.Normalizer
|
13
17
|
private_constant :JTNORM
|
14
18
|
|
19
|
+
# @!visibility private
|
15
20
|
def self.included(receiver)
|
16
21
|
receiver.extend(ClassMethods)
|
17
22
|
end
|
18
23
|
|
24
|
+
# Methods added to the {::UTF8Proc} module in JRuby (instead of the C ones)
|
19
25
|
module ClassMethods
|
26
|
+
# @see UTF8Proc.NFC
|
20
27
|
def NFC(string)
|
21
28
|
JTNORM.normalize(string, JTNORM::Form::NFC)
|
22
29
|
end
|
23
30
|
|
31
|
+
# @see UTF8Proc.NFD
|
24
32
|
def NFD(string)
|
25
33
|
JTNORM.normalize(string, JTNORM::Form::NFD)
|
26
34
|
end
|
27
35
|
|
36
|
+
# @see UTF8Proc.NFKC
|
28
37
|
def NFKC(string)
|
29
38
|
JTNORM.normalize(string, JTNORM::Form::NFKC)
|
30
39
|
end
|
31
40
|
|
41
|
+
# @see UTF8Proc.NFKD
|
32
42
|
def NFKD(string)
|
33
43
|
JTNORM.normalize(string, JTNORM::Form::NFKD)
|
34
44
|
end
|
35
45
|
|
46
|
+
# @see UTF8Proc.NFKC_CF
|
36
47
|
def NFKC_CF(string)
|
37
48
|
NFKC(string).to_java(:string).toLowerCase
|
38
49
|
end
|
39
50
|
|
51
|
+
# @see UTF8Proc.normalize
|
40
52
|
def normalize(string, form = :nfc)
|
41
53
|
case form
|
42
54
|
when :nfc
|
data/lib/utf8_proc/version.rb
CHANGED
data/utf8_proc.gemspec
CHANGED
@@ -32,8 +32,9 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "pry", "~> 0.10"
|
33
33
|
spec.add_development_dependency "minitest", "~> 5.10"
|
34
34
|
spec.add_development_dependency "rubocop", "~> 0.47"
|
35
|
-
spec.add_development_dependency "
|
36
|
-
spec.add_development_dependency "
|
35
|
+
spec.add_development_dependency "yard", "~> 0.9"
|
36
|
+
spec.add_development_dependency "benchmark-ips", "~> 2.7"
|
37
|
+
spec.add_development_dependency "unf", "~> 0.1"
|
37
38
|
|
38
39
|
unless RUBY_ENGINE == "jruby"
|
39
40
|
spec.extensions = ["ext/utf8_proc/extconf.rb"]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8_proc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
@@ -80,34 +80,48 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0.47'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.9'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.9'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: benchmark-ips
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
103
|
+
version: '2.7'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
110
|
+
version: '2.7'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: unf
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
|
-
- - "
|
115
|
+
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
117
|
+
version: '0.1'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
|
-
- - "
|
122
|
+
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
124
|
+
version: '0.1'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: rake-compiler
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|