utf8_proc 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/ext/utf8_proc/utf8_proc.c +98 -12
- data/lib/utf8_proc.rb +10 -0
- data/lib/utf8_proc/benchmark.rb +4 -2
- data/lib/utf8_proc/core_ext/string.rb +18 -5
- data/lib/utf8_proc/core_ext/string_jruby.rb +46 -23
- data/lib/utf8_proc/jruby.rb +12 -0
- data/lib/utf8_proc/version.rb +2 -1
- data/utf8_proc.gemspec +3 -2
- metadata +23 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33591974889df9c707aed4f9cb97b04f623736c1
|
4
|
+
data.tar.gz: c7d78939a0a7b9a7f4c5c5f74047aa05a75eee61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62c02a23182fe04aae49a257b8a5d40a7b344d3ea7ffb3070cd763a8224be9e045eb427eab04d7ed5e85a20e02e906f3f610a6f46404abcdb6d4dd5e08c4d8e0
|
7
|
+
data.tar.gz: 9ec79b0312bb78e1e0e3575432c52024d4cce0456cfd009bf2a8cf8acca196e8a53d7cd1012ee54444d0887425c6f2641fa86efe91a1e85207c9e1e476cba6a4
|
data/README.md
CHANGED
data/ext/utf8_proc/utf8_proc.c
CHANGED
@@ -65,58 +65,133 @@ static inline VALUE normInternal(VALUE *string, utf8proc_option_t options) {
|
|
65
65
|
return new_str;
|
66
66
|
}
|
67
67
|
|
68
|
-
|
69
|
-
|
68
|
+
/**
|
69
|
+
* Normalizes a String using NFC (Canonical Decomposition, followed by Canonical
|
70
|
+
* Composition)
|
71
|
+
*
|
72
|
+
* @param string [String] the String to normalize
|
73
|
+
*
|
74
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
75
|
+
* @return [String] a normalized string
|
76
|
+
*/
|
70
77
|
static VALUE toNFC(VALUE self, VALUE string) {
|
71
78
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
72
79
|
}
|
73
80
|
|
81
|
+
/**
|
82
|
+
* Normalizes self using NFC (Canonical Decomposition, followed by Canonical
|
83
|
+
* Composition)
|
84
|
+
*
|
85
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
86
|
+
* @return [String] a normalized copy of the string
|
87
|
+
*/
|
74
88
|
static VALUE StoNFC(VALUE string) {
|
75
89
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
76
90
|
}
|
77
91
|
|
78
|
-
|
79
|
-
|
92
|
+
/**
|
93
|
+
* Normalizes a string using NFD (Canonical Decomposition)
|
94
|
+
*
|
95
|
+
* @param string [String] the String to normalize
|
96
|
+
*
|
97
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
98
|
+
* @return [String] a normalized string
|
99
|
+
*/
|
80
100
|
static VALUE toNFD(VALUE self, VALUE string) {
|
81
101
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
|
82
102
|
}
|
83
103
|
|
104
|
+
/**
|
105
|
+
* Normalizes self using NFD (Canonical Decomposition)
|
106
|
+
*
|
107
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
108
|
+
* @return [String] a normalized copy of the string
|
109
|
+
*/
|
84
110
|
static VALUE StoNFD(VALUE string) {
|
85
111
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
|
86
112
|
}
|
87
113
|
|
88
|
-
|
89
|
-
|
114
|
+
/**
|
115
|
+
* Normalizes a string using NFKC (Compatibility Decomposition, followed by
|
116
|
+
* Canonical Composition)
|
117
|
+
*
|
118
|
+
* @param string [String] the String to normalize
|
119
|
+
*
|
120
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
121
|
+
* @return [String] a normalized string
|
122
|
+
*/
|
90
123
|
static VALUE toNFKC(VALUE self, VALUE string) {
|
91
124
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
92
125
|
}
|
93
126
|
|
127
|
+
/**
|
128
|
+
* Normalizes self using NFKC (Compatibility Decomposition, followed by
|
129
|
+
* Canonical Composition)
|
130
|
+
*
|
131
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
132
|
+
* @return [String] a normalized copy of the string
|
133
|
+
*/
|
94
134
|
static VALUE StoNFKC(VALUE string) {
|
95
135
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
96
136
|
}
|
97
137
|
|
98
|
-
|
99
|
-
|
138
|
+
/**
|
139
|
+
* Normalizes a string using NFKD (Compatibility Decomposition)
|
140
|
+
*
|
141
|
+
* @param string [String] the String to normalize
|
142
|
+
*
|
143
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
144
|
+
* @return [String] a normalized string
|
145
|
+
*/
|
100
146
|
static VALUE toNFKD(VALUE self, VALUE string) {
|
101
147
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
102
148
|
}
|
103
149
|
|
150
|
+
/**
|
151
|
+
* Normalizes self using NFKD (Compatibility Decomposition)
|
152
|
+
*
|
153
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
154
|
+
* @return [String] a normalized copy of the string
|
155
|
+
*/
|
104
156
|
static VALUE StoNFKD(VALUE string) {
|
105
157
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
106
158
|
}
|
107
159
|
|
108
|
-
|
109
|
-
|
160
|
+
/**
|
161
|
+
* Normalizes a string using NFKC (Compatibility Decomposition, followed by
|
162
|
+
* Canonical Composition) with case-folding
|
163
|
+
*
|
164
|
+
* @param string [String] the String to normalize
|
165
|
+
*
|
166
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
167
|
+
* @return [String] a normalized string
|
168
|
+
*/
|
110
169
|
static VALUE toNFKC_CF(VALUE self, VALUE string) {
|
111
170
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
|
112
171
|
}
|
113
172
|
|
173
|
+
/**
|
174
|
+
* Normalizes self using NFKC (Compatibility Decomposition, followed by
|
175
|
+
* Canonical Composition) with case-folding
|
176
|
+
*
|
177
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
178
|
+
* @return [String] a normalized copy of the string
|
179
|
+
*/
|
114
180
|
static VALUE StoNFKC_CF(VALUE string) {
|
115
181
|
return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
|
116
182
|
}
|
117
183
|
|
118
|
-
|
119
|
-
|
184
|
+
/**
|
185
|
+
* @overload normalize(string, form = :nfc)
|
186
|
+
* Normalizes a string according to one of the 5 possible forms
|
187
|
+
*
|
188
|
+
* @param string [String] the String to normalize
|
189
|
+
* @param form [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] the normalization form
|
190
|
+
*
|
191
|
+
* @raise [EncodingError] if *string* is not encoded in *UTF-8* or *US-ASCII*
|
192
|
+
* @raise [ArgumentError] if *form* is not one of the 5 valid forms
|
193
|
+
* @return [String] a normalized string
|
194
|
+
*/
|
120
195
|
static VALUE toNorm(int argc, VALUE* argv, VALUE self){
|
121
196
|
VALUE string;
|
122
197
|
VALUE form;
|
@@ -145,6 +220,16 @@ static VALUE toNorm(int argc, VALUE* argv, VALUE self){
|
|
145
220
|
}
|
146
221
|
}
|
147
222
|
|
223
|
+
/**
|
224
|
+
* @overload normalize(form = :nfc)
|
225
|
+
* Normalizes self according to one of the 5 possible forms
|
226
|
+
*
|
227
|
+
* @param form [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] the normalization form
|
228
|
+
*
|
229
|
+
* @raise [EncodingError] if *self* is not encoded in *UTF-8* or *US-ASCII*
|
230
|
+
* @raise [ArgumentError] if *form* is not one of the 5 valid forms
|
231
|
+
* @return [String] a normalized copy of the string
|
232
|
+
*/
|
148
233
|
static VALUE StoNorm(int argc, VALUE* argv, VALUE string){
|
149
234
|
VALUE form;
|
150
235
|
rb_scan_args(argc, argv, "01", &form);
|
@@ -186,6 +271,7 @@ void Init_utf8_proc(void) {
|
|
186
271
|
|
187
272
|
const char *libVersion;
|
188
273
|
libVersion = utf8proc_version();
|
274
|
+
// The version of the underlying utf8proc library, as a frozen UTF-8 string
|
189
275
|
rb_define_const(rb_mBase, "LIBRARY_VERSION", rb_str_freeze(
|
190
276
|
rb_enc_str_new(libVersion, strlen(libVersion), enc_utf8)
|
191
277
|
));
|
data/lib/utf8_proc.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require "utf8_proc/version"
|
3
3
|
require "utf8_proc/benchmark"
|
4
4
|
|
5
|
+
# Unicode string normalization library using UTF8Proc
|
5
6
|
module UTF8Proc
|
6
7
|
if RUBY_ENGINE == "jruby"
|
7
8
|
require "utf8_proc/jruby"
|
@@ -9,4 +10,13 @@ module UTF8Proc
|
|
9
10
|
else
|
10
11
|
require "utf8_proc/utf8_proc"
|
11
12
|
end
|
13
|
+
|
14
|
+
# Add lowercase name aliases for normalization methods
|
15
|
+
class << self
|
16
|
+
alias nfc NFC
|
17
|
+
alias nfd NFD
|
18
|
+
alias nfkc NFKC
|
19
|
+
alias nfkd NFKD
|
20
|
+
alias nfkc_cf NFKC_CF
|
21
|
+
end
|
12
22
|
end
|
data/lib/utf8_proc/benchmark.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
-
|
2
|
+
|
3
3
|
module UTF8Proc
|
4
|
+
# Benchmark module for comparing the speed of *UTF8Proc* and *UNF*
|
4
5
|
module Benchmark
|
5
6
|
module_function
|
6
7
|
|
7
|
-
|
8
|
+
# Runs the benchmark and displays the results.
|
9
|
+
def run # rubocop:disable MethodLength
|
8
10
|
require "benchmark/ips"
|
9
11
|
require "unf"
|
10
12
|
# Various different normalizations of Unicode characters.
|
@@ -2,10 +2,23 @@
|
|
2
2
|
|
3
3
|
require "utf8_proc"
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
module UTF8Proc
|
6
|
+
# Module containing C core extension methods for the {::String} class.
|
7
|
+
#
|
8
|
+
# You can activate this by using:
|
9
|
+
# require "utf8_proc/core_ext/string"
|
10
|
+
#
|
11
|
+
# It will load either the C or the Java extension, depending on whether you are running JRuby.
|
12
|
+
module StringExtension
|
13
|
+
if RUBY_ENGINE == "jruby"
|
14
|
+
require "utf8_proc/core_ext/string_jruby"
|
15
|
+
else
|
16
|
+
alias nfc NFC
|
17
|
+
alias nfd NFD
|
18
|
+
alias nfkc NFKC
|
19
|
+
alias nfkd NFKD
|
20
|
+
alias nfkc_cf NFKC_CF
|
21
|
+
String.send(:include, ::UTF8Proc::StringExtension)
|
22
|
+
end
|
10
23
|
end
|
11
24
|
end
|
@@ -6,28 +6,51 @@
|
|
6
6
|
require "java"
|
7
7
|
require "utf8_proc"
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
::
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
9
|
+
module UTF8Proc
|
10
|
+
module JRuby
|
11
|
+
# Module containing JRuby core extension methods for the {::String} class.
|
12
|
+
#
|
13
|
+
# You can activate this by using:
|
14
|
+
# require "utf8_proc/core_ext/string"
|
15
|
+
#
|
16
|
+
# It will load either the C or the Java extension, depending on whether you are running JRuby.
|
17
|
+
module StringExtension
|
18
|
+
# @see UTF8Proc::StringExtension#NFC
|
19
|
+
def NFC
|
20
|
+
::UTF8Proc.NFC(self)
|
21
|
+
end
|
22
|
+
alias nfc NFC
|
23
|
+
|
24
|
+
# @see UTF8Proc::StringExtension#NFD
|
25
|
+
def NFD
|
26
|
+
::UTF8Proc.NFD(self)
|
27
|
+
end
|
28
|
+
alias nfd NFD
|
29
|
+
|
30
|
+
# @see UTF8Proc::StringExtension#NFKC
|
31
|
+
def NFKC
|
32
|
+
::UTF8Proc.NFKC(self)
|
33
|
+
end
|
34
|
+
alias nfkc NFKC
|
35
|
+
|
36
|
+
# @see UTF8Proc::StringExtension#NFKD
|
37
|
+
def NFKD
|
38
|
+
::UTF8Proc.NFKD(self)
|
39
|
+
end
|
40
|
+
alias nfkd NFKD
|
41
|
+
|
42
|
+
# @see UTF8Proc::StringExtension#NFKC_CF
|
43
|
+
def NFKC_CF
|
44
|
+
::UTF8Proc.NFKC_CF(self)
|
45
|
+
end
|
46
|
+
alias nfkc_cf NFKC_CF
|
47
|
+
|
48
|
+
# @see UTF8Proc::StringExtension#normalize
|
49
|
+
def normalize(form = :nfc)
|
50
|
+
::UTF8Proc.normalize(self, form)
|
51
|
+
end
|
52
|
+
end
|
32
53
|
end
|
33
54
|
end
|
55
|
+
|
56
|
+
String.send(:include, ::UTF8Proc::JRuby::StringExtension)
|
data/lib/utf8_proc/jruby.rb
CHANGED
@@ -6,37 +6,49 @@
|
|
6
6
|
require "java"
|
7
7
|
|
8
8
|
module UTF8Proc
|
9
|
+
# JRuby normalization module.
|
10
|
+
#
|
11
|
+
# This module is loaded automatically when running under JRuby.
|
9
12
|
module JRuby
|
13
|
+
# The version of the Java VM in use, prefixed with "Java "
|
10
14
|
LIBRARY_VERSION = "Java #{ENV_JAVA['java.version']}".freeze
|
11
15
|
|
12
16
|
JTNORM = java.text.Normalizer
|
13
17
|
private_constant :JTNORM
|
14
18
|
|
19
|
+
# @!visibility private
|
15
20
|
def self.included(receiver)
|
16
21
|
receiver.extend(ClassMethods)
|
17
22
|
end
|
18
23
|
|
24
|
+
# Methods added to the {::UTF8Proc} module in JRuby (instead of the C ones)
|
19
25
|
module ClassMethods
|
26
|
+
# @see UTF8Proc.NFC
|
20
27
|
def NFC(string)
|
21
28
|
JTNORM.normalize(string, JTNORM::Form::NFC)
|
22
29
|
end
|
23
30
|
|
31
|
+
# @see UTF8Proc.NFD
|
24
32
|
def NFD(string)
|
25
33
|
JTNORM.normalize(string, JTNORM::Form::NFD)
|
26
34
|
end
|
27
35
|
|
36
|
+
# @see UTF8Proc.NFKC
|
28
37
|
def NFKC(string)
|
29
38
|
JTNORM.normalize(string, JTNORM::Form::NFKC)
|
30
39
|
end
|
31
40
|
|
41
|
+
# @see UTF8Proc.NFKD
|
32
42
|
def NFKD(string)
|
33
43
|
JTNORM.normalize(string, JTNORM::Form::NFKD)
|
34
44
|
end
|
35
45
|
|
46
|
+
# @see UTF8Proc.NFKC_CF
|
36
47
|
def NFKC_CF(string)
|
37
48
|
NFKC(string).to_java(:string).toLowerCase
|
38
49
|
end
|
39
50
|
|
51
|
+
# @see UTF8Proc.normalize
|
40
52
|
def normalize(string, form = :nfc)
|
41
53
|
case form
|
42
54
|
when :nfc
|
data/lib/utf8_proc/version.rb
CHANGED
data/utf8_proc.gemspec
CHANGED
@@ -32,8 +32,9 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "pry", "~> 0.10"
|
33
33
|
spec.add_development_dependency "minitest", "~> 5.10"
|
34
34
|
spec.add_development_dependency "rubocop", "~> 0.47"
|
35
|
-
spec.add_development_dependency "
|
36
|
-
spec.add_development_dependency "
|
35
|
+
spec.add_development_dependency "yard", "~> 0.9"
|
36
|
+
spec.add_development_dependency "benchmark-ips", "~> 2.7"
|
37
|
+
spec.add_development_dependency "unf", "~> 0.1"
|
37
38
|
|
38
39
|
unless RUBY_ENGINE == "jruby"
|
39
40
|
spec.extensions = ["ext/utf8_proc/extconf.rb"]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8_proc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
@@ -80,34 +80,48 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0.47'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.9'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.9'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: benchmark-ips
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
103
|
+
version: '2.7'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
110
|
+
version: '2.7'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: unf
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
|
-
- - "
|
115
|
+
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
117
|
+
version: '0.1'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
|
-
- - "
|
122
|
+
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
124
|
+
version: '0.1'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: rake-compiler
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|