utf8_proc 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +24 -6
- data/ext/utf8_proc/utf8_proc.c +32 -22
- data/lib/utf8_proc/version.rb +1 -1
- data/utf8_proc.gemspec +1 -1
- metadata +2 -3
- data/ext/utf8_proc/.clang_complete +0 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a10b2518012c6465c1365bf5d1f5b74ffe31c1f8
|
4
|
+
data.tar.gz: 8d62ad653f961acbe83a0886b1aec36e0c545343
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6802fae4cf0b8a3d515541f540f2cc377b6d3c10ac10ea48fa0b7b045169083143f44d77b6bb7b5aebd2d507365e438ca07316a447f8f2563c328c21d6bc2269
|
7
|
+
data.tar.gz: 8b7b35123552afd904a2ec05fc92301b9f03876f9721b53e7e95e3bd644dd37241fd98b94808750280b142eb02b7e897b3a00c9357354847ee70657c617d3522
|
data/README.md
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
# UTF8Proc
|
2
2
|
|
3
|
-
|
3
|
+
A simple wrapper around [utf8proc](https://github.com/JuliaLang/utf8proc) for normalizing Unicode strings. Requires the `utf8proc` library and headers to be installed on your system. *(Packages are available. OSX: `brew install utf8proc`, Linux: `libutf8proc-dev` or `utf8proc-devel`)*
|
4
4
|
|
5
|
-
|
5
|
+
Currently supports UTF-8/ASCII string input and NFC, NFD, NFKC, NFKD, and NFKC-Casefold forms. Handles Unicode 9.0 and includes the current official full suite of 9.0 normalization tests.
|
6
|
+
|
7
|
+
Quick benchmarks against the [UNF](https://github.com/knu/ruby-unf) gem show it to be between the same speed (best-case) and ~2x slower (worst-case), averaging ~1.2x slower on complex Unicode strings. The speed gap narrows in NFC/NFD modes when the input strings are mostly or already normalized.
|
8
|
+
|
9
|
+
*(Note: UNF is generally a bit faster but currently officially supports Unicode 6.0 and does not pass all 9.0 normalization tests.)*
|
6
10
|
|
7
11
|
## Installation
|
8
12
|
|
@@ -22,13 +26,27 @@ Or install it yourself as:
|
|
22
26
|
|
23
27
|
## Usage
|
24
28
|
|
25
|
-
|
29
|
+
```ruby
|
30
|
+
require "utf8_proc"
|
31
|
+
|
32
|
+
# Canonical Decomposition, followed by Canonical Composition
|
33
|
+
UTF8Proc.NFC(utf8_string)
|
26
34
|
|
27
|
-
|
35
|
+
# Canonical Decomposition
|
36
|
+
UTF8Proc.NFD(utf8_string)
|
28
37
|
|
29
|
-
|
38
|
+
# Compatibility Decomposition, followed by Canonical Composition
|
39
|
+
UTF8Proc.NFKC(utf8_string)
|
30
40
|
|
31
|
-
|
41
|
+
# Compatibility Decomposition
|
42
|
+
UTF8Proc.NFKD(utf8_string)
|
43
|
+
|
44
|
+
# Compatibility Decomposition, followed by Canonical Composition with Case-folding
|
45
|
+
UTF8Proc.NFKC_CF(utf8_string)
|
46
|
+
|
47
|
+
# Second argument may be any of: [:nfc (default), :nfd, :nfkc, :nfkd, :nfkc_cf]
|
48
|
+
UTF8Proc.normalize(utf8_string, form = :nfc)
|
49
|
+
```
|
32
50
|
|
33
51
|
## Contributing
|
34
52
|
|
data/ext/utf8_proc/utf8_proc.c
CHANGED
@@ -7,6 +7,7 @@ ID NFC;
|
|
7
7
|
ID NFD;
|
8
8
|
ID NFKC;
|
9
9
|
ID NFKD;
|
10
|
+
ID NFKC_CF;
|
10
11
|
|
11
12
|
static inline void checkStrEncoding(VALUE *string) {
|
12
13
|
rb_encoding *enc = rb_enc_get(*string);
|
@@ -15,7 +16,7 @@ static inline void checkStrEncoding(VALUE *string) {
|
|
15
16
|
}
|
16
17
|
}
|
17
18
|
|
18
|
-
static inline VALUE
|
19
|
+
static inline VALUE normInternal(VALUE string, utf8proc_option_t options) {
|
19
20
|
checkStrEncoding(&string);
|
20
21
|
utf8proc_uint8_t *retval;
|
21
22
|
utf8proc_ssize_t retlen = utf8proc_map(
|
@@ -28,43 +29,50 @@ static inline VALUE CnormInternal(VALUE string, utf8proc_option_t options) {
|
|
28
29
|
}
|
29
30
|
|
30
31
|
|
31
|
-
VALUE
|
32
|
-
return
|
32
|
+
VALUE toNFC(VALUE self, VALUE string) {
|
33
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
33
34
|
}
|
34
35
|
|
35
|
-
VALUE
|
36
|
-
return
|
36
|
+
VALUE toNFD(VALUE self, VALUE string) {
|
37
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
|
37
38
|
}
|
38
39
|
|
39
|
-
VALUE
|
40
|
-
return
|
40
|
+
VALUE toNFKC(VALUE self, VALUE string) {
|
41
|
+
return normInternal(string,UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
41
42
|
}
|
42
43
|
|
43
|
-
VALUE
|
44
|
-
return
|
44
|
+
VALUE toNFKD(VALUE self, VALUE string) {
|
45
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
45
46
|
}
|
46
47
|
|
47
|
-
VALUE
|
48
|
+
VALUE toNFKC_CF(VALUE self, VALUE string) {
|
49
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
|
50
|
+
}
|
51
|
+
|
52
|
+
|
53
|
+
VALUE norm(int argc, VALUE* argv, VALUE self){
|
48
54
|
VALUE string;
|
49
55
|
VALUE form;
|
50
56
|
rb_scan_args(argc, argv, "11", &string, &form);
|
51
57
|
|
52
58
|
if (NIL_P(form)) {
|
53
|
-
return
|
59
|
+
return toNFC(self, string);
|
54
60
|
}
|
55
61
|
|
56
62
|
ID s_form = SYM2ID(form);
|
57
63
|
if (s_form == NFC) {
|
58
|
-
return
|
64
|
+
return toNFC(self, string);
|
59
65
|
}else if(s_form == NFD) {
|
60
|
-
return
|
66
|
+
return toNFD(self, string);
|
61
67
|
}else if(s_form == NFKC) {
|
62
|
-
return
|
68
|
+
return toNFKC(self, string);
|
63
69
|
}else if(s_form == NFKD) {
|
64
|
-
return
|
70
|
+
return toNFKD(self, string);
|
71
|
+
}else if(s_form == NFKC_CF) {
|
72
|
+
return toNFKC_CF(self, string);
|
65
73
|
}else{
|
66
74
|
rb_raise(rb_eRuntimeError, "%s",
|
67
|
-
"Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd] (defaults to :nfc)");
|
75
|
+
"Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] (defaults to :nfc)");
|
68
76
|
}
|
69
77
|
}
|
70
78
|
|
@@ -78,10 +86,12 @@ void Init_utf8_proc(void) {
|
|
78
86
|
NFD = rb_intern("nfd");
|
79
87
|
NFKC = rb_intern("nfkc");
|
80
88
|
NFKD = rb_intern("nfkd");
|
81
|
-
|
82
|
-
|
83
|
-
rb_define_singleton_method(rb_mBase, "
|
84
|
-
rb_define_singleton_method(rb_mBase, "
|
85
|
-
rb_define_singleton_method(rb_mBase, "
|
86
|
-
rb_define_singleton_method(rb_mBase, "
|
89
|
+
NFKC_CF = rb_intern("nfkc_cf");
|
90
|
+
|
91
|
+
rb_define_singleton_method(rb_mBase, "NFC", toNFC, 1);
|
92
|
+
rb_define_singleton_method(rb_mBase, "NFD", toNFD, 1);
|
93
|
+
rb_define_singleton_method(rb_mBase, "NFKC", toNFKC, 1);
|
94
|
+
rb_define_singleton_method(rb_mBase, "NFKD", toNFKD, 1);
|
95
|
+
rb_define_singleton_method(rb_mBase, "NFKC_CF", toNFKC_CF, 1);
|
96
|
+
rb_define_singleton_method(rb_mBase, "normalize", norm, -1);
|
87
97
|
}
|
data/lib/utf8_proc/version.rb
CHANGED
data/utf8_proc.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.authors = ["Tim Bellefleur"]
|
11
11
|
spec.email = ["nomoon@phoebus.ca"]
|
12
12
|
|
13
|
-
spec.summary = "
|
13
|
+
spec.summary = "Unicode normalization library using utf8proc"
|
14
14
|
spec.homepage = "https://github.com/nomoon/utf8_proc"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8_proc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
@@ -113,7 +113,6 @@ files:
|
|
113
113
|
- Rakefile
|
114
114
|
- bin/console
|
115
115
|
- bin/setup
|
116
|
-
- ext/utf8_proc/.clang_complete
|
117
116
|
- ext/utf8_proc/extconf.rb
|
118
117
|
- ext/utf8_proc/utf8_proc.c
|
119
118
|
- ext/utf8_proc/utf8_proc.h
|
@@ -143,5 +142,5 @@ rubyforge_project:
|
|
143
142
|
rubygems_version: 2.6.10
|
144
143
|
signing_key:
|
145
144
|
specification_version: 4
|
146
|
-
summary:
|
145
|
+
summary: Unicode normalization library using utf8proc
|
147
146
|
test_files: []
|