utf8_proc 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +24 -6
- data/ext/utf8_proc/utf8_proc.c +32 -22
- data/lib/utf8_proc/version.rb +1 -1
- data/utf8_proc.gemspec +1 -1
- metadata +2 -3
- data/ext/utf8_proc/.clang_complete +0 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a10b2518012c6465c1365bf5d1f5b74ffe31c1f8
|
4
|
+
data.tar.gz: 8d62ad653f961acbe83a0886b1aec36e0c545343
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6802fae4cf0b8a3d515541f540f2cc377b6d3c10ac10ea48fa0b7b045169083143f44d77b6bb7b5aebd2d507365e438ca07316a447f8f2563c328c21d6bc2269
|
7
|
+
data.tar.gz: 8b7b35123552afd904a2ec05fc92301b9f03876f9721b53e7e95e3bd644dd37241fd98b94808750280b142eb02b7e897b3a00c9357354847ee70657c617d3522
|
data/README.md
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
# UTF8Proc
|
2
2
|
|
3
|
-
|
3
|
+
A simple wrapper around [utf8proc](https://github.com/JuliaLang/utf8proc) for normalizing Unicode strings. Requires the `utf8proc` library and headers to be installed on your system. *(Packages are available. OSX: `brew install utf8proc`, Linux: `libutf8proc-dev` or `utf8proc-devel`)*
|
4
4
|
|
5
|
-
|
5
|
+
Currently supports UTF-8/ASCII string input and NFC, NFD, NFKC, NFKD, and NFKC-Casefold forms. Handles Unicode 9.0 and includes the current official full suite of 9.0 normalization tests.
|
6
|
+
|
7
|
+
Quick benchmarks against the [UNF](https://github.com/knu/ruby-unf) gem show it to be between the same speed (best-case) and ~2x slower (worst-case), averaging about 1.2x slower on complex Unicode strings. The speed gap narrows in NFC/NFD modes when the input strings are mostly or already normalized.
|
8
|
+
|
9
|
+
*(Note: UNF is generally a bit faster but currently officially supports Unicode 6.0 and does not pass all 9.0 normalization tests.)*
|
6
10
|
|
7
11
|
## Installation
|
8
12
|
|
@@ -22,13 +26,27 @@ Or install it yourself as:
|
|
22
26
|
|
23
27
|
## Usage
|
24
28
|
|
25
|
-
|
29
|
+
```ruby
|
30
|
+
require "utf8_proc"
|
31
|
+
|
32
|
+
# Canonical Decomposition, followed by Canonical Composition
|
33
|
+
UTF8Proc.NFC(utf8_string)
|
26
34
|
|
27
|
-
|
35
|
+
# Canonical Decomposition
|
36
|
+
UTF8Proc.NFD(utf8_string)
|
28
37
|
|
29
|
-
|
38
|
+
# Compatibility Decomposition, followed by Canonical Composition
|
39
|
+
UTF8Proc.NFKC(utf8_string)
|
30
40
|
|
31
|
-
|
41
|
+
# Compatibility Decomposition
|
42
|
+
UTF8Proc.NFKD(utf8_string)
|
43
|
+
|
44
|
+
# Compatibility Decomposition, followed by Canonical Composition with Case-folding
|
45
|
+
UTF8Proc.NFKC_CF(utf8_string)
|
46
|
+
|
47
|
+
# Second argument may be any of: [:nfc (default), :nfd, :nfkc, :nfkd, :nfkc_cf]
|
48
|
+
UTF8Proc.normalize(utf8_string, form = :nfc)
|
49
|
+
```
|
32
50
|
|
33
51
|
## Contributing
|
34
52
|
|
data/ext/utf8_proc/utf8_proc.c
CHANGED
@@ -7,6 +7,7 @@ ID NFC;
|
|
7
7
|
ID NFD;
|
8
8
|
ID NFKC;
|
9
9
|
ID NFKD;
|
10
|
+
ID NFKC_CF;
|
10
11
|
|
11
12
|
static inline void checkStrEncoding(VALUE *string) {
|
12
13
|
rb_encoding *enc = rb_enc_get(*string);
|
@@ -15,7 +16,7 @@ static inline void checkStrEncoding(VALUE *string) {
|
|
15
16
|
}
|
16
17
|
}
|
17
18
|
|
18
|
-
static inline VALUE
|
19
|
+
static inline VALUE normInternal(VALUE string, utf8proc_option_t options) {
|
19
20
|
checkStrEncoding(&string);
|
20
21
|
utf8proc_uint8_t *retval;
|
21
22
|
utf8proc_ssize_t retlen = utf8proc_map(
|
@@ -28,43 +29,50 @@ static inline VALUE CnormInternal(VALUE string, utf8proc_option_t options) {
|
|
28
29
|
}
|
29
30
|
|
30
31
|
|
31
|
-
VALUE
|
32
|
-
return
|
32
|
+
VALUE toNFC(VALUE self, VALUE string) {
|
33
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
33
34
|
}
|
34
35
|
|
35
|
-
VALUE
|
36
|
-
return
|
36
|
+
VALUE toNFD(VALUE self, VALUE string) {
|
37
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
|
37
38
|
}
|
38
39
|
|
39
|
-
VALUE
|
40
|
-
return
|
40
|
+
VALUE toNFKC(VALUE self, VALUE string) {
|
41
|
+
return normInternal(string,UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
41
42
|
}
|
42
43
|
|
43
|
-
VALUE
|
44
|
-
return
|
44
|
+
VALUE toNFKD(VALUE self, VALUE string) {
|
45
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
45
46
|
}
|
46
47
|
|
47
|
-
VALUE
|
48
|
+
VALUE toNFKC_CF(VALUE self, VALUE string) {
|
49
|
+
return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
|
50
|
+
}
|
51
|
+
|
52
|
+
|
53
|
+
VALUE norm(int argc, VALUE* argv, VALUE self){
|
48
54
|
VALUE string;
|
49
55
|
VALUE form;
|
50
56
|
rb_scan_args(argc, argv, "11", &string, &form);
|
51
57
|
|
52
58
|
if (NIL_P(form)) {
|
53
|
-
return
|
59
|
+
return toNFC(self, string);
|
54
60
|
}
|
55
61
|
|
56
62
|
ID s_form = SYM2ID(form);
|
57
63
|
if (s_form == NFC) {
|
58
|
-
return
|
64
|
+
return toNFC(self, string);
|
59
65
|
}else if(s_form == NFD) {
|
60
|
-
return
|
66
|
+
return toNFD(self, string);
|
61
67
|
}else if(s_form == NFKC) {
|
62
|
-
return
|
68
|
+
return toNFKC(self, string);
|
63
69
|
}else if(s_form == NFKD) {
|
64
|
-
return
|
70
|
+
return toNFKD(self, string);
|
71
|
+
}else if(s_form == NFKC_CF) {
|
72
|
+
return toNFKC_CF(self, string);
|
65
73
|
}else{
|
66
74
|
rb_raise(rb_eRuntimeError, "%s",
|
67
|
-
"Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd] (defaults to :nfc)");
|
75
|
+
"Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] (defaults to :nfc)");
|
68
76
|
}
|
69
77
|
}
|
70
78
|
|
@@ -78,10 +86,12 @@ void Init_utf8_proc(void) {
|
|
78
86
|
NFD = rb_intern("nfd");
|
79
87
|
NFKC = rb_intern("nfkc");
|
80
88
|
NFKD = rb_intern("nfkd");
|
81
|
-
|
82
|
-
|
83
|
-
rb_define_singleton_method(rb_mBase, "
|
84
|
-
rb_define_singleton_method(rb_mBase, "
|
85
|
-
rb_define_singleton_method(rb_mBase, "
|
86
|
-
rb_define_singleton_method(rb_mBase, "
|
89
|
+
NFKC_CF = rb_intern("nfkc_cf");
|
90
|
+
|
91
|
+
rb_define_singleton_method(rb_mBase, "NFC", toNFC, 1);
|
92
|
+
rb_define_singleton_method(rb_mBase, "NFD", toNFD, 1);
|
93
|
+
rb_define_singleton_method(rb_mBase, "NFKC", toNFKC, 1);
|
94
|
+
rb_define_singleton_method(rb_mBase, "NFKD", toNFKD, 1);
|
95
|
+
rb_define_singleton_method(rb_mBase, "NFKC_CF", toNFKC_CF, 1);
|
96
|
+
rb_define_singleton_method(rb_mBase, "normalize", norm, -1);
|
87
97
|
}
|
data/lib/utf8_proc/version.rb
CHANGED
data/utf8_proc.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.authors = ["Tim Bellefleur"]
|
11
11
|
spec.email = ["nomoon@phoebus.ca"]
|
12
12
|
|
13
|
-
spec.summary = "
|
13
|
+
spec.summary = "Unicode normalization library using utf8proc"
|
14
14
|
spec.homepage = "https://github.com/nomoon/utf8_proc"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8_proc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
@@ -113,7 +113,6 @@ files:
|
|
113
113
|
- Rakefile
|
114
114
|
- bin/console
|
115
115
|
- bin/setup
|
116
|
-
- ext/utf8_proc/.clang_complete
|
117
116
|
- ext/utf8_proc/extconf.rb
|
118
117
|
- ext/utf8_proc/utf8_proc.c
|
119
118
|
- ext/utf8_proc/utf8_proc.h
|
@@ -143,5 +142,5 @@ rubyforge_project:
|
|
143
142
|
rubygems_version: 2.6.10
|
144
143
|
signing_key:
|
145
144
|
specification_version: 4
|
146
|
-
summary:
|
145
|
+
summary: Unicode normalization library using utf8proc
|
147
146
|
test_files: []
|