utf8_proc 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8da1ac5c36921b8730f3a40bdc4311194f18ae54
4
- data.tar.gz: 49011944ab5fbfc868876d33a7da5e043678f286
3
+ metadata.gz: a10b2518012c6465c1365bf5d1f5b74ffe31c1f8
4
+ data.tar.gz: 8d62ad653f961acbe83a0886b1aec36e0c545343
5
5
  SHA512:
6
- metadata.gz: 7ddec0ae393c46c5b5e3464d619a6e2438b1fee300d4f7b15c43cacff7ebfce98ca5641236a5d9a67f2a4631af650a0bf7ab1abf9d8e02b0f9dd529cb1661353
7
- data.tar.gz: 2ad29f62132cdcfef14f527b105d30dd6de67f3816f5b34faff6e1cffa15b728f31fdfd490fc9bf3c66c3b1c9fb870b57cfefedbe19afd8c0ccc8187286487e7
6
+ metadata.gz: 6802fae4cf0b8a3d515541f540f2cc377b6d3c10ac10ea48fa0b7b045169083143f44d77b6bb7b5aebd2d507365e438ca07316a447f8f2563c328c21d6bc2269
7
+ data.tar.gz: 8b7b35123552afd904a2ec05fc92301b9f03876f9721b53e7e95e3bd644dd37241fd98b94808750280b142eb02b7e897b3a00c9357354847ee70657c617d3522
data/README.md CHANGED
@@ -1,8 +1,12 @@
1
1
  # UTF8Proc
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/utf8_proc`. To experiment with that code, run `bin/console` for an interactive prompt.
3
+ A simple wrapper around [utf8proc](https://github.com/JuliaLang/utf8proc) for normalizing Unicode strings. Requires the `utf8proc` library and headers to be installed on your system. *(Packages are available. OSX: `brew install utf8proc`, Linux: `libutf8proc-dev` or `utf8proc-devel`)*
4
4
 
5
- TODO: Delete this and the text above, and describe your gem
5
+ Currently supports UTF-8/ASCII string input and NFC, NFD, NFKC, NFKD, and NFKC-Casefold forms. Handles Unicode 9.0 and includes the current official full suite of 9.0 normalization tests.
6
+
7
+ Quick benchmarks against the [UNF](https://github.com/knu/ruby-unf) gem show it to be between the same speed (best-case) and ~2x slower (worst-case), averaging about 1.2x slower on complex Unicode strings. The gap narrows in NFC/NFD modes when the input strings are mostly or already normalized.
8
+
9
+ *(Note: UNF is generally a bit faster but currently officially supports Unicode 6.0 and does not pass all 9.0 normalization tests.)*
6
10
 
7
11
  ## Installation
8
12
 
@@ -22,13 +26,27 @@ Or install it yourself as:
22
26
 
23
27
  ## Usage
24
28
 
25
- TODO: Write usage instructions here
29
+ ```ruby
30
+ require "utf8_proc"
31
+
32
+ # Canonical Decomposition, followed by Canonical Composition
33
+ UTF8Proc.NFC(utf8_string)
26
34
 
27
- ## Development
35
+ # Canonical Decomposition
36
+ UTF8Proc.NFD(utf8_string)
28
37
 
29
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
38
+ # Compatibility Decomposition, followed by Canonical Composition
39
+ UTF8Proc.NFKC(utf8_string)
30
40
 
31
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
41
+ # Compatibility Decomposition
42
+ UTF8Proc.NFKD(utf8_string)
43
+
44
+ # Compatibility Decomposition, followed by Canonical Composition with Case-folding
45
+ UTF8Proc.NFKC_CF(utf8_string)
46
+
47
+ # Second argument may be any of: [:nfc (default), :nfd, :nfkc, :nfkd, :nfkc_cf]
48
+ UTF8Proc.normalize(utf8_string, form = :nfc)
49
+ ```
32
50
 
33
51
  ## Contributing
34
52
 
@@ -7,6 +7,7 @@ ID NFC;
7
7
  ID NFD;
8
8
  ID NFKC;
9
9
  ID NFKD;
10
+ ID NFKC_CF;
10
11
 
11
12
  static inline void checkStrEncoding(VALUE *string) {
12
13
  rb_encoding *enc = rb_enc_get(*string);
@@ -15,7 +16,7 @@ static inline void checkStrEncoding(VALUE *string) {
15
16
  }
16
17
  }
17
18
 
18
- static inline VALUE CnormInternal(VALUE string, utf8proc_option_t options) {
19
+ static inline VALUE normInternal(VALUE string, utf8proc_option_t options) {
19
20
  checkStrEncoding(&string);
20
21
  utf8proc_uint8_t *retval;
21
22
  utf8proc_ssize_t retlen = utf8proc_map(
@@ -28,43 +29,50 @@ static inline VALUE CnormInternal(VALUE string, utf8proc_option_t options) {
28
29
  }
29
30
 
30
31
 
31
- VALUE CtoNFC(VALUE self, VALUE string) {
32
- return CnormInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
32
+ VALUE toNFC(VALUE self, VALUE string) {
33
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
33
34
  }
34
35
 
35
- VALUE CtoNFD(VALUE self, VALUE string) {
36
- return CnormInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
36
+ VALUE toNFD(VALUE self, VALUE string) {
37
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
37
38
  }
38
39
 
39
- VALUE CtoNFKC(VALUE self, VALUE string) {
40
- return CnormInternal(string,UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
40
+ VALUE toNFKC(VALUE self, VALUE string) {
41
+ return normInternal(string,UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
41
42
  }
42
43
 
43
- VALUE CtoNFKD(VALUE self, VALUE string) {
44
- return CnormInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
44
+ VALUE toNFKD(VALUE self, VALUE string) {
45
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
45
46
  }
46
47
 
47
- VALUE Cnorm(int argc, VALUE* argv, VALUE self){
48
+ VALUE toNFKC_CF(VALUE self, VALUE string) {
49
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
50
+ }
51
+
52
+
53
+ VALUE norm(int argc, VALUE* argv, VALUE self){
48
54
  VALUE string;
49
55
  VALUE form;
50
56
  rb_scan_args(argc, argv, "11", &string, &form);
51
57
 
52
58
  if (NIL_P(form)) {
53
- return CtoNFC(self, string);
59
+ return toNFC(self, string);
54
60
  }
55
61
 
56
62
  ID s_form = SYM2ID(form);
57
63
  if (s_form == NFC) {
58
- return CtoNFC(self, string);
64
+ return toNFC(self, string);
59
65
  }else if(s_form == NFD) {
60
- return CtoNFD(self, string);
66
+ return toNFD(self, string);
61
67
  }else if(s_form == NFKC) {
62
- return CtoNFKC(self, string);
68
+ return toNFKC(self, string);
63
69
  }else if(s_form == NFKD) {
64
- return CtoNFKD(self, string);
70
+ return toNFKD(self, string);
71
+ }else if(s_form == NFKC_CF) {
72
+ return toNFKC_CF(self, string);
65
73
  }else{
66
74
  rb_raise(rb_eRuntimeError, "%s",
67
- "Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd] (defaults to :nfc)");
75
+ "Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] (defaults to :nfc)");
68
76
  }
69
77
  }
70
78
 
@@ -78,10 +86,12 @@ void Init_utf8_proc(void) {
78
86
  NFD = rb_intern("nfd");
79
87
  NFKC = rb_intern("nfkc");
80
88
  NFKD = rb_intern("nfkd");
81
-
82
- rb_define_singleton_method(rb_mBase, "to_NFC", CtoNFC, 1);
83
- rb_define_singleton_method(rb_mBase, "to_NFD", CtoNFD, 1);
84
- rb_define_singleton_method(rb_mBase, "to_NFKC", CtoNFKC, 1);
85
- rb_define_singleton_method(rb_mBase, "to_NFKD", CtoNFKD, 1);
86
- rb_define_singleton_method(rb_mBase, "normalize", Cnorm, -1);
89
+ NFKC_CF = rb_intern("nfkc_cf");
90
+
91
+ rb_define_singleton_method(rb_mBase, "NFC", toNFC, 1);
92
+ rb_define_singleton_method(rb_mBase, "NFD", toNFD, 1);
93
+ rb_define_singleton_method(rb_mBase, "NFKC", toNFKC, 1);
94
+ rb_define_singleton_method(rb_mBase, "NFKD", toNFKD, 1);
95
+ rb_define_singleton_method(rb_mBase, "NFKC_CF", toNFKC_CF, 1);
96
+ rb_define_singleton_method(rb_mBase, "normalize", norm, -1);
87
97
  }
@@ -1,4 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
  module UTF8Proc
3
- VERSION = "0.1.0"
3
+ VERSION = "0.2.0"
4
4
  end
data/utf8_proc.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ["Tim Bellefleur"]
11
11
  spec.email = ["nomoon@phoebus.ca"]
12
12
 
13
- spec.summary = "Ruby Unicode library using utf8proc"
13
+ spec.summary = "Unicode normalization library using utf8proc"
14
14
  spec.homepage = "https://github.com/nomoon/utf8_proc"
15
15
  spec.license = "MIT"
16
16
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8_proc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Bellefleur
@@ -113,7 +113,6 @@ files:
113
113
  - Rakefile
114
114
  - bin/console
115
115
  - bin/setup
116
- - ext/utf8_proc/.clang_complete
117
116
  - ext/utf8_proc/extconf.rb
118
117
  - ext/utf8_proc/utf8_proc.c
119
118
  - ext/utf8_proc/utf8_proc.h
@@ -143,5 +142,5 @@ rubyforge_project:
143
142
  rubygems_version: 2.6.10
144
143
  signing_key:
145
144
  specification_version: 4
146
- summary: Ruby Unicode library using utf8proc
145
+ summary: Unicode normalization library using utf8proc
147
146
  test_files: []
@@ -1,2 +0,0 @@
1
- -I/Users/nomoon/.rvm/rubies/ruby-2.4.0/include/ruby-2.4.0/x86_64-darwin16
2
- -I/Users/nomoon/.rvm/rubies/ruby-2.4.0/include/ruby-2.4.0