utf8_proc 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8da1ac5c36921b8730f3a40bdc4311194f18ae54
4
- data.tar.gz: 49011944ab5fbfc868876d33a7da5e043678f286
3
+ metadata.gz: a10b2518012c6465c1365bf5d1f5b74ffe31c1f8
4
+ data.tar.gz: 8d62ad653f961acbe83a0886b1aec36e0c545343
5
5
  SHA512:
6
- metadata.gz: 7ddec0ae393c46c5b5e3464d619a6e2438b1fee300d4f7b15c43cacff7ebfce98ca5641236a5d9a67f2a4631af650a0bf7ab1abf9d8e02b0f9dd529cb1661353
7
- data.tar.gz: 2ad29f62132cdcfef14f527b105d30dd6de67f3816f5b34faff6e1cffa15b728f31fdfd490fc9bf3c66c3b1c9fb870b57cfefedbe19afd8c0ccc8187286487e7
6
+ metadata.gz: 6802fae4cf0b8a3d515541f540f2cc377b6d3c10ac10ea48fa0b7b045169083143f44d77b6bb7b5aebd2d507365e438ca07316a447f8f2563c328c21d6bc2269
7
+ data.tar.gz: 8b7b35123552afd904a2ec05fc92301b9f03876f9721b53e7e95e3bd644dd37241fd98b94808750280b142eb02b7e897b3a00c9357354847ee70657c617d3522
data/README.md CHANGED
@@ -1,8 +1,12 @@
1
1
  # UTF8Proc
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/utf8_proc`. To experiment with that code, run `bin/console` for an interactive prompt.
3
+ A simple wrapper around [utf8proc](https://github.com/JuliaLang/utf8proc) for normalizing Unicode strings. Requires the `utf8proc` library and headers to be installed on your system. *(Packages are available. OSX: `brew install utf8proc`, Linux: `libutf8proc-dev` or `utf8proc-devel`)*
4
4
 
5
- TODO: Delete this and the text above, and describe your gem
5
+ Currently supports UTF-8/ASCII string input and NFC, NFD, NFKC, NFKD, and NFKC-Casefold forms. Handles Unicode 9.0 and includes the current official full suite of 9.0 normalization tests.
6
+
7
+ Quick benchmarks against the [UNF](https://github.com/knu/ruby-unf) gem show it to be between the same speed (best-case) and ~2x slower (worst-case), averaging ~1.2x slower on complex Unicode strings. The speed gap narrows in NFC/NFD modes when the input strings are mostly or already normalized.
8
+
9
+ *(Note: UNF is generally a bit faster but currently officially supports Unicode 6.0 and does not pass all 9.0 normalization tests.)*
6
10
 
7
11
  ## Installation
8
12
 
@@ -22,13 +26,27 @@ Or install it yourself as:
22
26
 
23
27
  ## Usage
24
28
 
25
- TODO: Write usage instructions here
29
+ ```ruby
30
+ require "utf8_proc"
31
+
32
+ # Canonical Decomposition, followed by Canonical Composition
33
+ UTF8Proc.NFC(utf8_string)
26
34
 
27
- ## Development
35
+ # Canonical Decomposition
36
+ UTF8Proc.NFD(utf8_string)
28
37
 
29
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
38
+ # Compatibility Decomposition, followed by Canonical Composition
39
+ UTF8Proc.NFKC(utf8_string)
30
40
 
31
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
41
+ # Compatibility Decomposition
42
+ UTF8Proc.NFKD(utf8_string)
43
+
44
+ # Compatibility Decomposition, followed by Canonical Composition with Case-folding
45
+ UTF8Proc.NFKC_CF(utf8_string)
46
+
47
+ # Second argument may be any of: [:nfc (default), :nfd, :nfkc, :nfkd, :nfkc_cf]
48
+ UTF8Proc.normalize(utf8_string, form = :nfc)
49
+ ```
32
50
 
33
51
  ## Contributing
34
52
 
@@ -7,6 +7,7 @@ ID NFC;
7
7
  ID NFD;
8
8
  ID NFKC;
9
9
  ID NFKD;
10
+ ID NFKC_CF;
10
11
 
11
12
  static inline void checkStrEncoding(VALUE *string) {
12
13
  rb_encoding *enc = rb_enc_get(*string);
@@ -15,7 +16,7 @@ static inline void checkStrEncoding(VALUE *string) {
15
16
  }
16
17
  }
17
18
 
18
- static inline VALUE CnormInternal(VALUE string, utf8proc_option_t options) {
19
+ static inline VALUE normInternal(VALUE string, utf8proc_option_t options) {
19
20
  checkStrEncoding(&string);
20
21
  utf8proc_uint8_t *retval;
21
22
  utf8proc_ssize_t retlen = utf8proc_map(
@@ -28,43 +29,50 @@ static inline VALUE CnormInternal(VALUE string, utf8proc_option_t options) {
28
29
  }
29
30
 
30
31
 
31
- VALUE CtoNFC(VALUE self, VALUE string) {
32
- return CnormInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
32
+ VALUE toNFC(VALUE self, VALUE string) {
33
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
33
34
  }
34
35
 
35
- VALUE CtoNFD(VALUE self, VALUE string) {
36
- return CnormInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
36
+ VALUE toNFD(VALUE self, VALUE string) {
37
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
37
38
  }
38
39
 
39
- VALUE CtoNFKC(VALUE self, VALUE string) {
40
- return CnormInternal(string,UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
40
+ VALUE toNFKC(VALUE self, VALUE string) {
41
+ return normInternal(string,UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
41
42
  }
42
43
 
43
- VALUE CtoNFKD(VALUE self, VALUE string) {
44
- return CnormInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
44
+ VALUE toNFKD(VALUE self, VALUE string) {
45
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
45
46
  }
46
47
 
47
- VALUE Cnorm(int argc, VALUE* argv, VALUE self){
48
+ VALUE toNFKC_CF(VALUE self, VALUE string) {
49
+ return normInternal(string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
50
+ }
51
+
52
+
53
+ VALUE norm(int argc, VALUE* argv, VALUE self){
48
54
  VALUE string;
49
55
  VALUE form;
50
56
  rb_scan_args(argc, argv, "11", &string, &form);
51
57
 
52
58
  if (NIL_P(form)) {
53
- return CtoNFC(self, string);
59
+ return toNFC(self, string);
54
60
  }
55
61
 
56
62
  ID s_form = SYM2ID(form);
57
63
  if (s_form == NFC) {
58
- return CtoNFC(self, string);
64
+ return toNFC(self, string);
59
65
  }else if(s_form == NFD) {
60
- return CtoNFD(self, string);
66
+ return toNFD(self, string);
61
67
  }else if(s_form == NFKC) {
62
- return CtoNFKC(self, string);
68
+ return toNFKC(self, string);
63
69
  }else if(s_form == NFKD) {
64
- return CtoNFKD(self, string);
70
+ return toNFKD(self, string);
71
+ }else if(s_form == NFKC_CF) {
72
+ return toNFKC_CF(self, string);
65
73
  }else{
66
74
  rb_raise(rb_eRuntimeError, "%s",
67
- "Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd] (defaults to :nfc)");
75
+ "Second optional argument must be one of [:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf] (defaults to :nfc)");
68
76
  }
69
77
  }
70
78
 
@@ -78,10 +86,12 @@ void Init_utf8_proc(void) {
78
86
  NFD = rb_intern("nfd");
79
87
  NFKC = rb_intern("nfkc");
80
88
  NFKD = rb_intern("nfkd");
81
-
82
- rb_define_singleton_method(rb_mBase, "to_NFC", CtoNFC, 1);
83
- rb_define_singleton_method(rb_mBase, "to_NFD", CtoNFD, 1);
84
- rb_define_singleton_method(rb_mBase, "to_NFKC", CtoNFKC, 1);
85
- rb_define_singleton_method(rb_mBase, "to_NFKD", CtoNFKD, 1);
86
- rb_define_singleton_method(rb_mBase, "normalize", Cnorm, -1);
89
+ NFKC_CF = rb_intern("nfkc_cf");
90
+
91
+ rb_define_singleton_method(rb_mBase, "NFC", toNFC, 1);
92
+ rb_define_singleton_method(rb_mBase, "NFD", toNFD, 1);
93
+ rb_define_singleton_method(rb_mBase, "NFKC", toNFKC, 1);
94
+ rb_define_singleton_method(rb_mBase, "NFKD", toNFKD, 1);
95
+ rb_define_singleton_method(rb_mBase, "NFKC_CF", toNFKC_CF, 1);
96
+ rb_define_singleton_method(rb_mBase, "normalize", norm, -1);
87
97
  }
@@ -1,4 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
  module UTF8Proc
3
- VERSION = "0.1.0"
3
+ VERSION = "0.2.0"
4
4
  end
data/utf8_proc.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ["Tim Bellefleur"]
11
11
  spec.email = ["nomoon@phoebus.ca"]
12
12
 
13
- spec.summary = "Ruby Unicode library using utf8proc"
13
+ spec.summary = "Unicode normalization library using utf8proc"
14
14
  spec.homepage = "https://github.com/nomoon/utf8_proc"
15
15
  spec.license = "MIT"
16
16
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8_proc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Bellefleur
@@ -113,7 +113,6 @@ files:
113
113
  - Rakefile
114
114
  - bin/console
115
115
  - bin/setup
116
- - ext/utf8_proc/.clang_complete
117
116
  - ext/utf8_proc/extconf.rb
118
117
  - ext/utf8_proc/utf8_proc.c
119
118
  - ext/utf8_proc/utf8_proc.h
@@ -143,5 +142,5 @@ rubyforge_project:
143
142
  rubygems_version: 2.6.10
144
143
  signing_key:
145
144
  specification_version: 4
146
- summary: Ruby Unicode library using utf8proc
145
+ summary: Unicode normalization library using utf8proc
147
146
  test_files: []
@@ -1,2 +0,0 @@
1
- -I/Users/nomoon/.rvm/rubies/ruby-2.4.0/include/ruby-2.4.0/x86_64-darwin16
2
- -I/Users/nomoon/.rvm/rubies/ruby-2.4.0/include/ruby-2.4.0