unf_ext 0.0.8.2.beta-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ #ifndef UNF_TRIE_CHAR_STREAM_HH
2
+ #define UNF_TRIE_CHAR_STREAM_HH
3
+
4
+ #include <vector>
5
+ #include <string>
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
// Byte cursor over a NUL-terminated string.
// The stream never owns the string; it only remembers the current position.
class CharStream {
public:
  CharStream(const char* str) : cur_(str) {}

  // Returns the byte under the cursor and steps past it.
  // At the terminator it returns 0 and the cursor stays where it is.
  unsigned char read() {
    if (eos())
      return '\0';
    const unsigned char byte = *cur_;
    ++cur_;
    return byte;
  }

  // Byte immediately before the cursor (the caller must not be at the very start).
  unsigned char prev() const { return *(cur_ - 1); }

  // Byte under the cursor, without consuming it.
  unsigned char peek() const { return cur_[0]; }

  // Raw pointer to the current position.
  const char* cur() const { return cur_; }

  // True when the cursor sits on the NUL terminator.
  bool eos() const { return peek() == '\0'; }

  // Repositions the cursor; no validation is performed.
  void setCur(const char* new_cur) { cur_ = new_cur; }

private:
  const char* cur_;
};
23
+
24
// Byte cursor over the half-open pointer range [beg, end).
// End-of-stream is decided by pointer comparison, so the range does not need
// to be NUL-terminated.
class RangeCharStream {
public:
  RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {}

  // Consumes and returns the current byte; once the range is exhausted it
  // returns 0 and does not advance.
  unsigned char read() { return eos() ? '\0' : *cur_++; }

  // Byte immediately before the cursor. The caller must have consumed at
  // least one byte (or otherwise know that cur()[-1] is readable).
  unsigned char prev() const { return cur_[-1]; }

  // Current byte without consuming it. Returns 0 at end of range so that
  // *end_ (which may lie outside the range) is never dereferenced; this
  // mirrors what read() reports at end of range.
  unsigned char peek() const { return eos() ? '\0' : *cur_; }

  // Current position and the one-past-the-end position of the range.
  const char* cur() const { return cur_; }
  const char* end() const { return end_; }

  // True once the cursor has reached the end pointer.
  bool eos() const { return cur_ == end_; }

private:
  const char* cur_;
  const char* end_;
};
38
+
39
// Presents two NUL-terminated strings as one logical stream: bytes come from
// `first` until its terminator is reached, then from `second`.
class CompoundCharStream {
public:
  CompoundCharStream(const char* first, const char* second)
    : beg1(first), beg2(second), cur1(beg1), cur2(beg2) {}

  // Consumes one byte from whichever string is currently active.
  // Returns 0 (without advancing) once both strings are exhausted.
  unsigned char read() { return !eos1() ? read1() : read2(); }

  // Byte under the logical cursor, without consuming it.
  unsigned char peek() const { return !eos1() ? *cur1 : *cur2; }

  // Byte immediately before the logical cursor. While still inside the first
  // string, or when nothing has been consumed from the second string yet,
  // that byte lives in the first string.
  unsigned char prev() const { return !eos1() || beg2==cur2 ? cur1[-1] : cur2[-1]; }

  // Pointer to the logical cursor (into the first or the second string).
  const char* cur() const { return !eos1() ? cur1 : cur2; }

  // True when both strings have been fully consumed.
  bool eos() const { return eos1() && eos2(); }

  // True while the first string still has unread bytes.
  bool within_first() const { return !eos1(); }

  // Number of bytes consumed so far across both strings.
  // Each difference is taken within its own string; the unparenthesized form
  // `cur1-beg1 + cur2-beg2` would first add an offset to cur2 and could form
  // a pointer outside the second string.
  unsigned offset() const {
    return static_cast<unsigned>((cur1 - beg1) + (cur2 - beg2));
  }

  // Repositions the logical cursor.
  // A pointer in [beg1, cur1] rewinds into the first string and restarts the
  // second one; any other pointer is taken as a position in the second string.
  // NOTE(review): a pointer inside the first string but beyond cur1 also takes
  // the second branch - confirm callers never rely on moving forward in `first`.
  void setCur(const char* p) {
    if(beg1 <= p && p <= cur1) {
      cur1=p;
      cur2=beg2;
    } else {
      cur2=p;
    }
  }

protected:
  // Per-string primitives: each read returns 0 without advancing at its own terminator.
  unsigned char read1() { return eos1() ? '\0' : *cur1++; }
  unsigned char read2() { return eos2() ? '\0' : *cur2++; }
  bool eos1() const { return *cur1=='\0'; }
  bool eos2() const { return *cur2=='\0'; }

protected:
  const char* beg1;  // start of the first string
  const char* beg2;  // start of the second string
  const char* cur1;  // cursor within the first string
  const char* cur2;  // cursor within the second string
};
74
+
75
+ class CharStreamForComposition : public CompoundCharStream {
76
+ public:
77
+ CharStreamForComposition (const char* first, const char* second,
78
+ const std::vector<unsigned char>& canonical_classes,
79
+ std::string& buf)
80
+ : CompoundCharStream(first, second), classes(canonical_classes), skipped(buf)
81
+ {}
82
+
83
+ void init_skipinfo() {
84
+ skipped.clear();
85
+ skipped_tail = 0;
86
+ }
87
+
88
+ void mark_as_last_valid_point() {
89
+ skipped_tail = skipped.size();
90
+ marked_point = cur();
91
+ }
92
+
93
+ void reset_at_marked_point() {
94
+ setCur(marked_point);
95
+ }
96
+
97
+ void append_read_char_to_str(std::string& s, const char* beg) const {
98
+ if(eos1()==false) {
99
+ s.append(beg, cur());
100
+ } else {
101
+ s.append(beg, cur1);
102
+ s.append(beg2, cur());
103
+ }
104
+ }
105
+
106
+ void append_skipped_chars_to_str(std::string& s) const {
107
+ s.append(skipped.begin(), skipped.begin()+skipped_tail);
108
+ }
109
+
110
+ unsigned char get_canonical_class() const {
111
+ return offset() < classes.size() ? classes[offset()] : 0;
112
+ }
113
+
114
+ bool next_combining_char(unsigned char prev_class, const char* ppp) {
115
+ while(Util::is_utf8_char_start_byte(peek()) == false)
116
+ read();
117
+
118
+ unsigned char mid_class = get_prev_canonical_class();
119
+ unsigned char cur_class = get_canonical_class();
120
+
121
+ if(prev_class==0 && mid_class==0 && cur_class!=0)
122
+ return false;
123
+
124
+ if(prev_class < cur_class && mid_class < cur_class) {
125
+ skipped.append(ppp, cur());
126
+ return true;
127
+ } else {
128
+ if(cur_class != 0) {
129
+ read();
130
+ return next_combining_char(prev_class,ppp);
131
+ }
132
+ return false;
133
+ }
134
+ }
135
+
136
+ private:
137
+ unsigned char get_prev_canonical_class() const {
138
+ return offset()-1 < classes.size() ? classes[offset()-1] : 0;
139
+ }
140
+
141
+ private:
142
+ const std::vector<unsigned char>& classes;
143
+ std::string& skipped;
144
+ unsigned skipped_tail;
145
+ const char* marked_point;
146
+ };
147
+ }
148
+ }
149
+
150
+ #endif
@@ -0,0 +1,25 @@
1
+ #ifndef UNF_TRIE_NODE_HH
2
+ #define UNF_TRIE_NODE_HH
3
+
4
+ namespace UNF {
5
+ namespace Trie {
6
// One trie node packed into a single unsigned value:
// the low 24 bits hold the base/value, the bits above them hold the check byte.
class Node {
public:
  // Index reached from this node when following byte `ch`.
  unsigned jump(unsigned char ch) const { return (data & kBaseMask) + ch; }

  // The low 24 bits, read as a stored value.
  unsigned value() const { return data & kBaseMask; }

  // The check byte stored above the low 24 bits.
  unsigned check_char() const { return data >> 24; }

  // The raw packed representation.
  unsigned to_uint() const { return data; }

  // Reinterprets an array of packed unsigned values as an array of nodes.
  static const Node* from_uint_array(const unsigned* node_uints) {
    return reinterpret_cast<const Node*>(node_uints);
  }

private:
  static const unsigned kBaseMask = 0xFFFFFF;

private:
  unsigned data;
};
22
+ }
23
+ }
24
+
25
+ #endif
@@ -0,0 +1,194 @@
1
+ #ifndef UNF_TRIE_SEARCHER_HH
2
+ #define UNF_TRIE_SEARCHER_HH
3
+
4
+ #include "char_stream.hh"
5
+ #include "node.hh"
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
+ class Searcher {
11
+ public:
12
+ Searcher(const Node* nodes, unsigned root, const char* value=NULL)
13
+ : nodes(nodes), root(root), value(value) {}
14
+
15
+ unsigned find_value(const char* key, int default_value) const {
16
+ unsigned node_index=root;
17
+ for(CharStream in(key);; in.read()) {
18
+ node_index = nodes[node_index].jump(in.peek());
19
+ if(nodes[node_index].check_char()==in.peek()) {
20
+ unsigned terminal_index = nodes[node_index].jump('\0');
21
+ if(nodes[terminal_index].check_char()=='\0') {
22
+ return nodes[terminal_index].value();
23
+ }
24
+ } else
25
+ return default_value;
26
+ }
27
+ }
28
+
29
+ protected:
30
+ const Node* nodes;
31
+ const unsigned root;
32
+ const char* value;
33
+ };
34
+
35
+ class CanonicalCombiningClass : private Searcher {
36
+ public:
37
+ CanonicalCombiningClass(const unsigned* node_uints, unsigned root)
38
+ : Searcher(Node::from_uint_array(node_uints), root) {}
39
+
40
+ unsigned get_class(const char* str) const { return find_value(str,0); }
41
+
42
+ void sort(char* str, std::vector<unsigned char>& classes) const {
43
+ CharStream in(str);
44
+ unsigned sort_beg=0;
45
+ unsigned sort_end=0;
46
+ unsigned unicode_char_count=0;
47
+
48
+ loop_head:
49
+ unsigned beg = in.cur()-str;
50
+
51
+ for(unsigned node_index=root;;){
52
+ node_index = nodes[node_index].jump(in.read());
53
+
54
+ if(nodes[node_index].check_char()==in.prev()) {
55
+ unsigned terminal_index = nodes[node_index].jump('\0');
56
+ if(nodes[terminal_index].check_char()=='\0') {
57
+ if((unicode_char_count++)==0)
58
+ sort_beg = beg;
59
+ sort_end = in.cur()-str;
60
+
61
+ unsigned char klass = nodes[terminal_index].value();
62
+ for(unsigned i=beg; i < sort_end; i++)
63
+ classes[i] = klass;
64
+ break;
65
+ }
66
+ } else {
67
+ if(unicode_char_count > 1)
68
+ bubble_sort(str, classes, sort_beg, sort_end);
69
+ unicode_char_count = 0;
70
+ break;
71
+ }
72
+ }
73
+ Util::eat_until_utf8_char_start_point(in);
74
+
75
+ if(in.eos()==false)
76
+ goto loop_head;
77
+
78
+ if(unicode_char_count > 1)
79
+ bubble_sort(str, classes, sort_beg, sort_end);
80
+ }
81
+
82
+ private:
83
+ void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const {
84
+ for(unsigned limit=beg, next=end; limit != next;) {
85
+ limit = next;
86
+ for(unsigned i=beg+1; i < limit; i++)
87
+ if(canonical_classes[i-1] > canonical_classes[i]) {
88
+ std::swap(canonical_classes[i-1], canonical_classes[i]);
89
+ std::swap(str[i-1], str[i]);
90
+ next = i;
91
+ }
92
+ }
93
+ }
94
+ };
95
+
96
+ class NormalizationForm : private Searcher {
97
+ public:
98
+ NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL)
99
+ : Searcher(Node::from_uint_array(node_uints), root, value) {}
100
+
101
+ bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
102
+
103
+ void decompose(RangeCharStream in, std::string& buffer) const {
104
+ loop_head:
105
+ const char* beg = in.cur();
106
+
107
+ for(unsigned node_index=root;;) {
108
+ node_index = nodes[node_index].jump(in.read());
109
+ if(nodes[node_index].check_char()==in.prev()) {
110
+ unsigned terminal_index = nodes[node_index].jump('\0');
111
+ if(nodes[terminal_index].check_char()=='\0') {
112
+ word_append(buffer, value, nodes[terminal_index].value());
113
+ beg = in.cur();
114
+ break;
115
+ }
116
+ } else {
117
+ Util::eat_until_utf8_char_start_point(in);
118
+ buffer.append(beg, in.cur());
119
+ break;
120
+ }
121
+ }
122
+
123
+ if(in.eos()==false)
124
+ goto loop_head;
125
+ }
126
+
127
+ void compose(CharStreamForComposition& in, std::string& buf) const {
128
+ in.init_skipinfo();
129
+
130
+ const char* const beg = in.cur();
131
+ const char* current_char_head = in.cur();
132
+ unsigned composed_char_info = 0;
133
+
134
+ unsigned node_index = root;
135
+ unsigned retry_root_node = root;
136
+ unsigned char retry_root_class = 0;
137
+
138
+ for(bool first=true;;) {
139
+ if(Util::is_utf8_char_start_byte(in.peek())) {
140
+ if(node_index != root)
141
+ first=false;
142
+ current_char_head = in.cur();
143
+
144
+ retry_root_node = node_index;
145
+ retry_root_class = in.get_canonical_class();
146
+ }
147
+
148
+ retry:
149
+ unsigned next_index = nodes[node_index].jump(in.peek());
150
+ if(nodes[next_index].check_char()==in.read()) {
151
+ // succeeded
152
+ node_index = next_index;
153
+ unsigned terminal_index = nodes[node_index].jump('\0');
154
+ if(nodes[terminal_index].check_char()=='\0') {
155
+ composed_char_info = nodes[terminal_index].value();
156
+
157
+ in.mark_as_last_valid_point();
158
+ if(in.eos() || retry_root_class > in.get_canonical_class())
159
+ break;
160
+ }
161
+ } else if (first==true) {
162
+ // no retry if current point is a part of first starter
163
+ break;
164
+ } else if (in.next_combining_char(retry_root_class, current_char_head)==true) {
165
+ // back previous code-point and retry
166
+ node_index = retry_root_node;
167
+ current_char_head = in.cur();
168
+ goto retry;
169
+ } else {
170
+ break;
171
+ }
172
+ }
173
+
174
+ if(composed_char_info != 0) {
175
+ // append composed unicode-character and skipped combining-characters
176
+ word_append(buf, value, composed_char_info);
177
+ in.append_skipped_chars_to_str(buf);
178
+ in.reset_at_marked_point();
179
+ } else {
180
+ // append one unicode-character
181
+ in.setCur(Util::nearest_utf8_char_start_point(beg+1));
182
+ in.append_read_char_to_str(buf, beg);
183
+ }
184
+ }
185
+
186
+ private:
187
+ static void word_append(std::string& buffer, const char* base, unsigned pos_info) {
188
+ buffer.append(base+(pos_info&0x3FFFF), pos_info>>18);
189
+ }
190
+ };
191
+ }
192
+ }
193
+
194
+ #endif
@@ -0,0 +1,24 @@
1
+ #ifndef UNF_UTIL_HH
2
+ #define UNF_UTIL_HH
3
+
4
+ namespace UNF {
5
namespace Util {
  // True for every byte that is not of the form 10xxxxxx, i.e. for bytes with
  // the high bit clear and for bytes with both of the top two bits set.
  inline bool is_utf8_char_start_byte(char byte) {
    return (byte & 0xC0) != 0x80;
  }

  // Advances `s` to the first start byte at or after it.
  // A NUL terminator counts as a start byte, so the scan stops there at the latest.
  inline const char* nearest_utf8_char_start_point(const char* s) {
    while(!is_utf8_char_start_byte(*s))
      ++s;
    return s;
  }

  // Consumes bytes from `in` until its peek() reports a start byte.
  // Works with any type providing peek() and read().
  template <class CharStream>
  inline void eat_until_utf8_char_start_point(CharStream& in) {
    while(!is_utf8_char_start_byte(in.peek()))
      in.read();
  }
}
22
+ }
23
+
24
+ #endif
@@ -0,0 +1,75 @@
1
+ #include "unf/normalizer.hh"
2
+
3
+ #include <ruby.h>
4
+ #if defined(HAVE_RUBY_ENCODING_H)
5
+ #include <ruby/encoding.h>
6
+ #endif
7
+
8
+ extern "C" {
9
+ VALUE unf_allocate(VALUE klass);
10
+ VALUE unf_initialize(VALUE self);
11
+ void unf_delete(UNF::Normalizer* ptr);
12
+ VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form);
13
+
14
+ ID FORM_NFD;
15
+ ID FORM_NFC;
16
+ ID FORM_NFKD;
17
+ ID FORM_NFKC;
18
+
19
+ void Init_unf_ext() {
20
+ VALUE mdl = rb_define_module("UNF");
21
+
22
+ VALUE cls = rb_define_class_under(mdl, "Normalizer", rb_cObject);
23
+ rb_define_alloc_func(cls, unf_allocate);
24
+ rb_define_method(cls, "initialize", (VALUE (*)(...))unf_initialize, 0);
25
+ rb_define_method(cls, "normalize", (VALUE (*)(...))unf_normalize, 2);
26
+
27
+ FORM_NFD = rb_intern("nfd");
28
+ FORM_NFC = rb_intern("nfc");
29
+ FORM_NFKD= rb_intern("nfkd");
30
+ FORM_NFKC= rb_intern("nfkc");
31
+ }
32
+
33
+
34
+ VALUE unf_allocate(VALUE klass) {
35
+ UNF::Normalizer* ptr;
36
+ VALUE obj = Data_Make_Struct(klass, UNF::Normalizer, NULL, unf_delete, ptr);
37
+ new ((void*)ptr) UNF::Normalizer;
38
+ return obj;
39
+ }
40
+
41
+ VALUE unf_initialize(VALUE self) {
42
+ return self;
43
+ }
44
+
45
+ void unf_delete(UNF::Normalizer* ptr) {
46
+ ptr->~Normalizer();
47
+ ruby_xfree(ptr);
48
+ }
49
+
50
+ VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form) {
51
+ UNF::Normalizer* ptr;
52
+ Data_Get_Struct(self, UNF::Normalizer, ptr);
53
+
54
+ const char* src = StringValueCStr(source);
55
+ const char* rlt;
56
+ ID form_id = SYM2ID(normalization_form);
57
+
58
+ if(form_id == FORM_NFD)
59
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFD);
60
+ else if(form_id == FORM_NFC)
61
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFC);
62
+ else if(form_id == FORM_NFKD)
63
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKD);
64
+ else if(form_id == FORM_NFKC)
65
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKC);
66
+ else
67
+ rb_raise(rb_eArgError, "Specified Normalization-Form is unknown. Please select one from among :nfc, :nfd, :nfkc, :nfkd.");
68
+
69
+ #if defined(HAVE_RUBY_ENCODING_H)
70
+ return rb_enc_str_new(rlt, strlen(rlt), rb_utf8_encoding());
71
+ #else
72
+ return rb_str_new2(rlt);
73
+ #endif
74
+ }
75
+ }
Binary file
@@ -0,0 +1,5 @@
1
module UNF
  class Normalizer
    # Version string of this gem release.
    VERSION = '0.0.8.2.beta'
  end
end
data/lib/unf_ext.rb ADDED
@@ -0,0 +1,5 @@
1
# Load the native extension. First try the copy stored under a directory
# named after the running Ruby's "major.minor" version; if that cannot be
# loaded, fall back to a plain "unf_ext.so" on the load path.
begin
  ruby_series = RUBY_VERSION[/\A[0-9]+\.[0-9]+/]
  require "#{ruby_series}/unf_ext.so"
rescue LoadError
  require "unf_ext.so"
end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; if Bundler cannot resolve
# them, report the problem on stderr and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

require 'test/unit'

# Put the test directory, the project root and lib/ at the front of the load
# path (in that order) so the tests load this checkout's code.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(
  test_dir,
  File.join(test_dir, '..'),
  File.join(test_dir, '..', 'lib')
)
require 'unf_ext'

# Reopened so project-wide test helpers have a place to live; currently empty.
class Test::Unit::TestCase
end