unf_ext 0.0.8.2.beta-x64-mingw-ucrt

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,150 @@
1
+ #ifndef UNF_TRIE_CHAR_STREAM_HH
2
+ #define UNF_TRIE_CHAR_STREAM_HH
3
+
4
+ #include <vector>
5
+ #include <string>
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
// Forward cursor over a NUL-terminated byte string. The stream is considered
// exhausted as soon as the cursor rests on a '\0' byte.
class CharStream {
public:
  // Begins reading at `str`; the end of the stream is detected by a '\0' byte.
  CharStream(const char* str) : pos_(str) {}

  // Returns the byte under the cursor and steps past it. Once the stream is
  // exhausted it keeps returning '\0' without moving.
  unsigned char read() {
    if(eos())
      return '\0';
    const unsigned char byte = *pos_;
    ++pos_;
    return byte;
  }

  // Byte immediately before the cursor. No bounds check is performed.
  unsigned char prev() const { return *(pos_ - 1); }

  // Byte under the cursor, without consuming it.
  unsigned char peek() const { return pos_[0]; }

  // Current cursor position.
  const char* cur() const { return pos_; }

  // True when the cursor rests on the terminating '\0'.
  bool eos() const { return pos_[0] == '\0'; }

  // Repositions the cursor; the pointer is stored as given.
  void setCur(const char* new_cur) { pos_ = new_cur; }

private:
  const char* pos_;
};
23
+
24
// Forward cursor over the half-open byte range [beg, end). Unlike CharStream
// the end is given explicitly, so the range need not be NUL-terminated.
class RangeCharStream {
public:
  RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {}

  // Returns the byte under the cursor and steps past it; returns '\0' without
  // moving once the end of the range has been reached.
  unsigned char read() { return eos() ? '\0' : *cur_++; }

  // Byte immediately before the cursor. No bounds check is performed, so this
  // must not be called before at least one byte has been consumed.
  unsigned char prev() const { return cur_[-1]; }

  // Byte under the cursor without consuming it. At the end of the range this
  // yields '\0' (mirroring read()) instead of dereferencing `end`: the byte at
  // `end` lies outside the range, and if it happened to be a UTF-8
  // continuation byte a "skip continuation bytes" loop built on peek()/read()
  // would never terminate, because read() no longer advances there.
  unsigned char peek() const { return eos() ? '\0' : *cur_; }

  const char* cur() const { return cur_; }
  const char* end() const { return end_; }

  // True once the cursor has reached `end`.
  bool eos() const { return cur_ == end_; }

private:
  const char* cur_;
  const char* end_;
};
38
+
39
// Presents two NUL-terminated strings as one logical stream: bytes are taken
// from `first` until its terminator is reached, then from `second`.
class CompoundCharStream {
public:
  CompoundCharStream(const char* first, const char* second)
    : beg1(first), beg2(second), cur1(beg1), cur2(beg2) {}

  // Consumes one byte; returns '\0' once both strings are exhausted.
  unsigned char read() { return !eos1() ? read1() : read2(); }

  // Byte under the logical cursor without consuming it.
  unsigned char peek() const { return !eos1() ? *cur1 : *cur2; }

  // Byte before the logical cursor. While nothing has been consumed from the
  // second string this is taken from the first one. No bounds check is made.
  unsigned char prev() const { return !eos1() || beg2==cur2 ? cur1[-1] : cur2[-1]; }

  // Pointer to the logical cursor (inside whichever string is active).
  const char* cur() const { return !eos1() ? cur1 : cur2; }
  bool eos() const { return eos1() && eos2(); }
  bool within_first() const { return !eos1(); }

  // Number of bytes consumed so far, counted across both strings.
  unsigned offset() const { return cur1-beg1 + cur2-beg2; }

  // Moves the logical cursor to `p`, which must point into one of the two
  // strings. A position inside the first string rewinds the second string to
  // its beginning; any other position is taken to be inside the second string.
  //
  // Positions *ahead* of the current cursor in the first string are recognised
  // as well. Previously such a pointer was stored as the second string's
  // cursor, leaving cur() and offset() inconsistent until a later call
  // happened to repair the state.
  void setCur(const char* p) {
    if(points_into_first(p)) {
      cur1=p;
      cur2=beg2;
    } else {
      cur2=p;
    }
  }

protected:
  unsigned char read1() { return eos1() ? '\0' : *cur1++; }
  unsigned char read2() { return eos2() ? '\0' : *cur2++; }
  bool eos1() const { return *cur1=='\0'; }
  bool eos2() const { return *cur2=='\0'; }

private:
  // True when `p` addresses a byte of the first string (its terminator
  // included). The already consumed part is checked by range; the remaining
  // part is walked byte by byte up to the terminator, which is a short scan
  // in practice because callers only jump ahead by a few bytes.
  bool points_into_first(const char* p) const {
    if(beg1 <= p && p <= cur1)
      return true;
    for(const char* q=cur1; *q!='\0'; )
      if(++q == p)
        return true;
    return false;
  }

protected:
  const char* beg1;
  const char* beg2;
  const char* cur1;
  const char* cur2;
};
74
+
75
+ class CharStreamForComposition : public CompoundCharStream {
76
+ public:
77
+ CharStreamForComposition (const char* first, const char* second,
78
+ const std::vector<unsigned char>& canonical_classes,
79
+ std::string& buf)
80
+ : CompoundCharStream(first, second), classes(canonical_classes), skipped(buf)
81
+ {}
82
+
83
+ void init_skipinfo() {
84
+ skipped.clear();
85
+ skipped_tail = 0;
86
+ }
87
+
88
+ void mark_as_last_valid_point() {
89
+ skipped_tail = skipped.size();
90
+ marked_point = cur();
91
+ }
92
+
93
+ void reset_at_marked_point() {
94
+ setCur(marked_point);
95
+ }
96
+
97
+ void append_read_char_to_str(std::string& s, const char* beg) const {
98
+ if(eos1()==false) {
99
+ s.append(beg, cur());
100
+ } else {
101
+ s.append(beg, cur1);
102
+ s.append(beg2, cur());
103
+ }
104
+ }
105
+
106
+ void append_skipped_chars_to_str(std::string& s) const {
107
+ s.append(skipped.begin(), skipped.begin()+skipped_tail);
108
+ }
109
+
110
+ unsigned char get_canonical_class() const {
111
+ return offset() < classes.size() ? classes[offset()] : 0;
112
+ }
113
+
114
+ bool next_combining_char(unsigned char prev_class, const char* ppp) {
115
+ while(Util::is_utf8_char_start_byte(peek()) == false)
116
+ read();
117
+
118
+ unsigned char mid_class = get_prev_canonical_class();
119
+ unsigned char cur_class = get_canonical_class();
120
+
121
+ if(prev_class==0 && mid_class==0 && cur_class!=0)
122
+ return false;
123
+
124
+ if(prev_class < cur_class && mid_class < cur_class) {
125
+ skipped.append(ppp, cur());
126
+ return true;
127
+ } else {
128
+ if(cur_class != 0) {
129
+ read();
130
+ return next_combining_char(prev_class,ppp);
131
+ }
132
+ return false;
133
+ }
134
+ }
135
+
136
+ private:
137
+ unsigned char get_prev_canonical_class() const {
138
+ return offset()-1 < classes.size() ? classes[offset()-1] : 0;
139
+ }
140
+
141
+ private:
142
+ const std::vector<unsigned char>& classes;
143
+ std::string& skipped;
144
+ unsigned skipped_tail;
145
+ const char* marked_point;
146
+ };
147
+ }
148
+ }
149
+
150
+ #endif
@@ -0,0 +1,25 @@
1
+ #ifndef UNF_TRIE_NODE_HH
2
+ #define UNF_TRIE_NODE_HH
3
+
4
+ namespace UNF {
5
+ namespace Trie {
6
// One cell of the trie table, packed into a single unsigned value:
// the low 24 bits hold a base/value field, the bits above hold a check byte.
class Node {
public:
  // Index reached from this node by following byte `ch`.
  unsigned jump(unsigned char ch) const {
    const unsigned origin = low24();
    return origin + ch;
  }

  // The 24-bit payload, read as a value.
  unsigned value() const { return low24(); }

  // The check field stored above the 24-bit payload.
  unsigned check_char() const { return bits_ >> 24; }

  // The raw packed representation.
  unsigned to_uint() const { return bits_; }

  // Views an array of packed unsigned values as an array of nodes.
  static const Node* from_uint_array(const unsigned* node_uints) {
    return reinterpret_cast<const Node*>(node_uints);
  }

private:
  unsigned low24() const { return bits_ & 0xFFFFFF; }

private:
  unsigned bits_;
};
22
+ }
23
+ }
24
+
25
+ #endif
@@ -0,0 +1,194 @@
1
+ #ifndef UNF_TRIE_SEARCHER_HH
2
+ #define UNF_TRIE_SEARCHER_HH
3
+
4
+ #include "char_stream.hh"
5
+ #include "node.hh"
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
+ class Searcher {
11
+ public:
12
+ Searcher(const Node* nodes, unsigned root, const char* value=NULL)
13
+ : nodes(nodes), root(root), value(value) {}
14
+
15
+ unsigned find_value(const char* key, int default_value) const {
16
+ unsigned node_index=root;
17
+ for(CharStream in(key);; in.read()) {
18
+ node_index = nodes[node_index].jump(in.peek());
19
+ if(nodes[node_index].check_char()==in.peek()) {
20
+ unsigned terminal_index = nodes[node_index].jump('\0');
21
+ if(nodes[terminal_index].check_char()=='\0') {
22
+ return nodes[terminal_index].value();
23
+ }
24
+ } else
25
+ return default_value;
26
+ }
27
+ }
28
+
29
+ protected:
30
+ const Node* nodes;
31
+ const unsigned root;
32
+ const char* value;
33
+ };
34
+
35
+ class CanonicalCombiningClass : private Searcher {
36
+ public:
37
+ CanonicalCombiningClass(const unsigned* node_uints, unsigned root)
38
+ : Searcher(Node::from_uint_array(node_uints), root) {}
39
+
40
+ unsigned get_class(const char* str) const { return find_value(str,0); }
41
+
42
+ void sort(char* str, std::vector<unsigned char>& classes) const {
43
+ CharStream in(str);
44
+ unsigned sort_beg=0;
45
+ unsigned sort_end=0;
46
+ unsigned unicode_char_count=0;
47
+
48
+ loop_head:
49
+ unsigned beg = in.cur()-str;
50
+
51
+ for(unsigned node_index=root;;){
52
+ node_index = nodes[node_index].jump(in.read());
53
+
54
+ if(nodes[node_index].check_char()==in.prev()) {
55
+ unsigned terminal_index = nodes[node_index].jump('\0');
56
+ if(nodes[terminal_index].check_char()=='\0') {
57
+ if((unicode_char_count++)==0)
58
+ sort_beg = beg;
59
+ sort_end = in.cur()-str;
60
+
61
+ unsigned char klass = nodes[terminal_index].value();
62
+ for(unsigned i=beg; i < sort_end; i++)
63
+ classes[i] = klass;
64
+ break;
65
+ }
66
+ } else {
67
+ if(unicode_char_count > 1)
68
+ bubble_sort(str, classes, sort_beg, sort_end);
69
+ unicode_char_count = 0;
70
+ break;
71
+ }
72
+ }
73
+ Util::eat_until_utf8_char_start_point(in);
74
+
75
+ if(in.eos()==false)
76
+ goto loop_head;
77
+
78
+ if(unicode_char_count > 1)
79
+ bubble_sort(str, classes, sort_beg, sort_end);
80
+ }
81
+
82
+ private:
83
+ void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const {
84
+ for(unsigned limit=beg, next=end; limit != next;) {
85
+ limit = next;
86
+ for(unsigned i=beg+1; i < limit; i++)
87
+ if(canonical_classes[i-1] > canonical_classes[i]) {
88
+ std::swap(canonical_classes[i-1], canonical_classes[i]);
89
+ std::swap(str[i-1], str[i]);
90
+ next = i;
91
+ }
92
+ }
93
+ }
94
+ };
95
+
96
+ class NormalizationForm : private Searcher {
97
+ public:
98
+ NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL)
99
+ : Searcher(Node::from_uint_array(node_uints), root, value) {}
100
+
101
+ bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
102
+
103
+ void decompose(RangeCharStream in, std::string& buffer) const {
104
+ loop_head:
105
+ const char* beg = in.cur();
106
+
107
+ for(unsigned node_index=root;;) {
108
+ node_index = nodes[node_index].jump(in.read());
109
+ if(nodes[node_index].check_char()==in.prev()) {
110
+ unsigned terminal_index = nodes[node_index].jump('\0');
111
+ if(nodes[terminal_index].check_char()=='\0') {
112
+ word_append(buffer, value, nodes[terminal_index].value());
113
+ beg = in.cur();
114
+ break;
115
+ }
116
+ } else {
117
+ Util::eat_until_utf8_char_start_point(in);
118
+ buffer.append(beg, in.cur());
119
+ break;
120
+ }
121
+ }
122
+
123
+ if(in.eos()==false)
124
+ goto loop_head;
125
+ }
126
+
127
+ void compose(CharStreamForComposition& in, std::string& buf) const {
128
+ in.init_skipinfo();
129
+
130
+ const char* const beg = in.cur();
131
+ const char* current_char_head = in.cur();
132
+ unsigned composed_char_info = 0;
133
+
134
+ unsigned node_index = root;
135
+ unsigned retry_root_node = root;
136
+ unsigned char retry_root_class = 0;
137
+
138
+ for(bool first=true;;) {
139
+ if(Util::is_utf8_char_start_byte(in.peek())) {
140
+ if(node_index != root)
141
+ first=false;
142
+ current_char_head = in.cur();
143
+
144
+ retry_root_node = node_index;
145
+ retry_root_class = in.get_canonical_class();
146
+ }
147
+
148
+ retry:
149
+ unsigned next_index = nodes[node_index].jump(in.peek());
150
+ if(nodes[next_index].check_char()==in.read()) {
151
+ // succeeded
152
+ node_index = next_index;
153
+ unsigned terminal_index = nodes[node_index].jump('\0');
154
+ if(nodes[terminal_index].check_char()=='\0') {
155
+ composed_char_info = nodes[terminal_index].value();
156
+
157
+ in.mark_as_last_valid_point();
158
+ if(in.eos() || retry_root_class > in.get_canonical_class())
159
+ break;
160
+ }
161
+ } else if (first==true) {
162
+ // no retry if current point is a part of first starter
163
+ break;
164
+ } else if (in.next_combining_char(retry_root_class, current_char_head)==true) {
165
+ // back previous code-point and retry
166
+ node_index = retry_root_node;
167
+ current_char_head = in.cur();
168
+ goto retry;
169
+ } else {
170
+ break;
171
+ }
172
+ }
173
+
174
+ if(composed_char_info != 0) {
175
+ // append composed unicode-character and skipped combining-characters
176
+ word_append(buf, value, composed_char_info);
177
+ in.append_skipped_chars_to_str(buf);
178
+ in.reset_at_marked_point();
179
+ } else {
180
+ // append one unicode-character
181
+ in.setCur(Util::nearest_utf8_char_start_point(beg+1));
182
+ in.append_read_char_to_str(buf, beg);
183
+ }
184
+ }
185
+
186
+ private:
187
+ static void word_append(std::string& buffer, const char* base, unsigned pos_info) {
188
+ buffer.append(base+(pos_info&0x3FFFF), pos_info>>18);
189
+ }
190
+ };
191
+ }
192
+ }
193
+
194
+ #endif
@@ -0,0 +1,24 @@
1
+ #ifndef UNF_UTIL_HH
2
+ #define UNF_UTIL_HH
3
+
4
namespace UNF {
  namespace Util {
    // True for every byte that can begin a UTF-8 character: ASCII bytes and
    // bytes with the two top bits set. Only bytes of the form 10xxxxxx
    // (continuation bytes) yield false.
    inline bool is_utf8_char_start_byte(char byte) {
      return (byte & 0xC0) != 0x80;
    }

    // Returns the first position at or after `s` that holds a start byte.
    // A '\0' byte counts as a start byte, so the scan stops at a terminator.
    inline const char* nearest_utf8_char_start_point(const char* s) {
      while(!is_utf8_char_start_byte(*s))
        ++s;
      return s;
    }

    // Consumes bytes from `in` until the byte under its cursor is a start
    // byte. `in` only needs to provide peek() and read().
    template <class CharStream>
    inline void eat_until_utf8_char_start_point(CharStream& in) {
      while(!is_utf8_char_start_byte(in.peek()))
        in.read();
    }
  }
}
23
+
24
+ #endif
@@ -0,0 +1,75 @@
1
+ #include "unf/normalizer.hh"
2
+
3
+ #include <ruby.h>
4
+ #if defined(HAVE_RUBY_ENCODING_H)
5
+ #include <ruby/encoding.h>
6
+ #endif
7
+
8
+ extern "C" {
9
+ VALUE unf_allocate(VALUE klass);
10
+ VALUE unf_initialize(VALUE self);
11
+ void unf_delete(UNF::Normalizer* ptr);
12
+ VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form);
13
+
14
+ ID FORM_NFD;
15
+ ID FORM_NFC;
16
+ ID FORM_NFKD;
17
+ ID FORM_NFKC;
18
+
19
+ void Init_unf_ext() {
20
+ VALUE mdl = rb_define_module("UNF");
21
+
22
+ VALUE cls = rb_define_class_under(mdl, "Normalizer", rb_cObject);
23
+ rb_define_alloc_func(cls, unf_allocate);
24
+ rb_define_method(cls, "initialize", (VALUE (*)(...))unf_initialize, 0);
25
+ rb_define_method(cls, "normalize", (VALUE (*)(...))unf_normalize, 2);
26
+
27
+ FORM_NFD = rb_intern("nfd");
28
+ FORM_NFC = rb_intern("nfc");
29
+ FORM_NFKD= rb_intern("nfkd");
30
+ FORM_NFKC= rb_intern("nfkc");
31
+ }
32
+
33
+
34
+ VALUE unf_allocate(VALUE klass) {
35
+ UNF::Normalizer* ptr;
36
+ VALUE obj = Data_Make_Struct(klass, UNF::Normalizer, NULL, unf_delete, ptr);
37
+ new ((void*)ptr) UNF::Normalizer;
38
+ return obj;
39
+ }
40
+
41
+ VALUE unf_initialize(VALUE self) {
42
+ return self;
43
+ }
44
+
45
+ void unf_delete(UNF::Normalizer* ptr) {
46
+ ptr->~Normalizer();
47
+ ruby_xfree(ptr);
48
+ }
49
+
50
+ VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form) {
51
+ UNF::Normalizer* ptr;
52
+ Data_Get_Struct(self, UNF::Normalizer, ptr);
53
+
54
+ const char* src = StringValueCStr(source);
55
+ const char* rlt;
56
+ ID form_id = SYM2ID(normalization_form);
57
+
58
+ if(form_id == FORM_NFD)
59
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFD);
60
+ else if(form_id == FORM_NFC)
61
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFC);
62
+ else if(form_id == FORM_NFKD)
63
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKD);
64
+ else if(form_id == FORM_NFKC)
65
+ rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKC);
66
+ else
67
+ rb_raise(rb_eArgError, "Specified Normalization-Form is unknown. Please select one from among :nfc, :nfd, :nfkc, :nfkd.");
68
+
69
+ #if defined(HAVE_RUBY_ENCODING_H)
70
+ return rb_enc_str_new(rlt, strlen(rlt), rb_utf8_encoding());
71
+ #else
72
+ return rb_str_new2(rlt);
73
+ #endif
74
+ }
75
+ }
Binary file
@@ -0,0 +1,5 @@
1
# Defines UNF::Normalizer::VERSION, the version string of this package.
+ module UNF
2
+ class Normalizer
3
# Version string as plain text, including the "beta" suffix.
+ VERSION = "0.0.8.2.beta"
4
+ end
5
+ end
data/lib/unf_ext.rb ADDED
@@ -0,0 +1,5 @@
1
# Loads the native extension. The copy stored under a directory named after
# the running Ruby's "major.minor" version (taken from RUBY_VERSION) is tried
# first; if that raises LoadError, a plain "unf_ext.so" found on the load path
# is used instead.
+ begin
2
+ require "#{RUBY_VERSION[/\A[0-9]+\.[0-9]+/]}/unf_ext.so"
3
+ rescue LoadError
4
+ require "unf_ext.so"
5
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
# Shared setup for the test files: activates the bundled gems, loads
# test/unit, extends the load path and requires the extension under test.
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
# Activate the gems of the :default and :development groups.
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
# Report the Bundler problem, hint at the fix, and exit with Bundler's status.
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+
12
# Prepend the project's lib directory, the project root and this directory to
# the load path (the last unshift ends up first).
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'unf_ext'
16
+
17
# Reopens the test base class without adding anything to it.
+ class Test::Unit::TestCase
18
+ end