unf_ext 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ #ifndef UNF_TRIE_CHAR_STREAM_HH
2
+ #define UNF_TRIE_CHAR_STREAM_HH
3
+
4
+ #include <vector>
5
+ #include <string>
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
// Forward cursor over a NUL-terminated byte string.
class CharStream {
public:
  // Positions the cursor on the first byte of `str`.
  CharStream(const char* str) : cur_(str) {}

  // Returns the byte under the cursor and steps past it. At the
  // terminating NUL the cursor stays put and '\0' is returned.
  unsigned char read() {
    if(eos())
      return '\0';
    const unsigned char byte = *cur_;
    ++cur_;
    return byte;
  }

  // Byte immediately before the cursor; the caller must make sure there is one.
  unsigned char prev() const { return *(cur_ - 1); }

  // Byte under the cursor, without advancing.
  unsigned char peek() const { return cur_[0]; }

  // Current cursor position.
  const char* cur() const { return cur_; }

  // True once the cursor rests on the terminating NUL.
  bool eos() const { return peek() == '\0'; }

  // Moves the cursor to an arbitrary position.
  void setCur(const char* new_cur) { cur_ = new_cur; }

private:
  const char* cur_;
};
23
+
24
// Forward cursor over the half-open byte range [beg, end).
// Unlike CharStream, the end is given explicitly rather than by a NUL byte.
class RangeCharStream {
public:
  RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {}

  // Returns the byte under the cursor and steps past it. At the end of the
  // range the cursor stays put and '\0' is returned.
  unsigned char read() { return eos() ? '\0' : *cur_++; }

  // Byte immediately before the cursor; the caller must make sure there is one.
  unsigned char prev() const { return cur_[-1]; }

  // Byte under the cursor without advancing, or '\0' at the end of the range.
  // The end check keeps this consistent with read(): the byte at `end` is
  // not part of the stream, and a loop that peeks/reads until some byte
  // appears could otherwise spin forever on a byte read() never consumes.
  unsigned char peek() const { return eos() ? '\0' : *cur_; }

  // Current cursor position.
  const char* cur() const { return cur_; }

  // One-past-the-last position of the range.
  const char* end() const { return end_; }

  // True once the cursor has reached `end`.
  bool eos() const { return cur_ == end_; }

private:
  const char* cur_;
  const char* end_;
};
38
+
39
// Cursor that reads two NUL-terminated strings as if they were one:
// all bytes of `first`, then all bytes of `second`.
class CompoundCharStream {
public:
  CompoundCharStream(const char* first, const char* second)
    : beg1(first), beg2(second), cur1(beg1), cur2(beg2) {}

  // Returns the next byte and advances; '\0' once both strings are exhausted.
  unsigned char read() { return !eos1() ? read1() : read2(); }

  // Byte under the cursor, without advancing.
  unsigned char peek() const { return !eos1() ? *cur1 : *cur2; }

  // Byte immediately before the cursor. While nothing has been read from the
  // second string, that byte is taken from the first one.
  unsigned char prev() const { return !eos1() || beg2==cur2 ? cur1[-1] : cur2[-1]; }

  // Current position: inside the first string until it is exhausted,
  // afterwards inside the second one.
  const char* cur() const { return !eos1() ? cur1 : cur2; }

  // True once both strings are exhausted.
  bool eos() const { return eos1() && eos2(); }

  // True while the first string still has unread bytes.
  bool within_first() const { return !eos1(); }

  // Total number of bytes consumed from both strings.
  // Each difference is taken separately: the unparenthesised form
  // `cur1-beg1 + cur2-beg2` adds an integer to `cur2` first and can form a
  // pointer outside the second string before subtracting `beg2`.
  unsigned offset() const {
    return static_cast<unsigned>((cur1 - beg1) + (cur2 - beg2));
  }

  // Repositions the cursor. A pointer in [beg1, cur1] rewinds the first
  // string and restarts the second; any other pointer is stored as the
  // position in the second string.
  void setCur(const char* p) {
    if(beg1 <= p && p <= cur1) {
      cur1=p;
      cur2=beg2;
    } else {
      cur2=p;
    }
  }

protected:
  // Per-string primitives; each stops at its own terminating NUL.
  unsigned char read1() { return eos1() ? '\0' : *cur1++; }
  unsigned char read2() { return eos2() ? '\0' : *cur2++; }
  bool eos1() const { return *cur1=='\0'; }
  bool eos2() const { return *cur2=='\0'; }

protected:
  const char* beg1;  // start of the first string
  const char* beg2;  // start of the second string
  const char* cur1;  // cursor within the first string
  const char* cur2;  // cursor within the second string
};
74
+
75
+ class CharStreamForComposition : public CompoundCharStream {
76
+ public:
77
+ CharStreamForComposition (const char* first, const char* second,
78
+ const std::vector<unsigned char>& canonical_classes,
79
+ std::string& buf)
80
+ : CompoundCharStream(first, second), classes(canonical_classes), skipped(buf)
81
+ {}
82
+
83
+ void init_skipinfo() {
84
+ skipped.clear();
85
+ skipped_tail = 0;
86
+ }
87
+
88
+ void mark_as_last_valid_point() {
89
+ skipped_tail = skipped.size();
90
+ marked_point = cur();
91
+ }
92
+
93
+ void reset_at_marked_point() {
94
+ setCur(marked_point);
95
+ }
96
+
97
+ void append_read_char_to_str(std::string& s, const char* beg) const {
98
+ if(eos1()==false) {
99
+ s.append(beg, cur());
100
+ } else {
101
+ s.append(beg, cur1);
102
+ s.append(beg2, cur());
103
+ }
104
+ }
105
+
106
+ void append_skipped_chars_to_str(std::string& s) const {
107
+ s.append(skipped.begin(), skipped.begin()+skipped_tail);
108
+ }
109
+
110
+ unsigned char get_canonical_class() const {
111
+ return offset() < classes.size() ? classes[offset()] : 0;
112
+ }
113
+
114
+ bool next_combining_char(unsigned char prev_class, const char* ppp) {
115
+ while(Util::is_utf8_char_start_byte(peek()) == false)
116
+ read();
117
+
118
+ unsigned char mid_class = get_prev_canonical_class();
119
+ unsigned char cur_class = get_canonical_class();
120
+
121
+ if(prev_class==0 && mid_class==0 && cur_class!=0)
122
+ return false;
123
+
124
+ if(prev_class < cur_class && mid_class < cur_class) {
125
+ skipped.append(ppp, cur());
126
+ return true;
127
+ } else {
128
+ if(cur_class != 0) {
129
+ read();
130
+ return next_combining_char(prev_class,ppp);
131
+ }
132
+ return false;
133
+ }
134
+ }
135
+
136
+ private:
137
+ unsigned char get_prev_canonical_class() const {
138
+ return offset()-1 < classes.size() ? classes[offset()-1] : 0;
139
+ }
140
+
141
+ private:
142
+ const std::vector<unsigned char>& classes;
143
+ std::string& skipped;
144
+ unsigned skipped_tail;
145
+ const char* marked_point;
146
+ };
147
+ }
148
+ }
149
+
150
+ #endif
data/unf/trie/node.hh ADDED
@@ -0,0 +1,33 @@
1
+ #ifndef UNF_TRIE_NODE_HH
2
+ #define UNF_TRIE_NODE_HH
3
+
4
+ namespace UNF {
5
+ namespace Trie {
6
// One cell of the trie's node array, packed into a single unsigned:
// the top 8 bits hold the check character, the low 24 bits hold the
// base index (or, for a terminal node, the stored value).
class Node {
public:
  // A fresh node has all bits set, which is_unused() reports as unused.
  Node() : data(0xFFFFFFFF) {}

  // Replaces the low 24 bits; the check character is kept.
  void set_base_index(unsigned base_index) { data = (data&0xFF000000)+(base_index&0x00FFFFFF); }

  // The value shares storage with the base index.
  void set_value(unsigned value) { set_base_index(value); }

  // Replaces the top 8 bits; the base index is kept.
  // The shift is done on an unsigned operand: `ch << 24` would promote to
  // signed int and shift into the sign bit for ch >= 0x80.
  void set_check_char(unsigned char ch) { data = (static_cast<unsigned>(ch) << 24) + base(); }

  bool is_unused() const { return data==0xFFFFFFFF; }

  // Index of the node reached from this one by following `ch`.
  unsigned jump(unsigned char ch) const { return base() + ch; }
  unsigned value() const { return base(); }
  unsigned check_char() const { return data>>24; }

  // Raw packed representation.
  unsigned to_uint() const { return data; }

  // Views an array of packed unsigned values as an array of nodes.
  // NOTE(review): relies on Node having exactly the layout of one unsigned.
  static const Node* from_uint_array(const unsigned* node_uints)
  { return reinterpret_cast<const Node*>(node_uints); }

private:
  unsigned base() const { return data & 0xFFFFFF; }

private:
  unsigned data;
};
30
+ }
31
+ }
32
+
33
+ #endif
@@ -0,0 +1,186 @@
1
+ #ifndef UNF_TRIE_SEARCHER_HH
2
+ #define UNF_TRIE_SEARCHER_HH
3
+
4
+ #include "char_stream.hh"
5
+ #include "node.hh"
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
+ class Searcher {
11
+ public:
12
+ Searcher(const Node* nodes, const char* value=NULL)
13
+ : nodes(nodes), value(value) {}
14
+
15
+ unsigned find_value(const char* key, int default_value) const {
16
+ unsigned node_index=0;
17
+ for(CharStream in(key);; in.read()) {
18
+ node_index = nodes[node_index].jump(in.peek());
19
+ if(nodes[node_index].check_char()==in.peek()) {
20
+ unsigned terminal_index = nodes[node_index].jump('\0');
21
+ if(nodes[terminal_index].check_char()=='\0')
22
+ return nodes[terminal_index].value();
23
+ } else
24
+ return default_value;
25
+ }
26
+ }
27
+
28
+ protected:
29
+ const Node* nodes;
30
+ const char* value;
31
+ };
32
+
33
+ class CanonicalCombiningClass : private Searcher {
34
+ public:
35
+ CanonicalCombiningClass(const unsigned* node_uints)
36
+ : Searcher(Node::from_uint_array(node_uints)) {}
37
+
38
+ unsigned get_class(const char* str) const { return find_value(str,0); }
39
+
40
+ void sort(char* str, std::vector<unsigned char>& classes) const {
41
+ CharStream in(str);
42
+ unsigned sort_beg=0;
43
+ unsigned sort_end=0;
44
+ unsigned unicode_char_count=0;
45
+
46
+ loop_head:
47
+ unsigned beg = in.cur()-str;
48
+
49
+ for(unsigned node_index=0;;){
50
+ node_index = nodes[node_index].jump(in.read());
51
+
52
+ if(nodes[node_index].check_char()==in.prev()) {
53
+ unsigned terminal_index = nodes[node_index].jump('\0');
54
+ if(nodes[terminal_index].check_char()=='\0') {
55
+ if((unicode_char_count++)==0)
56
+ sort_beg = beg;
57
+ sort_end = in.cur()-str;
58
+
59
+ unsigned char klass = nodes[terminal_index].value();
60
+ for(unsigned i=beg; i < sort_end; i++)
61
+ classes[i] = klass;
62
+ break;
63
+ }
64
+ } else {
65
+ if(unicode_char_count > 1)
66
+ bubble_sort(str, classes, sort_beg, sort_end);
67
+ unicode_char_count = 0;
68
+ break;
69
+ }
70
+ }
71
+ Util::eat_until_utf8_char_start_point(in);
72
+
73
+ if(in.eos()==false)
74
+ goto loop_head;
75
+
76
+ if(unicode_char_count > 1)
77
+ bubble_sort(str, classes, sort_beg, sort_end);
78
+ }
79
+
80
+ private:
81
+ void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const {
82
+ for(unsigned limit=beg, next=end; limit != next;) {
83
+ limit = next;
84
+ for(unsigned i=beg+1; i < limit; i++)
85
+ if(canonical_classes[i-1] > canonical_classes[i]) {
86
+ std::swap(canonical_classes[i-1], canonical_classes[i]);
87
+ std::swap(str[i-1], str[i]);
88
+ next = i;
89
+ }
90
+ }
91
+ }
92
+ };
93
+
94
+ class NormalizationForm : private Searcher {
95
+ public:
96
+ NormalizationForm(const unsigned* node_uints, const char* value=NULL)
97
+ : Searcher(Node::from_uint_array(node_uints), value) {}
98
+
99
+ bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
100
+
101
+ void decompose(RangeCharStream in, std::string& buffer) const {
102
+ loop_head:
103
+ const char* beg = in.cur();
104
+
105
+ for(unsigned node_index=0;;) {
106
+ node_index = nodes[node_index].jump(in.read());
107
+ if(nodes[node_index].check_char()==in.prev()) {
108
+ unsigned terminal_index = nodes[node_index].jump('\0');
109
+ if(nodes[terminal_index].check_char()=='\0') {
110
+ buffer.append(value+nodes[terminal_index].value());
111
+ beg = in.cur();
112
+ break;
113
+ }
114
+ } else {
115
+ Util::eat_until_utf8_char_start_point(in);
116
+ buffer.append(beg, in.cur());
117
+ break;
118
+ }
119
+ }
120
+
121
+ if(in.eos()==false)
122
+ goto loop_head;
123
+ }
124
+
125
+ void compose(CharStreamForComposition& in, std::string& buf) const {
126
+ in.init_skipinfo();
127
+
128
+ const char* const beg = in.cur();
129
+ const char* current_char_head = in.cur();
130
+ const char* composed_char = NULL;
131
+
132
+ unsigned node_index = 0;
133
+ unsigned retry_root_node = 0;
134
+ unsigned char retry_root_class = 0;
135
+
136
+ for(bool first=true;;) {
137
+ if(Util::is_utf8_char_start_byte(in.peek())) {
138
+ if(node_index != 0)
139
+ first=false;
140
+ current_char_head = in.cur();
141
+
142
+ retry_root_node = node_index;
143
+ retry_root_class = in.get_canonical_class();
144
+ }
145
+
146
+ retry:
147
+ unsigned next_index = nodes[node_index].jump(in.read());
148
+ if(nodes[next_index].check_char()==in.prev()) {
149
+ // succeeded
150
+ node_index = next_index;
151
+ unsigned terminal_index = nodes[node_index].jump('\0');
152
+ if(nodes[terminal_index].check_char()=='\0') {
153
+ composed_char = value+nodes[terminal_index].value();
154
+ in.mark_as_last_valid_point();
155
+ if(in.eos() || retry_root_class > in.get_canonical_class())
156
+ break;
157
+ }
158
+ } else if (first==true) {
159
+ // no retry if current point is a part of first starter
160
+ break;
161
+ } else if (in.next_combining_char(retry_root_class, current_char_head)==true) {
162
+ // back previous code-point and retry
163
+ node_index = retry_root_node;
164
+ current_char_head = in.cur();
165
+ goto retry;
166
+ } else {
167
+ break;
168
+ }
169
+ }
170
+
171
+ if(composed_char) {
172
+ // append composed unicode-character and skipped combining-characters
173
+ buf.append(composed_char);
174
+ in.append_skipped_chars_to_str(buf);
175
+ in.reset_at_marked_point();
176
+ } else {
177
+ // append one unicode-character
178
+ in.setCur(Util::nearest_utf8_char_start_point(beg+1));
179
+ in.append_read_char_to_str(buf, beg);
180
+ }
181
+ }
182
+ };
183
+ }
184
+ }
185
+
186
+ #endif
data/unf/util.hh ADDED
@@ -0,0 +1,24 @@
1
+ #ifndef UNF_UTIL_HH
2
+ #define UNF_UTIL_HH
3
+
4
+ namespace UNF {
5
+ namespace Util {
6
+ inline bool is_utf8_char_start_byte(char byte) {
7
+ if(!(byte&0x80)) return true; // ascii
8
+ else if (byte&0x40) return true; // start of a UTF-8 character byte sequence
9
+ return false;
10
+ }
11
+
12
+ inline const char* nearest_utf8_char_start_point(const char* s) {
13
+ for(; is_utf8_char_start_byte(*s)==false; s++);
14
+ return s;
15
+ }
16
+
17
+ template <class CharStream>
18
+ inline void eat_until_utf8_char_start_point(CharStream& in) {
19
+ for(; is_utf8_char_start_byte(in.peek())==false; in.read());
20
+ }
21
+ }
22
+ }
23
+
24
+ #endif
data/unf_ext.gemspec ADDED
@@ -0,0 +1,67 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
Gem::Specification.new do |s|
  # Identity
  s.name = "unf_ext"
  s.version = "0.0.3"

  if s.respond_to?(:required_rubygems_version=)
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end

  # Descriptive metadata
  s.authors = ["Takeru Ohta", "Akinori MUSHA"]
  s.date = "2011-10-24"
  s.description = "Unicode Normalization Form support library for CRuby"
  s.email = "knu@idaemons.org"
  s.extensions = ["extconf.rb"]
  s.extra_rdoc_files = ["LICENSE.txt", "README.md"]

  # Packaged files
  s.files = [
    ".document",
    "Gemfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "extconf.rb",
    "test/helper.rb",
    "test/normalization-test.txt",
    "test/test_unf_ext.rb",
    "unf.cc",
    "unf/normalizer.hh",
    "unf/table.hh",
    "unf/trie/char_stream.hh",
    "unf/trie/node.hh",
    "unf/trie/searcher.hh",
    "unf/util.hh",
    "unf_ext.gemspec"
  ]
  s.homepage = "http://github.com/knu/ruby-unf_ext"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.5"
  s.summary = "Unicode Normalization Form support library for CRuby"

  # Gems needed to build and test this package.
  tool_gems = [
    ["shoulda", [">= 0"]],
    ["bundler", ["~> 1.0.0"]],
    ["jeweler", ["~> 1.6.4"]],
    ["rcov", [">= 0"]]
  ]

  # They are registered as development dependencies only when the running
  # RubyGems supports specification versions and is at least 1.2.0;
  # otherwise they are registered as ordinary dependencies.
  as_development = false
  if s.respond_to?(:specification_version)
    s.specification_version = 3
    as_development = Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  end

  tool_gems.each do |gem_name, requirements|
    if as_development
      s.add_development_dependency(gem_name, requirements)
    else
      s.add_dependency(gem_name, requirements)
    end
  end
end
67
+