unf_ext 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,150 @@
1
+ #ifndef UNF_TRIE_CHAR_STREAM_HH
2
+ #define UNF_TRIE_CHAR_STREAM_HH
3
+
4
+ #include <vector>
5
+ #include <string>
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
// Cursor over a NUL-terminated byte string.
// The stream never owns the buffer; it only keeps a pointer into it.
class CharStream {
public:
  CharStream(const char* str) : cur_(str) {}

  // Returns the byte under the cursor and steps past it.
  // At the terminating NUL it returns '\0' and leaves the cursor in place.
  unsigned char read() {
    if(*cur_ == '\0')
      return '\0';
    const unsigned char byte = *cur_;
    ++cur_;
    return byte;
  }

  // Byte immediately before the cursor (the caller must have advanced at least once).
  unsigned char prev() const { return *(cur_ - 1); }

  // Byte under the cursor, without advancing.
  unsigned char peek() const { return cur_[0]; }

  // Current cursor position.
  const char* cur() const { return cur_; }

  // True once the cursor sits on the terminating NUL.
  bool eos() const { return cur_[0] == '\0'; }

  // Repositions the cursor.
  void setCur(const char* new_cur) { cur_ = new_cur; }

private:
  const char* cur_;
};
23
+
24
// Cursor over the half-open byte range [beg, end).
// Unlike CharStream, the range does not have to be NUL-terminated: the end
// of the stream is defined solely by the `end` pointer.
class RangeCharStream {
public:
  RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {}

  // Returns the byte under the cursor and advances; returns '\0' without
  // advancing once the end of the range has been reached.
  unsigned char read() { return eos() ? '\0' : *cur_++; }

  // Byte immediately before the cursor (the caller must have advanced at least once).
  unsigned char prev() const { return cur_[-1]; }

  // Byte under the cursor without advancing, or '\0' at the end of the range.
  // The end check keeps peek() consistent with read(): dereferencing `end`
  // would read outside the range, and a loop of the form
  // "while peek() is a continuation byte: read()" could never terminate,
  // because read() does not advance at the end.
  unsigned char peek() const { return eos() ? '\0' : *cur_; }

  // Current cursor position.
  const char* cur() const { return cur_; }

  // One-past-the-last position of the range.
  const char* end() const { return end_; }

  // True once the cursor has reached `end`.
  bool eos() const { return cur_ == end_; }

private:
  const char* cur_;
  const char* end_;
};
38
+
39
// Presents two NUL-terminated strings as one logical stream: bytes are taken
// from the first string until its terminator is reached, then from the second.
class CompoundCharStream {
public:
  CompoundCharStream(const char* first, const char* second)
    : beg1(first), beg2(second), cur1(beg1), cur2(beg2) {}

  // Consumes one byte from whichever string is currently active.
  unsigned char read() {
    if(eos1())
      return read2();
    return read1();
  }

  // Byte under the logical cursor, without advancing.
  unsigned char peek() const {
    if(eos1())
      return *cur2;
    return *cur1;
  }

  // Byte before the logical cursor. The second string is consulted only when
  // the first is exhausted and at least one byte of the second was consumed.
  unsigned char prev() const {
    if(eos1() && beg2 != cur2)
      return cur2[-1];
    return cur1[-1];
  }

  // Pointer to the logical cursor (inside the first or the second string).
  const char* cur() const {
    if(eos1())
      return cur2;
    return cur1;
  }

  // True when both strings are exhausted.
  bool eos() const {
    if(!eos1())
      return false;
    return eos2();
  }

  // True while the first string still has unread bytes.
  bool within_first() const { return !eos1(); }

  // Total number of bytes consumed from both strings.
  unsigned offset() const {
    return static_cast<unsigned>((cur1 - beg1) + (cur2 - beg2));
  }

  // Moves the cursor to p. A pointer lying between the start of the first
  // string and its current cursor rewinds the first string and resets the
  // second; any other pointer is taken as a position in the second string.
  void setCur(const char* p) {
    const bool in_first = (beg1 <= p) && (p <= cur1);
    if(!in_first) {
      cur2 = p;
      return;
    }
    cur1 = p;
    cur2 = beg2;
  }

protected:
  unsigned char read1() {
    if(eos1())
      return '\0';
    return *cur1++;
  }
  unsigned char read2() {
    if(eos2())
      return '\0';
    return *cur2++;
  }
  bool eos1() const { return cur1[0] == '\0'; }
  bool eos2() const { return cur2[0] == '\0'; }

protected:
  const char* beg1;
  const char* beg2;
  const char* cur1;
  const char* cur2;
};
74
+
75
+ class CharStreamForComposition : public CompoundCharStream {
76
+ public:
77
+ CharStreamForComposition (const char* first, const char* second,
78
+ const std::vector<unsigned char>& canonical_classes,
79
+ std::string& buf)
80
+ : CompoundCharStream(first, second), classes(canonical_classes), skipped(buf)
81
+ {}
82
+
83
+ void init_skipinfo() {
84
+ skipped.clear();
85
+ skipped_tail = 0;
86
+ }
87
+
88
+ void mark_as_last_valid_point() {
89
+ skipped_tail = skipped.size();
90
+ marked_point = cur();
91
+ }
92
+
93
+ void reset_at_marked_point() {
94
+ setCur(marked_point);
95
+ }
96
+
97
+ void append_read_char_to_str(std::string& s, const char* beg) const {
98
+ if(eos1()==false) {
99
+ s.append(beg, cur());
100
+ } else {
101
+ s.append(beg, cur1);
102
+ s.append(beg2, cur());
103
+ }
104
+ }
105
+
106
+ void append_skipped_chars_to_str(std::string& s) const {
107
+ s.append(skipped.begin(), skipped.begin()+skipped_tail);
108
+ }
109
+
110
+ unsigned char get_canonical_class() const {
111
+ return offset() < classes.size() ? classes[offset()] : 0;
112
+ }
113
+
114
+ bool next_combining_char(unsigned char prev_class, const char* ppp) {
115
+ while(Util::is_utf8_char_start_byte(peek()) == false)
116
+ read();
117
+
118
+ unsigned char mid_class = get_prev_canonical_class();
119
+ unsigned char cur_class = get_canonical_class();
120
+
121
+ if(prev_class==0 && mid_class==0 && cur_class!=0)
122
+ return false;
123
+
124
+ if(prev_class < cur_class && mid_class < cur_class) {
125
+ skipped.append(ppp, cur());
126
+ return true;
127
+ } else {
128
+ if(cur_class != 0) {
129
+ read();
130
+ return next_combining_char(prev_class,ppp);
131
+ }
132
+ return false;
133
+ }
134
+ }
135
+
136
+ private:
137
+ unsigned char get_prev_canonical_class() const {
138
+ return offset()-1 < classes.size() ? classes[offset()-1] : 0;
139
+ }
140
+
141
+ private:
142
+ const std::vector<unsigned char>& classes;
143
+ std::string& skipped;
144
+ unsigned skipped_tail;
145
+ const char* marked_point;
146
+ };
147
+ }
148
+ }
149
+
150
+ #endif
data/unf/trie/node.hh ADDED
@@ -0,0 +1,33 @@
1
+ #ifndef UNF_TRIE_NODE_HH
2
+ #define UNF_TRIE_NODE_HH
3
+
4
+ namespace UNF {
5
+ namespace Trie {
6
// One trie cell packed into a single unsigned value:
//   bits 24..31  check character
//   bits  0..23  base index (or, on terminal cells, the stored value)
// A cell whose bits are all set is treated as unused.
class Node {
public:
  Node() : data(0xFFFFFFFF) {}

  // Replaces the low 24 bits, keeping the check character.
  void set_base_index(unsigned base_index) { data = (data&0xFF000000) | (base_index&0x00FFFFFF); }

  // Terminal cells keep their value in the base-index field.
  void set_value(unsigned value) { set_base_index(value); }

  // Replaces the check character, keeping the low 24 bits.
  // The cast makes the shift happen in unsigned arithmetic; shifting the
  // promoted (signed) int would overflow into the sign bit for ch >= 0x80.
  void set_check_char(unsigned char ch) { data = (static_cast<unsigned>(ch) << 24) | base(); }

  bool is_unused() const { return data==0xFFFFFFFF; }

  // Index of the cell reached from this one by following byte `ch`.
  unsigned jump(unsigned char ch) const { return base() + ch; }
  unsigned value() const { return base(); }
  unsigned check_char() const { return data>>24; }
  unsigned to_uint() const { return data; }

  // Reinterprets an array of packed unsigned values as an array of nodes.
  static const Node* from_uint_array(const unsigned* node_uints)
  { return reinterpret_cast<const Node*>(node_uints); }

private:
  unsigned base() const { return data & 0xFFFFFF; }

private:
  unsigned data;
};
30
+ }
31
+ }
32
+
33
+ #endif
@@ -0,0 +1,186 @@
1
+ #ifndef UNF_TRIE_SEARCHER_HH
2
+ #define UNF_TRIE_SEARCHER_HH
3
+
4
+ #include "char_stream.hh"
5
+ #include "node.hh"
6
+ #include "../util.hh"
7
+
8
+ namespace UNF {
9
+ namespace Trie {
10
+ class Searcher {
11
+ public:
12
+ Searcher(const Node* nodes, const char* value=NULL)
13
+ : nodes(nodes), value(value) {}
14
+
15
+ unsigned find_value(const char* key, int default_value) const {
16
+ unsigned node_index=0;
17
+ for(CharStream in(key);; in.read()) {
18
+ node_index = nodes[node_index].jump(in.peek());
19
+ if(nodes[node_index].check_char()==in.peek()) {
20
+ unsigned terminal_index = nodes[node_index].jump('\0');
21
+ if(nodes[terminal_index].check_char()=='\0')
22
+ return nodes[terminal_index].value();
23
+ } else
24
+ return default_value;
25
+ }
26
+ }
27
+
28
+ protected:
29
+ const Node* nodes;
30
+ const char* value;
31
+ };
32
+
33
+ class CanonicalCombiningClass : private Searcher {
34
+ public:
35
+ CanonicalCombiningClass(const unsigned* node_uints)
36
+ : Searcher(Node::from_uint_array(node_uints)) {}
37
+
38
+ unsigned get_class(const char* str) const { return find_value(str,0); }
39
+
40
+ void sort(char* str, std::vector<unsigned char>& classes) const {
41
+ CharStream in(str);
42
+ unsigned sort_beg=0;
43
+ unsigned sort_end=0;
44
+ unsigned unicode_char_count=0;
45
+
46
+ loop_head:
47
+ unsigned beg = in.cur()-str;
48
+
49
+ for(unsigned node_index=0;;){
50
+ node_index = nodes[node_index].jump(in.read());
51
+
52
+ if(nodes[node_index].check_char()==in.prev()) {
53
+ unsigned terminal_index = nodes[node_index].jump('\0');
54
+ if(nodes[terminal_index].check_char()=='\0') {
55
+ if((unicode_char_count++)==0)
56
+ sort_beg = beg;
57
+ sort_end = in.cur()-str;
58
+
59
+ unsigned char klass = nodes[terminal_index].value();
60
+ for(unsigned i=beg; i < sort_end; i++)
61
+ classes[i] = klass;
62
+ break;
63
+ }
64
+ } else {
65
+ if(unicode_char_count > 1)
66
+ bubble_sort(str, classes, sort_beg, sort_end);
67
+ unicode_char_count = 0;
68
+ break;
69
+ }
70
+ }
71
+ Util::eat_until_utf8_char_start_point(in);
72
+
73
+ if(in.eos()==false)
74
+ goto loop_head;
75
+
76
+ if(unicode_char_count > 1)
77
+ bubble_sort(str, classes, sort_beg, sort_end);
78
+ }
79
+
80
+ private:
81
+ void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const {
82
+ for(unsigned limit=beg, next=end; limit != next;) {
83
+ limit = next;
84
+ for(unsigned i=beg+1; i < limit; i++)
85
+ if(canonical_classes[i-1] > canonical_classes[i]) {
86
+ std::swap(canonical_classes[i-1], canonical_classes[i]);
87
+ std::swap(str[i-1], str[i]);
88
+ next = i;
89
+ }
90
+ }
91
+ }
92
+ };
93
+
94
+ class NormalizationForm : private Searcher {
95
+ public:
96
+ NormalizationForm(const unsigned* node_uints, const char* value=NULL)
97
+ : Searcher(Node::from_uint_array(node_uints), value) {}
98
+
99
+ bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
100
+
101
+ void decompose(RangeCharStream in, std::string& buffer) const {
102
+ loop_head:
103
+ const char* beg = in.cur();
104
+
105
+ for(unsigned node_index=0;;) {
106
+ node_index = nodes[node_index].jump(in.read());
107
+ if(nodes[node_index].check_char()==in.prev()) {
108
+ unsigned terminal_index = nodes[node_index].jump('\0');
109
+ if(nodes[terminal_index].check_char()=='\0') {
110
+ buffer.append(value+nodes[terminal_index].value());
111
+ beg = in.cur();
112
+ break;
113
+ }
114
+ } else {
115
+ Util::eat_until_utf8_char_start_point(in);
116
+ buffer.append(beg, in.cur());
117
+ break;
118
+ }
119
+ }
120
+
121
+ if(in.eos()==false)
122
+ goto loop_head;
123
+ }
124
+
125
+ void compose(CharStreamForComposition& in, std::string& buf) const {
126
+ in.init_skipinfo();
127
+
128
+ const char* const beg = in.cur();
129
+ const char* current_char_head = in.cur();
130
+ const char* composed_char = NULL;
131
+
132
+ unsigned node_index = 0;
133
+ unsigned retry_root_node = 0;
134
+ unsigned char retry_root_class = 0;
135
+
136
+ for(bool first=true;;) {
137
+ if(Util::is_utf8_char_start_byte(in.peek())) {
138
+ if(node_index != 0)
139
+ first=false;
140
+ current_char_head = in.cur();
141
+
142
+ retry_root_node = node_index;
143
+ retry_root_class = in.get_canonical_class();
144
+ }
145
+
146
+ retry:
147
+ unsigned next_index = nodes[node_index].jump(in.read());
148
+ if(nodes[next_index].check_char()==in.prev()) {
149
+ // succeeded
150
+ node_index = next_index;
151
+ unsigned terminal_index = nodes[node_index].jump('\0');
152
+ if(nodes[terminal_index].check_char()=='\0') {
153
+ composed_char = value+nodes[terminal_index].value();
154
+ in.mark_as_last_valid_point();
155
+ if(in.eos() || retry_root_class > in.get_canonical_class())
156
+ break;
157
+ }
158
+ } else if (first==true) {
159
+ // no retry if current point is a part of first starter
160
+ break;
161
+ } else if (in.next_combining_char(retry_root_class, current_char_head)==true) {
162
+ // back previous code-point and retry
163
+ node_index = retry_root_node;
164
+ current_char_head = in.cur();
165
+ goto retry;
166
+ } else {
167
+ break;
168
+ }
169
+ }
170
+
171
+ if(composed_char) {
172
+ // append composed unicode-character and skipped combining-characters
173
+ buf.append(composed_char);
174
+ in.append_skipped_chars_to_str(buf);
175
+ in.reset_at_marked_point();
176
+ } else {
177
+ // append one unicode-character
178
+ in.setCur(Util::nearest_utf8_char_start_point(beg+1));
179
+ in.append_read_char_to_str(buf, beg);
180
+ }
181
+ }
182
+ };
183
+ }
184
+ }
185
+
186
+ #endif
data/unf/util.hh ADDED
@@ -0,0 +1,24 @@
1
+ #ifndef UNF_UTIL_HH
2
+ #define UNF_UTIL_HH
3
+
4
+ namespace UNF {
5
+ namespace Util {
6
// True for every byte except UTF-8 continuation bytes (bit pattern 10xxxxxx):
// that is, for ASCII bytes and for bytes with the two top bits set.
inline bool is_utf8_char_start_byte(char byte) {
  return (byte & 0xC0) != 0x80;
}
11
+
12
+ inline const char* nearest_utf8_char_start_point(const char* s) {
13
+ for(; is_utf8_char_start_byte(*s)==false; s++);
14
+ return s;
15
+ }
16
+
17
// Consumes bytes from `in` until the byte under its cursor is a character
// start byte. `in` only needs to provide peek() and read().
template <class CharStream>
inline void eat_until_utf8_char_start_point(CharStream& in) {
  while(!is_utf8_char_start_byte(in.peek()))
    in.read();
}
21
+ }
22
+ }
23
+
24
+ #endif
data/unf_ext.gemspec ADDED
@@ -0,0 +1,67 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
Gem::Specification.new do |s|
  s.name = "unf_ext"
  s.version = "0.0.3"

  if s.respond_to?(:required_rubygems_version=)
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end
  s.authors = ["Takeru Ohta", "Akinori MUSHA"]
  s.date = "2011-10-24"
  s.description = "Unicode Normalization Form support library for CRuby"
  s.email = "knu@idaemons.org"
  s.extensions = ["extconf.rb"]
  s.extra_rdoc_files = ["LICENSE.txt", "README.md"]
  s.files = [
    ".document",
    "Gemfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "extconf.rb",
    "test/helper.rb",
    "test/normalization-test.txt",
    "test/test_unf_ext.rb",
    "unf.cc",
    "unf/normalizer.hh",
    "unf/table.hh",
    "unf/trie/char_stream.hh",
    "unf/trie/node.hh",
    "unf/trie/searcher.hh",
    "unf/util.hh",
    "unf_ext.gemspec"
  ]
  s.homepage = "http://github.com/knu/ruby-unf_ext"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.5"
  s.summary = "Unicode Normalization Form support library for CRuby"

  # Tooling needed to build and test the gem, in declaration order.
  tooling = [
    ["shoulda", [">= 0"]],
    ["bundler", ["~> 1.0.0"]],
    ["jeweler", ["~> 1.6.4"]],
    ["rcov",    [">= 0"]]
  ]

  # The entries are declared as development dependencies only when the spec
  # supports specification_version and RubyGems is at least 1.2.0; otherwise
  # they are declared as ordinary dependencies.
  declare = :add_dependency
  if s.respond_to?(:specification_version)
    s.specification_version = 3
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
      declare = :add_development_dependency
    end
  end

  tooling.each do |gem_name, requirement|
    s.send(declare, gem_name, requirement)
  end
end
67
+