unf_ext 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +13 -0
- data/LICENSE.txt +22 -0
- data/README.md +58 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/extconf.rb +4 -0
- data/test/helper.rb +19 -0
- data/test/normalization-test.txt +108816 -0
- data/test/test_unf_ext.rb +41 -0
- data/unf.cc +75 -0
- data/unf/normalizer.hh +139 -0
- data/unf/table.hh +19004 -0
- data/unf/trie/char_stream.hh +150 -0
- data/unf/trie/node.hh +33 -0
- data/unf/trie/searcher.hh +186 -0
- data/unf/util.hh +24 -0
- data/unf_ext.gemspec +67 -0
- metadata +143 -0
@@ -0,0 +1,150 @@
|
|
1
|
+
#ifndef UNF_TRIE_CHAR_STREAM_HH
|
2
|
+
#define UNF_TRIE_CHAR_STREAM_HH
|
3
|
+
|
4
|
+
#include <vector>
|
5
|
+
#include <string>
|
6
|
+
#include "../util.hh"
|
7
|
+
|
8
|
+
namespace UNF {
|
9
|
+
namespace Trie {
|
10
|
+
// Cursor over a NUL-terminated byte string. The stream never owns the
// buffer; it only remembers the current position inside it.
class CharStream {
public:
  CharStream(const char* str) : cur_(str) {}

  // Returns the byte under the cursor and steps past it. At the terminator
  // the cursor stays put and '\0' is returned.
  unsigned char read() {
    if(*cur_ == '\0')
      return '\0';
    const char ch = *cur_;
    ++cur_;
    return ch;
  }

  // Byte immediately before the cursor (the caller must have advanced at least once).
  unsigned char prev() const { return *(cur_ - 1); }

  // Byte under the cursor, without advancing.
  unsigned char peek() const { return cur_[0]; }

  // Current position inside the underlying buffer.
  const char* cur() const { return cur_; }

  // True when the cursor rests on the terminating NUL.
  bool eos() const { return peek() == '\0'; }

  // Moves the cursor to an arbitrary position chosen by the caller.
  void setCur(const char* new_cur) { cur_ = new_cur; }

private:
  const char* cur_;
};
|
23
|
+
|
24
|
+
// Cursor over the half-open byte range [beg, end). The range does not have
// to be NUL-terminated; end-of-stream is decided purely by position.
class RangeCharStream {
public:
  RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {}

  // Returns the byte under the cursor and steps past it. At the end of the
  // range the cursor stays put and '\0' is returned.
  unsigned char read() { return eos() ? '\0' : *cur_++; }

  // Byte immediately before the cursor (the caller must have advanced at least once).
  unsigned char prev() const { return cur_[-1]; }

  // Byte under the cursor, or '\0' once the range is exhausted.
  // The end check matters: *end_ lies outside the range, and because read()
  // does not advance at eos, a loop that polls peek() and calls read() until
  // peek() changes would never terminate if that out-of-range byte kept
  // failing the loop's condition.
  unsigned char peek() const { return eos() ? '\0' : *cur_; }

  // Current position inside the range.
  const char* cur() const { return cur_; }

  // One-past-the-last position of the range.
  const char* end() const { return end_; }

  // True once the cursor has reached end().
  bool eos() const { return cur_ == end_; }

private:
  const char* cur_;
  const char* end_;
};
|
38
|
+
|
39
|
+
// Presents two NUL-terminated byte strings as one logical stream: bytes are
// taken from the first string until its terminator, then from the second.
class CompoundCharStream {
public:
  CompoundCharStream(const char* first, const char* second)
    : beg1(first), beg2(second), cur1(beg1), cur2(beg2) {}

  // Returns the next byte and advances; '\0' once both strings are exhausted.
  unsigned char read() { return !eos1() ? read1() : read2(); }

  // Next byte without advancing.
  unsigned char peek() const { return !eos1() ? *cur1 : *cur2; }

  // Byte immediately before the logical cursor. While nothing has been
  // consumed from the second string this is the byte before cur1.
  unsigned char prev() const { return (!eos1() || beg2==cur2) ? cur1[-1] : cur2[-1]; }

  // Physical position of the logical cursor (inside whichever string is active).
  const char* cur() const { return !eos1() ? cur1 : cur2; }

  // True when both strings are exhausted.
  bool eos() const { return eos1() && eos2(); }

  // True while the first string still has unread bytes.
  bool within_first() const { return !eos1(); }

  // Number of bytes consumed so far across both strings.
  // Each difference is taken inside its own buffer; the unparenthesized form
  // `cur1-beg1 + cur2-beg2` would first compute the pointer cur2+(cur1-beg1),
  // which may lie outside the second buffer.
  unsigned offset() const { return (cur1-beg1) + (cur2-beg2); }

  // Repositions the cursor. A pointer inside the already-consumed part of
  // the first string rewinds there and resets the second string; any other
  // pointer is taken as a position in the second string.
  void setCur(const char* p) {
    if(beg1 <= p && p <= cur1) {
      cur1=p;
      cur2=beg2;
    } else {
      cur2=p;
    }
  }

protected:
  // Per-string read helpers: same contract as read(), restricted to one string.
  unsigned char read1() { return eos1() ? '\0' : *cur1++; }
  unsigned char read2() { return eos2() ? '\0' : *cur2++; }
  bool eos1() const { return *cur1=='\0'; }
  bool eos2() const { return *cur2=='\0'; }

protected:
  const char* beg1;  // start of the first string
  const char* beg2;  // start of the second string
  const char* cur1;  // cursor inside the first string
  const char* cur2;  // cursor inside the second string
};
|
74
|
+
|
75
|
+
class CharStreamForComposition : public CompoundCharStream {
|
76
|
+
public:
|
77
|
+
CharStreamForComposition (const char* first, const char* second,
|
78
|
+
const std::vector<unsigned char>& canonical_classes,
|
79
|
+
std::string& buf)
|
80
|
+
: CompoundCharStream(first, second), classes(canonical_classes), skipped(buf)
|
81
|
+
{}
|
82
|
+
|
83
|
+
void init_skipinfo() {
|
84
|
+
skipped.clear();
|
85
|
+
skipped_tail = 0;
|
86
|
+
}
|
87
|
+
|
88
|
+
void mark_as_last_valid_point() {
|
89
|
+
skipped_tail = skipped.size();
|
90
|
+
marked_point = cur();
|
91
|
+
}
|
92
|
+
|
93
|
+
void reset_at_marked_point() {
|
94
|
+
setCur(marked_point);
|
95
|
+
}
|
96
|
+
|
97
|
+
void append_read_char_to_str(std::string& s, const char* beg) const {
|
98
|
+
if(eos1()==false) {
|
99
|
+
s.append(beg, cur());
|
100
|
+
} else {
|
101
|
+
s.append(beg, cur1);
|
102
|
+
s.append(beg2, cur());
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
void append_skipped_chars_to_str(std::string& s) const {
|
107
|
+
s.append(skipped.begin(), skipped.begin()+skipped_tail);
|
108
|
+
}
|
109
|
+
|
110
|
+
unsigned char get_canonical_class() const {
|
111
|
+
return offset() < classes.size() ? classes[offset()] : 0;
|
112
|
+
}
|
113
|
+
|
114
|
+
bool next_combining_char(unsigned char prev_class, const char* ppp) {
|
115
|
+
while(Util::is_utf8_char_start_byte(peek()) == false)
|
116
|
+
read();
|
117
|
+
|
118
|
+
unsigned char mid_class = get_prev_canonical_class();
|
119
|
+
unsigned char cur_class = get_canonical_class();
|
120
|
+
|
121
|
+
if(prev_class==0 && mid_class==0 && cur_class!=0)
|
122
|
+
return false;
|
123
|
+
|
124
|
+
if(prev_class < cur_class && mid_class < cur_class) {
|
125
|
+
skipped.append(ppp, cur());
|
126
|
+
return true;
|
127
|
+
} else {
|
128
|
+
if(cur_class != 0) {
|
129
|
+
read();
|
130
|
+
return next_combining_char(prev_class,ppp);
|
131
|
+
}
|
132
|
+
return false;
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
private:
|
137
|
+
unsigned char get_prev_canonical_class() const {
|
138
|
+
return offset()-1 < classes.size() ? classes[offset()-1] : 0;
|
139
|
+
}
|
140
|
+
|
141
|
+
private:
|
142
|
+
const std::vector<unsigned char>& classes;
|
143
|
+
std::string& skipped;
|
144
|
+
unsigned skipped_tail;
|
145
|
+
const char* marked_point;
|
146
|
+
};
|
147
|
+
}
|
148
|
+
}
|
149
|
+
|
150
|
+
#endif
|
data/unf/trie/node.hh
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#ifndef UNF_TRIE_NODE_HH
|
2
|
+
#define UNF_TRIE_NODE_HH
|
3
|
+
|
4
|
+
namespace UNF {
|
5
|
+
namespace Trie {
|
6
|
+
// One 32-bit trie cell: the top 8 bits hold the check character, the low
// 24 bits hold either the base index (inner cell) or a value (terminal cell).
class Node {
public:
  // A fresh cell has every bit set, which is the "unused" marker.
  Node() : data(0xFFFFFFFF) {}

  // Replaces the low 24 bits, leaving the check character intact.
  void set_base_index(unsigned base_index) { data = (data&0xFF000000)+(base_index&0x00FFFFFF); }

  // A value shares storage with the base index.
  void set_value(unsigned value) { set_base_index(value); }

  // Replaces the top 8 bits, leaving the low 24 bits intact.
  // The shift is done on an unsigned operand: `ch << 24` would promote ch to
  // int and overflow the sign bit for ch >= 0x80.
  void set_check_char(unsigned char ch) { data = (static_cast<unsigned>(ch) << 24) + base(); }

  // True while the cell still holds the all-ones pattern from construction.
  bool is_unused() const { return data==0xFFFFFFFF; }

  // Index of the cell reached from this one by following byte `ch`.
  unsigned jump(unsigned char ch) const { return base() + ch; }

  // Low 24 bits interpreted as a value.
  unsigned value() const { return base(); }

  // Top 8 bits: the byte that must have led to this cell.
  unsigned check_char() const { return data>>24; }

  // Raw 32-bit representation.
  unsigned to_uint() const { return data; }

  // Reinterprets an array of raw 32-bit cells as an array of Node.
  static const Node* from_uint_array(const unsigned* node_uints)
  { return reinterpret_cast<const Node*>(node_uints); }

private:
  unsigned base() const { return data & 0xFFFFFF; }

private:
  unsigned data;
};
|
30
|
+
}
|
31
|
+
}
|
32
|
+
|
33
|
+
#endif
|
@@ -0,0 +1,186 @@
|
|
1
|
+
#ifndef UNF_TRIE_SEARCHER_HH
|
2
|
+
#define UNF_TRIE_SEARCHER_HH
|
3
|
+
|
4
|
+
#include "char_stream.hh"
|
5
|
+
#include "node.hh"
|
6
|
+
#include "../util.hh"
|
7
|
+
|
8
|
+
namespace UNF {
|
9
|
+
namespace Trie {
|
10
|
+
class Searcher {
|
11
|
+
public:
|
12
|
+
Searcher(const Node* nodes, const char* value=NULL)
|
13
|
+
: nodes(nodes), value(value) {}
|
14
|
+
|
15
|
+
unsigned find_value(const char* key, int default_value) const {
|
16
|
+
unsigned node_index=0;
|
17
|
+
for(CharStream in(key);; in.read()) {
|
18
|
+
node_index = nodes[node_index].jump(in.peek());
|
19
|
+
if(nodes[node_index].check_char()==in.peek()) {
|
20
|
+
unsigned terminal_index = nodes[node_index].jump('\0');
|
21
|
+
if(nodes[terminal_index].check_char()=='\0')
|
22
|
+
return nodes[terminal_index].value();
|
23
|
+
} else
|
24
|
+
return default_value;
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
protected:
|
29
|
+
const Node* nodes;
|
30
|
+
const char* value;
|
31
|
+
};
|
32
|
+
|
33
|
+
class CanonicalCombiningClass : private Searcher {
|
34
|
+
public:
|
35
|
+
CanonicalCombiningClass(const unsigned* node_uints)
|
36
|
+
: Searcher(Node::from_uint_array(node_uints)) {}
|
37
|
+
|
38
|
+
unsigned get_class(const char* str) const { return find_value(str,0); }
|
39
|
+
|
40
|
+
void sort(char* str, std::vector<unsigned char>& classes) const {
|
41
|
+
CharStream in(str);
|
42
|
+
unsigned sort_beg=0;
|
43
|
+
unsigned sort_end=0;
|
44
|
+
unsigned unicode_char_count=0;
|
45
|
+
|
46
|
+
loop_head:
|
47
|
+
unsigned beg = in.cur()-str;
|
48
|
+
|
49
|
+
for(unsigned node_index=0;;){
|
50
|
+
node_index = nodes[node_index].jump(in.read());
|
51
|
+
|
52
|
+
if(nodes[node_index].check_char()==in.prev()) {
|
53
|
+
unsigned terminal_index = nodes[node_index].jump('\0');
|
54
|
+
if(nodes[terminal_index].check_char()=='\0') {
|
55
|
+
if((unicode_char_count++)==0)
|
56
|
+
sort_beg = beg;
|
57
|
+
sort_end = in.cur()-str;
|
58
|
+
|
59
|
+
unsigned char klass = nodes[terminal_index].value();
|
60
|
+
for(unsigned i=beg; i < sort_end; i++)
|
61
|
+
classes[i] = klass;
|
62
|
+
break;
|
63
|
+
}
|
64
|
+
} else {
|
65
|
+
if(unicode_char_count > 1)
|
66
|
+
bubble_sort(str, classes, sort_beg, sort_end);
|
67
|
+
unicode_char_count = 0;
|
68
|
+
break;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
Util::eat_until_utf8_char_start_point(in);
|
72
|
+
|
73
|
+
if(in.eos()==false)
|
74
|
+
goto loop_head;
|
75
|
+
|
76
|
+
if(unicode_char_count > 1)
|
77
|
+
bubble_sort(str, classes, sort_beg, sort_end);
|
78
|
+
}
|
79
|
+
|
80
|
+
private:
|
81
|
+
void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const {
|
82
|
+
for(unsigned limit=beg, next=end; limit != next;) {
|
83
|
+
limit = next;
|
84
|
+
for(unsigned i=beg+1; i < limit; i++)
|
85
|
+
if(canonical_classes[i-1] > canonical_classes[i]) {
|
86
|
+
std::swap(canonical_classes[i-1], canonical_classes[i]);
|
87
|
+
std::swap(str[i-1], str[i]);
|
88
|
+
next = i;
|
89
|
+
}
|
90
|
+
}
|
91
|
+
}
|
92
|
+
};
|
93
|
+
|
94
|
+
class NormalizationForm : private Searcher {
|
95
|
+
public:
|
96
|
+
NormalizationForm(const unsigned* node_uints, const char* value=NULL)
|
97
|
+
: Searcher(Node::from_uint_array(node_uints), value) {}
|
98
|
+
|
99
|
+
bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
|
100
|
+
|
101
|
+
void decompose(RangeCharStream in, std::string& buffer) const {
|
102
|
+
loop_head:
|
103
|
+
const char* beg = in.cur();
|
104
|
+
|
105
|
+
for(unsigned node_index=0;;) {
|
106
|
+
node_index = nodes[node_index].jump(in.read());
|
107
|
+
if(nodes[node_index].check_char()==in.prev()) {
|
108
|
+
unsigned terminal_index = nodes[node_index].jump('\0');
|
109
|
+
if(nodes[terminal_index].check_char()=='\0') {
|
110
|
+
buffer.append(value+nodes[terminal_index].value());
|
111
|
+
beg = in.cur();
|
112
|
+
break;
|
113
|
+
}
|
114
|
+
} else {
|
115
|
+
Util::eat_until_utf8_char_start_point(in);
|
116
|
+
buffer.append(beg, in.cur());
|
117
|
+
break;
|
118
|
+
}
|
119
|
+
}
|
120
|
+
|
121
|
+
if(in.eos()==false)
|
122
|
+
goto loop_head;
|
123
|
+
}
|
124
|
+
|
125
|
+
void compose(CharStreamForComposition& in, std::string& buf) const {
|
126
|
+
in.init_skipinfo();
|
127
|
+
|
128
|
+
const char* const beg = in.cur();
|
129
|
+
const char* current_char_head = in.cur();
|
130
|
+
const char* composed_char = NULL;
|
131
|
+
|
132
|
+
unsigned node_index = 0;
|
133
|
+
unsigned retry_root_node = 0;
|
134
|
+
unsigned char retry_root_class = 0;
|
135
|
+
|
136
|
+
for(bool first=true;;) {
|
137
|
+
if(Util::is_utf8_char_start_byte(in.peek())) {
|
138
|
+
if(node_index != 0)
|
139
|
+
first=false;
|
140
|
+
current_char_head = in.cur();
|
141
|
+
|
142
|
+
retry_root_node = node_index;
|
143
|
+
retry_root_class = in.get_canonical_class();
|
144
|
+
}
|
145
|
+
|
146
|
+
retry:
|
147
|
+
unsigned next_index = nodes[node_index].jump(in.read());
|
148
|
+
if(nodes[next_index].check_char()==in.prev()) {
|
149
|
+
// succeeded
|
150
|
+
node_index = next_index;
|
151
|
+
unsigned terminal_index = nodes[node_index].jump('\0');
|
152
|
+
if(nodes[terminal_index].check_char()=='\0') {
|
153
|
+
composed_char = value+nodes[terminal_index].value();
|
154
|
+
in.mark_as_last_valid_point();
|
155
|
+
if(in.eos() || retry_root_class > in.get_canonical_class())
|
156
|
+
break;
|
157
|
+
}
|
158
|
+
} else if (first==true) {
|
159
|
+
// no retry if current point is a part of first starter
|
160
|
+
break;
|
161
|
+
} else if (in.next_combining_char(retry_root_class, current_char_head)==true) {
|
162
|
+
// back previous code-point and retry
|
163
|
+
node_index = retry_root_node;
|
164
|
+
current_char_head = in.cur();
|
165
|
+
goto retry;
|
166
|
+
} else {
|
167
|
+
break;
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
if(composed_char) {
|
172
|
+
// append composed unicode-character and skipped combining-characters
|
173
|
+
buf.append(composed_char);
|
174
|
+
in.append_skipped_chars_to_str(buf);
|
175
|
+
in.reset_at_marked_point();
|
176
|
+
} else {
|
177
|
+
// append one unicode-character
|
178
|
+
in.setCur(Util::nearest_utf8_char_start_point(beg+1));
|
179
|
+
in.append_read_char_to_str(buf, beg);
|
180
|
+
}
|
181
|
+
}
|
182
|
+
};
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|
186
|
+
#endif
|
data/unf/util.hh
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef UNF_UTIL_HH
|
2
|
+
#define UNF_UTIL_HH
|
3
|
+
|
4
|
+
namespace UNF {
|
5
|
+
namespace Util {
  // True for a byte that begins a UTF-8 sequence: an ASCII byte (high bit
  // clear) or a lead byte (top two bits set). Only bytes of the form
  // 10xxxxxx are rejected.
  inline bool is_utf8_char_start_byte(char byte) {
    return (static_cast<unsigned char>(byte) & 0xC0) != 0x80;
  }

  // First position at or after `s` that holds a start byte. The terminating
  // NUL counts as a start byte, so the scan stops there at the latest.
  inline const char* nearest_utf8_char_start_point(const char* s) {
    while(!is_utf8_char_start_byte(*s))
      ++s;
    return s;
  }

  // Calls in.read() until in.peek() reports a start byte.
  // CharStream needs peek() and read().
  template <class CharStream>
  inline void eat_until_utf8_char_start_point(CharStream& in) {
    while(!is_utf8_char_start_byte(in.peek()))
      in.read();
  }
}
|
22
|
+
}
|
23
|
+
|
24
|
+
#endif
|
data/unf_ext.gemspec
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
  # Identity
  s.name = 'unf_ext'
  s.version = '0.0.3'

  s.required_rubygems_version = Gem::Requirement.new('>= 0') if s.respond_to? :required_rubygems_version=
  s.authors = ['Takeru Ohta', 'Akinori MUSHA']
  s.date = '2011-10-24'
  s.description = 'Unicode Normalization Form support library for CRuby'
  s.email = 'knu@idaemons.org'
  s.extensions = ['extconf.rb']
  s.extra_rdoc_files = %w[LICENSE.txt README.md]

  # Packaged files
  s.files = %w[
    .document
    Gemfile
    LICENSE.txt
    README.md
    Rakefile
    VERSION
    extconf.rb
    test/helper.rb
    test/normalization-test.txt
    test/test_unf_ext.rb
    unf.cc
    unf/normalizer.hh
    unf/table.hh
    unf/trie/char_stream.hh
    unf/trie/node.hh
    unf/trie/searcher.hh
    unf/util.hh
    unf_ext.gemspec
  ]
  s.homepage = 'http://github.com/knu/ruby-unf_ext'
  s.licenses = ['MIT']
  s.require_paths = ['lib']
  s.rubygems_version = '1.8.5'
  s.summary = 'Unicode Normalization Form support library for CRuby'

  # Dependencies. They are registered as development dependencies only when
  # the specification supports specification_version and RubyGems is at
  # least 1.2.0; otherwise they are registered as plain dependencies.
  requirements = [
    ['shoulda', ['>= 0']],
    ['bundler', ['~> 1.0.0']],
    ['jeweler', ['~> 1.6.4']],
    ['rcov',    ['>= 0']]
  ]

  as_development = false
  if s.respond_to? :specification_version
    s.specification_version = 3
    as_development = Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  end

  requirements.each do |gem_name, constraint|
    if as_development
      s.add_development_dependency(gem_name, constraint)
    else
      s.add_dependency(gem_name, constraint)
    end
  end
end
|
67
|
+
|