unf_ext 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/README.md +1 -1
- data/Rakefile +12 -1
- data/VERSION +1 -1
- data/ext/unf_ext/extconf.rb +25 -0
- data/{unf → ext/unf_ext/unf}/normalizer.hh +6 -6
- data/ext/unf_ext/unf/table.hh +13542 -0
- data/{unf → ext/unf_ext/unf}/trie/char_stream.hh +0 -0
- data/{unf → ext/unf_ext/unf}/trie/node.hh +0 -8
- data/{unf → ext/unf_ext/unf}/trie/searcher.hh +29 -21
- data/{unf → ext/unf_ext/unf}/util.hh +0 -0
- data/{unf.cc → ext/unf_ext/unf.cc} +2 -2
- data/lib/unf_ext.rb +5 -0
- data/unf_ext.gemspec +15 -11
- metadata +30 -13
- data/extconf.rb +0 -4
- data/unf/table.hh +0 -19004
File without changes
|
@@ -5,14 +5,6 @@ namespace UNF {
|
|
5
5
|
namespace Trie {
|
6
6
|
class Node {
|
7
7
|
public:
|
8
|
-
Node() : data(0xFFFFFFFF) {}
|
9
|
-
|
10
|
-
void set_base_index(unsigned base_index) { data = (data&0xFF000000)+(base_index&0x00FFFFFF); }
|
11
|
-
void set_value(unsigned value) { set_base_index(value); }
|
12
|
-
void set_check_char(unsigned char ch) { data = (ch << 24) + base(); }
|
13
|
-
|
14
|
-
bool is_unused() const { return data==0xFFFFFFFF; }
|
15
|
-
|
16
8
|
unsigned jump(unsigned char ch) const { return base() + ch; }
|
17
9
|
unsigned value() const { return base(); }
|
18
10
|
unsigned check_char() const { return data>>24; }
|
@@ -9,17 +9,18 @@ namespace UNF {
|
|
9
9
|
namespace Trie {
|
10
10
|
class Searcher {
|
11
11
|
public:
|
12
|
-
Searcher(const Node* nodes, const char* value=NULL)
|
13
|
-
: nodes(nodes), value(value) {}
|
12
|
+
Searcher(const Node* nodes, unsigned root, const char* value=NULL)
|
13
|
+
: nodes(nodes), root(root), value(value) {}
|
14
14
|
|
15
15
|
unsigned find_value(const char* key, int default_value) const {
|
16
|
-
unsigned node_index=
|
16
|
+
unsigned node_index=root;
|
17
17
|
for(CharStream in(key);; in.read()) {
|
18
18
|
node_index = nodes[node_index].jump(in.peek());
|
19
19
|
if(nodes[node_index].check_char()==in.peek()) {
|
20
|
-
unsigned terminal_index = nodes[node_index].jump('\0');
|
21
|
-
if(nodes[terminal_index].check_char()=='\0')
|
20
|
+
unsigned terminal_index = nodes[node_index].jump('\0');
|
21
|
+
if(nodes[terminal_index].check_char()=='\0') {
|
22
22
|
return nodes[terminal_index].value();
|
23
|
+
}
|
23
24
|
} else
|
24
25
|
return default_value;
|
25
26
|
}
|
@@ -27,13 +28,14 @@ namespace UNF {
|
|
27
28
|
|
28
29
|
protected:
|
29
30
|
const Node* nodes;
|
31
|
+
const unsigned root;
|
30
32
|
const char* value;
|
31
33
|
};
|
32
34
|
|
33
35
|
class CanonicalCombiningClass : private Searcher {
|
34
36
|
public:
|
35
|
-
CanonicalCombiningClass(const unsigned* node_uints)
|
36
|
-
: Searcher(Node::from_uint_array(node_uints)) {}
|
37
|
+
CanonicalCombiningClass(const unsigned* node_uints, unsigned root)
|
38
|
+
: Searcher(Node::from_uint_array(node_uints), root) {}
|
37
39
|
|
38
40
|
unsigned get_class(const char* str) const { return find_value(str,0); }
|
39
41
|
|
@@ -46,7 +48,7 @@ namespace UNF {
|
|
46
48
|
loop_head:
|
47
49
|
unsigned beg = in.cur()-str;
|
48
50
|
|
49
|
-
for(unsigned node_index=
|
51
|
+
for(unsigned node_index=root;;){
|
50
52
|
node_index = nodes[node_index].jump(in.read());
|
51
53
|
|
52
54
|
if(nodes[node_index].check_char()==in.prev()) {
|
@@ -93,8 +95,8 @@ namespace UNF {
|
|
93
95
|
|
94
96
|
class NormalizationForm : private Searcher {
|
95
97
|
public:
|
96
|
-
NormalizationForm(const unsigned* node_uints, const char* value=NULL)
|
97
|
-
: Searcher(Node::from_uint_array(node_uints), value) {}
|
98
|
+
NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL)
|
99
|
+
: Searcher(Node::from_uint_array(node_uints), root, value) {}
|
98
100
|
|
99
101
|
bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
|
100
102
|
|
@@ -102,12 +104,12 @@ namespace UNF {
|
|
102
104
|
loop_head:
|
103
105
|
const char* beg = in.cur();
|
104
106
|
|
105
|
-
for(unsigned node_index=
|
107
|
+
for(unsigned node_index=root;;) {
|
106
108
|
node_index = nodes[node_index].jump(in.read());
|
107
109
|
if(nodes[node_index].check_char()==in.prev()) {
|
108
110
|
unsigned terminal_index = nodes[node_index].jump('\0');
|
109
111
|
if(nodes[terminal_index].check_char()=='\0') {
|
110
|
-
|
112
|
+
word_append(buffer, value, nodes[terminal_index].value());
|
111
113
|
beg = in.cur();
|
112
114
|
break;
|
113
115
|
}
|
@@ -127,15 +129,15 @@ namespace UNF {
|
|
127
129
|
|
128
130
|
const char* const beg = in.cur();
|
129
131
|
const char* current_char_head = in.cur();
|
130
|
-
|
132
|
+
unsigned composed_char_info = 0;
|
131
133
|
|
132
|
-
unsigned node_index =
|
133
|
-
unsigned retry_root_node =
|
134
|
+
unsigned node_index = root;
|
135
|
+
unsigned retry_root_node = root;
|
134
136
|
unsigned char retry_root_class = 0;
|
135
137
|
|
136
138
|
for(bool first=true;;) {
|
137
139
|
if(Util::is_utf8_char_start_byte(in.peek())) {
|
138
|
-
if(node_index !=
|
140
|
+
if(node_index != root)
|
139
141
|
first=false;
|
140
142
|
current_char_head = in.cur();
|
141
143
|
|
@@ -144,13 +146,14 @@ namespace UNF {
|
|
144
146
|
}
|
145
147
|
|
146
148
|
retry:
|
147
|
-
unsigned next_index = nodes[node_index].jump(in.
|
148
|
-
if(nodes[next_index].check_char()==in.
|
149
|
+
unsigned next_index = nodes[node_index].jump(in.peek());
|
150
|
+
if(nodes[next_index].check_char()==in.read()) {
|
149
151
|
// succeeded
|
150
152
|
node_index = next_index;
|
151
153
|
unsigned terminal_index = nodes[node_index].jump('\0');
|
152
154
|
if(nodes[terminal_index].check_char()=='\0') {
|
153
|
-
|
155
|
+
composed_char_info = nodes[terminal_index].value();
|
156
|
+
|
154
157
|
in.mark_as_last_valid_point();
|
155
158
|
if(in.eos() || retry_root_class > in.get_canonical_class())
|
156
159
|
break;
|
@@ -168,9 +171,9 @@ namespace UNF {
|
|
168
171
|
}
|
169
172
|
}
|
170
173
|
|
171
|
-
if(
|
174
|
+
if(composed_char_info != 0) {
|
172
175
|
// append composed unicode-character and skipped combining-characters
|
173
|
-
buf
|
176
|
+
word_append(buf, value, composed_char_info);
|
174
177
|
in.append_skipped_chars_to_str(buf);
|
175
178
|
in.reset_at_marked_point();
|
176
179
|
} else {
|
@@ -179,6 +182,11 @@ namespace UNF {
|
|
179
182
|
in.append_read_char_to_str(buf, beg);
|
180
183
|
}
|
181
184
|
}
|
185
|
+
|
186
|
+
private:
|
187
|
+
static void word_append(std::string& buffer, const char* base, unsigned pos_info) {
|
188
|
+
buffer.append(base+(pos_info&0x3FFFF), pos_info>>18);
|
189
|
+
}
|
182
190
|
};
|
183
191
|
}
|
184
192
|
}
|
File without changes
|
data/lib/unf_ext.rb
ADDED
data/unf_ext.gemspec
CHANGED
@@ -5,14 +5,14 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{unf_ext}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = [%q{Takeru Ohta}, %q{Akinori MUSHA}]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-12-08}
|
13
13
|
s.description = %q{Unicode Normalization Form support library for CRuby}
|
14
14
|
s.email = %q{knu@idaemons.org}
|
15
|
-
s.extensions = [%q{extconf.rb}]
|
15
|
+
s.extensions = [%q{ext/unf_ext/extconf.rb}]
|
16
16
|
s.extra_rdoc_files = [
|
17
17
|
"LICENSE.txt",
|
18
18
|
"README.md"
|
@@ -24,17 +24,18 @@ Gem::Specification.new do |s|
|
|
24
24
|
"README.md",
|
25
25
|
"Rakefile",
|
26
26
|
"VERSION",
|
27
|
-
"extconf.rb",
|
27
|
+
"ext/unf_ext/extconf.rb",
|
28
|
+
"ext/unf_ext/unf.cc",
|
29
|
+
"ext/unf_ext/unf/normalizer.hh",
|
30
|
+
"ext/unf_ext/unf/table.hh",
|
31
|
+
"ext/unf_ext/unf/trie/char_stream.hh",
|
32
|
+
"ext/unf_ext/unf/trie/node.hh",
|
33
|
+
"ext/unf_ext/unf/trie/searcher.hh",
|
34
|
+
"ext/unf_ext/unf/util.hh",
|
35
|
+
"lib/unf_ext.rb",
|
28
36
|
"test/helper.rb",
|
29
37
|
"test/normalization-test.txt",
|
30
38
|
"test/test_unf_ext.rb",
|
31
|
-
"unf.cc",
|
32
|
-
"unf/normalizer.hh",
|
33
|
-
"unf/table.hh",
|
34
|
-
"unf/trie/char_stream.hh",
|
35
|
-
"unf/trie/node.hh",
|
36
|
-
"unf/trie/searcher.hh",
|
37
|
-
"unf/util.hh",
|
38
39
|
"unf_ext.gemspec"
|
39
40
|
]
|
40
41
|
s.homepage = %q{http://github.com/knu/ruby-unf_ext}
|
@@ -51,17 +52,20 @@ Gem::Specification.new do |s|
|
|
51
52
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
52
53
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
53
54
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
+
s.add_development_dependency(%q<rake-compiler>, [">= 0.7.9"])
|
54
56
|
else
|
55
57
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
56
58
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
57
59
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
58
60
|
s.add_dependency(%q<rcov>, [">= 0"])
|
61
|
+
s.add_dependency(%q<rake-compiler>, [">= 0.7.9"])
|
59
62
|
end
|
60
63
|
else
|
61
64
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
62
65
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
63
66
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
64
67
|
s.add_dependency(%q<rcov>, [">= 0"])
|
68
|
+
s.add_dependency(%q<rake-compiler>, [">= 0.7.9"])
|
65
69
|
end
|
66
70
|
end
|
67
71
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unf_ext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 3
|
10
|
-
version: 0.0.3
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Takeru Ohta
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2011-
|
19
|
+
date: 2011-12-08 00:00:00 Z
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
name: shoulda
|
@@ -78,12 +78,28 @@ dependencies:
|
|
78
78
|
type: :development
|
79
79
|
requirement: *id004
|
80
80
|
prerelease: false
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: rake-compiler
|
83
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
hash: 17
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
- 7
|
92
|
+
- 9
|
93
|
+
version: 0.7.9
|
94
|
+
type: :development
|
95
|
+
requirement: *id005
|
96
|
+
prerelease: false
|
81
97
|
description: Unicode Normalization Form support library for CRuby
|
82
98
|
email: knu@idaemons.org
|
83
99
|
executables: []
|
84
100
|
|
85
101
|
extensions:
|
86
|
-
- extconf.rb
|
102
|
+
- ext/unf_ext/extconf.rb
|
87
103
|
extra_rdoc_files:
|
88
104
|
- LICENSE.txt
|
89
105
|
- README.md
|
@@ -94,17 +110,18 @@ files:
|
|
94
110
|
- README.md
|
95
111
|
- Rakefile
|
96
112
|
- VERSION
|
97
|
-
- extconf.rb
|
113
|
+
- ext/unf_ext/extconf.rb
|
114
|
+
- ext/unf_ext/unf.cc
|
115
|
+
- ext/unf_ext/unf/normalizer.hh
|
116
|
+
- ext/unf_ext/unf/table.hh
|
117
|
+
- ext/unf_ext/unf/trie/char_stream.hh
|
118
|
+
- ext/unf_ext/unf/trie/node.hh
|
119
|
+
- ext/unf_ext/unf/trie/searcher.hh
|
120
|
+
- ext/unf_ext/unf/util.hh
|
121
|
+
- lib/unf_ext.rb
|
98
122
|
- test/helper.rb
|
99
123
|
- test/normalization-test.txt
|
100
124
|
- test/test_unf_ext.rb
|
101
|
-
- unf.cc
|
102
|
-
- unf/normalizer.hh
|
103
|
-
- unf/table.hh
|
104
|
-
- unf/trie/char_stream.hh
|
105
|
-
- unf/trie/node.hh
|
106
|
-
- unf/trie/searcher.hh
|
107
|
-
- unf/util.hh
|
108
125
|
- unf_ext.gemspec
|
109
126
|
homepage: http://github.com/knu/ruby-unf_ext
|
110
127
|
licenses:
|
data/extconf.rb
DELETED