unf_ext 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/README.md +1 -1
- data/Rakefile +12 -1
- data/VERSION +1 -1
- data/ext/unf_ext/extconf.rb +25 -0
- data/{unf → ext/unf_ext/unf}/normalizer.hh +6 -6
- data/ext/unf_ext/unf/table.hh +13542 -0
- data/{unf → ext/unf_ext/unf}/trie/char_stream.hh +0 -0
- data/{unf → ext/unf_ext/unf}/trie/node.hh +0 -8
- data/{unf → ext/unf_ext/unf}/trie/searcher.hh +29 -21
- data/{unf → ext/unf_ext/unf}/util.hh +0 -0
- data/{unf.cc → ext/unf_ext/unf.cc} +2 -2
- data/lib/unf_ext.rb +5 -0
- data/unf_ext.gemspec +15 -11
- metadata +30 -13
- data/extconf.rb +0 -4
- data/unf/table.hh +0 -19004
File without changes
|
@@ -5,14 +5,6 @@ namespace UNF {
|
|
5
5
|
namespace Trie {
|
6
6
|
class Node {
|
7
7
|
public:
|
8
|
-
Node() : data(0xFFFFFFFF) {}
|
9
|
-
|
10
|
-
void set_base_index(unsigned base_index) { data = (data&0xFF000000)+(base_index&0x00FFFFFF); }
|
11
|
-
void set_value(unsigned value) { set_base_index(value); }
|
12
|
-
void set_check_char(unsigned char ch) { data = (ch << 24) + base(); }
|
13
|
-
|
14
|
-
bool is_unused() const { return data==0xFFFFFFFF; }
|
15
|
-
|
16
8
|
unsigned jump(unsigned char ch) const { return base() + ch; }
|
17
9
|
unsigned value() const { return base(); }
|
18
10
|
unsigned check_char() const { return data>>24; }
|
@@ -9,17 +9,18 @@ namespace UNF {
|
|
9
9
|
namespace Trie {
|
10
10
|
class Searcher {
|
11
11
|
public:
|
12
|
-
Searcher(const Node* nodes, const char* value=NULL)
|
13
|
-
: nodes(nodes), value(value) {}
|
12
|
+
Searcher(const Node* nodes, unsigned root, const char* value=NULL)
|
13
|
+
: nodes(nodes), root(root), value(value) {}
|
14
14
|
|
15
15
|
unsigned find_value(const char* key, int default_value) const {
|
16
|
-
unsigned node_index=
|
16
|
+
unsigned node_index=root;
|
17
17
|
for(CharStream in(key);; in.read()) {
|
18
18
|
node_index = nodes[node_index].jump(in.peek());
|
19
19
|
if(nodes[node_index].check_char()==in.peek()) {
|
20
|
-
unsigned terminal_index = nodes[node_index].jump('\0');
|
21
|
-
if(nodes[terminal_index].check_char()=='\0')
|
20
|
+
unsigned terminal_index = nodes[node_index].jump('\0');
|
21
|
+
if(nodes[terminal_index].check_char()=='\0') {
|
22
22
|
return nodes[terminal_index].value();
|
23
|
+
}
|
23
24
|
} else
|
24
25
|
return default_value;
|
25
26
|
}
|
@@ -27,13 +28,14 @@ namespace UNF {
|
|
27
28
|
|
28
29
|
protected:
|
29
30
|
const Node* nodes;
|
31
|
+
const unsigned root;
|
30
32
|
const char* value;
|
31
33
|
};
|
32
34
|
|
33
35
|
class CanonicalCombiningClass : private Searcher {
|
34
36
|
public:
|
35
|
-
CanonicalCombiningClass(const unsigned* node_uints)
|
36
|
-
: Searcher(Node::from_uint_array(node_uints)) {}
|
37
|
+
CanonicalCombiningClass(const unsigned* node_uints, unsigned root)
|
38
|
+
: Searcher(Node::from_uint_array(node_uints), root) {}
|
37
39
|
|
38
40
|
unsigned get_class(const char* str) const { return find_value(str,0); }
|
39
41
|
|
@@ -46,7 +48,7 @@ namespace UNF {
|
|
46
48
|
loop_head:
|
47
49
|
unsigned beg = in.cur()-str;
|
48
50
|
|
49
|
-
for(unsigned node_index=
|
51
|
+
for(unsigned node_index=root;;){
|
50
52
|
node_index = nodes[node_index].jump(in.read());
|
51
53
|
|
52
54
|
if(nodes[node_index].check_char()==in.prev()) {
|
@@ -93,8 +95,8 @@ namespace UNF {
|
|
93
95
|
|
94
96
|
class NormalizationForm : private Searcher {
|
95
97
|
public:
|
96
|
-
NormalizationForm(const unsigned* node_uints, const char* value=NULL)
|
97
|
-
: Searcher(Node::from_uint_array(node_uints), value) {}
|
98
|
+
NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL)
|
99
|
+
: Searcher(Node::from_uint_array(node_uints), root, value) {}
|
98
100
|
|
99
101
|
bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
|
100
102
|
|
@@ -102,12 +104,12 @@ namespace UNF {
|
|
102
104
|
loop_head:
|
103
105
|
const char* beg = in.cur();
|
104
106
|
|
105
|
-
for(unsigned node_index=
|
107
|
+
for(unsigned node_index=root;;) {
|
106
108
|
node_index = nodes[node_index].jump(in.read());
|
107
109
|
if(nodes[node_index].check_char()==in.prev()) {
|
108
110
|
unsigned terminal_index = nodes[node_index].jump('\0');
|
109
111
|
if(nodes[terminal_index].check_char()=='\0') {
|
110
|
-
|
112
|
+
word_append(buffer, value, nodes[terminal_index].value());
|
111
113
|
beg = in.cur();
|
112
114
|
break;
|
113
115
|
}
|
@@ -127,15 +129,15 @@ namespace UNF {
|
|
127
129
|
|
128
130
|
const char* const beg = in.cur();
|
129
131
|
const char* current_char_head = in.cur();
|
130
|
-
|
132
|
+
unsigned composed_char_info = 0;
|
131
133
|
|
132
|
-
unsigned node_index =
|
133
|
-
unsigned retry_root_node =
|
134
|
+
unsigned node_index = root;
|
135
|
+
unsigned retry_root_node = root;
|
134
136
|
unsigned char retry_root_class = 0;
|
135
137
|
|
136
138
|
for(bool first=true;;) {
|
137
139
|
if(Util::is_utf8_char_start_byte(in.peek())) {
|
138
|
-
if(node_index !=
|
140
|
+
if(node_index != root)
|
139
141
|
first=false;
|
140
142
|
current_char_head = in.cur();
|
141
143
|
|
@@ -144,13 +146,14 @@ namespace UNF {
|
|
144
146
|
}
|
145
147
|
|
146
148
|
retry:
|
147
|
-
unsigned next_index = nodes[node_index].jump(in.
|
148
|
-
if(nodes[next_index].check_char()==in.
|
149
|
+
unsigned next_index = nodes[node_index].jump(in.peek());
|
150
|
+
if(nodes[next_index].check_char()==in.read()) {
|
149
151
|
// succeeded
|
150
152
|
node_index = next_index;
|
151
153
|
unsigned terminal_index = nodes[node_index].jump('\0');
|
152
154
|
if(nodes[terminal_index].check_char()=='\0') {
|
153
|
-
|
155
|
+
composed_char_info = nodes[terminal_index].value();
|
156
|
+
|
154
157
|
in.mark_as_last_valid_point();
|
155
158
|
if(in.eos() || retry_root_class > in.get_canonical_class())
|
156
159
|
break;
|
@@ -168,9 +171,9 @@ namespace UNF {
|
|
168
171
|
}
|
169
172
|
}
|
170
173
|
|
171
|
-
if(
|
174
|
+
if(composed_char_info != 0) {
|
172
175
|
// append composed unicode-character and skipped combining-characters
|
173
|
-
buf
|
176
|
+
word_append(buf, value, composed_char_info);
|
174
177
|
in.append_skipped_chars_to_str(buf);
|
175
178
|
in.reset_at_marked_point();
|
176
179
|
} else {
|
@@ -179,6 +182,11 @@ namespace UNF {
|
|
179
182
|
in.append_read_char_to_str(buf, beg);
|
180
183
|
}
|
181
184
|
}
|
185
|
+
|
186
|
+
private:
|
187
|
+
static void word_append(std::string& buffer, const char* base, unsigned pos_info) {
|
188
|
+
buffer.append(base+(pos_info&0x3FFFF), pos_info>>18);
|
189
|
+
}
|
182
190
|
};
|
183
191
|
}
|
184
192
|
}
|
File without changes
|
data/lib/unf_ext.rb
ADDED
data/unf_ext.gemspec
CHANGED
@@ -5,14 +5,14 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{unf_ext}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = [%q{Takeru Ohta}, %q{Akinori MUSHA}]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-12-08}
|
13
13
|
s.description = %q{Unicode Normalization Form support library for CRuby}
|
14
14
|
s.email = %q{knu@idaemons.org}
|
15
|
-
s.extensions = [%q{extconf.rb}]
|
15
|
+
s.extensions = [%q{ext/unf_ext/extconf.rb}]
|
16
16
|
s.extra_rdoc_files = [
|
17
17
|
"LICENSE.txt",
|
18
18
|
"README.md"
|
@@ -24,17 +24,18 @@ Gem::Specification.new do |s|
|
|
24
24
|
"README.md",
|
25
25
|
"Rakefile",
|
26
26
|
"VERSION",
|
27
|
-
"extconf.rb",
|
27
|
+
"ext/unf_ext/extconf.rb",
|
28
|
+
"ext/unf_ext/unf.cc",
|
29
|
+
"ext/unf_ext/unf/normalizer.hh",
|
30
|
+
"ext/unf_ext/unf/table.hh",
|
31
|
+
"ext/unf_ext/unf/trie/char_stream.hh",
|
32
|
+
"ext/unf_ext/unf/trie/node.hh",
|
33
|
+
"ext/unf_ext/unf/trie/searcher.hh",
|
34
|
+
"ext/unf_ext/unf/util.hh",
|
35
|
+
"lib/unf_ext.rb",
|
28
36
|
"test/helper.rb",
|
29
37
|
"test/normalization-test.txt",
|
30
38
|
"test/test_unf_ext.rb",
|
31
|
-
"unf.cc",
|
32
|
-
"unf/normalizer.hh",
|
33
|
-
"unf/table.hh",
|
34
|
-
"unf/trie/char_stream.hh",
|
35
|
-
"unf/trie/node.hh",
|
36
|
-
"unf/trie/searcher.hh",
|
37
|
-
"unf/util.hh",
|
38
39
|
"unf_ext.gemspec"
|
39
40
|
]
|
40
41
|
s.homepage = %q{http://github.com/knu/ruby-unf_ext}
|
@@ -51,17 +52,20 @@ Gem::Specification.new do |s|
|
|
51
52
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
52
53
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
53
54
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
+
s.add_development_dependency(%q<rake-compiler>, [">= 0.7.9"])
|
54
56
|
else
|
55
57
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
56
58
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
57
59
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
58
60
|
s.add_dependency(%q<rcov>, [">= 0"])
|
61
|
+
s.add_dependency(%q<rake-compiler>, [">= 0.7.9"])
|
59
62
|
end
|
60
63
|
else
|
61
64
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
62
65
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
63
66
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
64
67
|
s.add_dependency(%q<rcov>, [">= 0"])
|
68
|
+
s.add_dependency(%q<rake-compiler>, [">= 0.7.9"])
|
65
69
|
end
|
66
70
|
end
|
67
71
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unf_ext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Takeru Ohta
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2011-
|
19
|
+
date: 2011-12-08 00:00:00 Z
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
name: shoulda
|
@@ -78,12 +78,28 @@ dependencies:
|
|
78
78
|
type: :development
|
79
79
|
requirement: *id004
|
80
80
|
prerelease: false
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: rake-compiler
|
83
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
hash: 17
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
- 7
|
92
|
+
- 9
|
93
|
+
version: 0.7.9
|
94
|
+
type: :development
|
95
|
+
requirement: *id005
|
96
|
+
prerelease: false
|
81
97
|
description: Unicode Normalization Form support library for CRuby
|
82
98
|
email: knu@idaemons.org
|
83
99
|
executables: []
|
84
100
|
|
85
101
|
extensions:
|
86
|
-
- extconf.rb
|
102
|
+
- ext/unf_ext/extconf.rb
|
87
103
|
extra_rdoc_files:
|
88
104
|
- LICENSE.txt
|
89
105
|
- README.md
|
@@ -94,17 +110,18 @@ files:
|
|
94
110
|
- README.md
|
95
111
|
- Rakefile
|
96
112
|
- VERSION
|
97
|
-
- extconf.rb
|
113
|
+
- ext/unf_ext/extconf.rb
|
114
|
+
- ext/unf_ext/unf.cc
|
115
|
+
- ext/unf_ext/unf/normalizer.hh
|
116
|
+
- ext/unf_ext/unf/table.hh
|
117
|
+
- ext/unf_ext/unf/trie/char_stream.hh
|
118
|
+
- ext/unf_ext/unf/trie/node.hh
|
119
|
+
- ext/unf_ext/unf/trie/searcher.hh
|
120
|
+
- ext/unf_ext/unf/util.hh
|
121
|
+
- lib/unf_ext.rb
|
98
122
|
- test/helper.rb
|
99
123
|
- test/normalization-test.txt
|
100
124
|
- test/test_unf_ext.rb
|
101
|
-
- unf.cc
|
102
|
-
- unf/normalizer.hh
|
103
|
-
- unf/table.hh
|
104
|
-
- unf/trie/char_stream.hh
|
105
|
-
- unf/trie/node.hh
|
106
|
-
- unf/trie/searcher.hh
|
107
|
-
- unf/util.hh
|
108
125
|
- unf_ext.gemspec
|
109
126
|
homepage: http://github.com/knu/ruby-unf_ext
|
110
127
|
licenses:
|
data/extconf.rb
DELETED