ruby-sfst 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
@@ -14,19 +14,19 @@ VALUE mSFST = Qnil;
|
|
14
14
|
VALUE mCompactTransducer = Qnil;
|
15
15
|
VALUE mRegularTransducer = Qnil;
|
16
16
|
|
17
|
-
static void compact_transducer_free(CompactTransducer *t)
|
17
|
+
static void compact_transducer_free(SFST::CompactTransducer *t)
|
18
18
|
{
|
19
19
|
if (t)
|
20
20
|
delete t;
|
21
21
|
}
|
22
22
|
|
23
|
-
static void compact_transducer_mark(CompactTransducer *t)
|
23
|
+
static void compact_transducer_mark(SFST::CompactTransducer *t)
|
24
24
|
{
|
25
25
|
}
|
26
26
|
|
27
27
|
static VALUE compact_transducer_alloc(VALUE klass)
|
28
28
|
{
|
29
|
-
CompactTransducer *t = NULL;
|
29
|
+
SFST::CompactTransducer *t = NULL;
|
30
30
|
|
31
31
|
return Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, t);
|
32
32
|
}
|
@@ -34,7 +34,7 @@ static VALUE compact_transducer_alloc(VALUE klass)
|
|
34
34
|
static VALUE compact_transducer_init(VALUE obj, VALUE filename)
|
35
35
|
{
|
36
36
|
FILE *file;
|
37
|
-
CompactTransducer *t;
|
37
|
+
SFST::CompactTransducer *t;
|
38
38
|
|
39
39
|
file = fopen(RSTRING_PTR(filename), "rb");
|
40
40
|
|
@@ -43,7 +43,7 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
|
|
43
43
|
}
|
44
44
|
|
45
45
|
try {
|
46
|
-
t = new CompactTransducer(file);
|
46
|
+
t = new SFST::CompactTransducer(file);
|
47
47
|
fclose(file);
|
48
48
|
}
|
49
49
|
catch (const char *p) {
|
@@ -57,13 +57,13 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
|
|
57
57
|
static VALUE compact_transducer_analyze(VALUE self, VALUE string)
|
58
58
|
{
|
59
59
|
VALUE accepted = Qfalse;
|
60
|
-
CompactTransducer *t;
|
60
|
+
SFST::CompactTransducer *t;
|
61
61
|
|
62
62
|
Check_Type(string, T_STRING);
|
63
63
|
|
64
|
-
Data_Get_Struct(self, CompactTransducer, t);
|
64
|
+
Data_Get_Struct(self, SFST::CompactTransducer, t);
|
65
65
|
|
66
|
-
std::vector<CAnalysis> analyses;
|
66
|
+
std::vector<SFST::CAnalysis> analyses;
|
67
67
|
t->analyze_string(RSTRING_PTR(string), analyses);
|
68
68
|
|
69
69
|
for (size_t k = 0; k < analyses.size(); k++) {
|
@@ -79,19 +79,19 @@ static VALUE compact_transducer_analyze(VALUE self, VALUE string)
|
|
79
79
|
return accepted;
|
80
80
|
}
|
81
81
|
|
82
|
-
static void regular_transducer_free(Transducer *t)
|
82
|
+
static void regular_transducer_free(SFST::Transducer *t)
|
83
83
|
{
|
84
84
|
if (t)
|
85
85
|
delete t;
|
86
86
|
}
|
87
87
|
|
88
|
-
static void regular_transducer_mark(Transducer *t)
|
88
|
+
static void regular_transducer_mark(SFST::Transducer *t)
|
89
89
|
{
|
90
90
|
}
|
91
91
|
|
92
92
|
static VALUE regular_transducer_alloc(VALUE klass)
|
93
93
|
{
|
94
|
-
Transducer *t = NULL;
|
94
|
+
SFST::Transducer *t = NULL;
|
95
95
|
|
96
96
|
return Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, t);
|
97
97
|
}
|
@@ -99,7 +99,7 @@ static VALUE regular_transducer_alloc(VALUE klass)
|
|
99
99
|
static VALUE regular_transducer_init(VALUE obj, VALUE filename)
|
100
100
|
{
|
101
101
|
FILE *file;
|
102
|
-
Transducer *t;
|
102
|
+
SFST::Transducer *t;
|
103
103
|
|
104
104
|
file = fopen(RSTRING_PTR(filename), "rb");
|
105
105
|
|
@@ -108,7 +108,7 @@ static VALUE regular_transducer_init(VALUE obj, VALUE filename)
|
|
108
108
|
}
|
109
109
|
|
110
110
|
try {
|
111
|
-
t = new Transducer(file);
|
111
|
+
t = new SFST::Transducer(file);
|
112
112
|
fclose(file);
|
113
113
|
}
|
114
114
|
catch (const char *p) {
|
@@ -123,7 +123,7 @@ using std::vector;
|
|
123
123
|
|
124
124
|
enum { BOTH, LOWER, UPPER };
|
125
125
|
|
126
|
-
static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
|
126
|
+
static VALUE _alphabet_to_rb_str(SFST::Alphabet *a, SFST::Character c)
|
127
127
|
{
|
128
128
|
const char *s = a->code2symbol(c);
|
129
129
|
|
@@ -151,68 +151,79 @@ static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
|
|
151
151
|
return ary;
|
152
152
|
}
|
153
153
|
|
154
|
-
|
155
|
-
|
156
|
-
{
|
157
|
-
if (node->is_final())
|
158
|
-
rb_yield(a);
|
159
|
-
|
160
|
-
visitations[node]++;
|
161
|
-
|
162
|
-
vector<Arc*> arc;
|
163
|
-
for (ArcsIter p(node->arcs()); p; p++) {
|
164
|
-
Arc *a = p;
|
165
|
-
Node *n = a->target_node();
|
166
|
-
size_t i;
|
167
|
-
for (i = 0; i < arc.size(); i++)
|
168
|
-
if (visitations[n] < visitations[arc[i]->target_node()])
|
169
|
-
break;
|
170
|
-
arc.push_back(NULL);
|
171
|
-
for (size_t k = arc.size() - 1; k > i; k--)
|
172
|
-
arc[k] = arc[k - 1];
|
173
|
-
arc[i] = a;
|
174
|
-
}
|
154
|
+
class Gen {
|
155
|
+
public:
|
175
156
|
|
176
|
-
|
177
|
-
|
178
|
-
|
157
|
+
SFST::Node *node;
|
158
|
+
SFST::Index previous;
|
159
|
+
SFST::Label label;
|
179
160
|
|
180
|
-
|
181
|
-
|
182
|
-
lower = _alphabet_to_rb_str(&(t->alphabet), lc);
|
183
|
-
} else
|
184
|
-
lower = Qnil;
|
161
|
+
Gen(SFST::Node *n, SFST::Label l = SFST::Label::epsilon, SFST::Index p = SFST::undef):
|
162
|
+
node(n), previous(p), label(l) {}
|
185
163
|
|
186
|
-
|
187
|
-
if (
|
188
|
-
|
189
|
-
|
190
|
-
|
164
|
+
void print(vector<Gen> &paths, VALUE a, int levels, bool epsilons, SFST::Transducer *t) {
|
165
|
+
if (previous != SFST::undef) {
|
166
|
+
paths[previous].print(paths, a, levels, epsilons, t);
|
167
|
+
|
168
|
+
SFST::Label l = label;
|
169
|
+
|
170
|
+
VALUE lower, upper;
|
171
|
+
|
172
|
+
SFST::Character lc = l.lower_char();
|
173
|
+
if ((levels == BOTH || levels == LOWER) && (epsilons || lc != SFST::Label::epsilon)) {
|
174
|
+
lower = _alphabet_to_rb_str(&(t->alphabet), lc);
|
175
|
+
} else
|
176
|
+
lower = Qnil;
|
177
|
+
|
178
|
+
SFST::Character uc = l.upper_char();
|
179
|
+
if ((levels == BOTH || levels == UPPER) && (epsilons || uc != SFST::Label::epsilon)) {
|
180
|
+
upper = _alphabet_to_rb_str(&(t->alphabet), uc);
|
181
|
+
} else
|
182
|
+
upper = Qnil;
|
191
183
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
184
|
+
switch (levels) {
|
185
|
+
case BOTH:
|
186
|
+
rb_ary_push_pair(a, lower, upper);
|
187
|
+
break;
|
196
188
|
|
197
|
-
|
198
|
-
|
199
|
-
|
189
|
+
case UPPER:
|
190
|
+
rb_ary_push(a, upper);
|
191
|
+
break;
|
200
192
|
|
201
|
-
|
202
|
-
|
203
|
-
|
193
|
+
case LOWER:
|
194
|
+
rb_ary_push(a, lower);
|
195
|
+
break;
|
196
|
+
}
|
204
197
|
}
|
198
|
+
}
|
199
|
+
};
|
200
|
+
|
201
|
+
/*
 * Enumerates paths through transducer `t` and yields one Ruby array per
 * path that ends in a final node.
 *
 * Steps are processed in the order they were appended (first in, first
 * out): every arc leaving the current node appends a new step that points
 * back at the current one. The loop ends only when no unprocessed step is
 * left.
 *
 * t        - transducer to walk, starting at its root node
 * levels   - BOTH, LOWER or UPPER; forwarded to Gen::print
 * epsilons - forwarded to Gen::print
 */
static void _generate(SFST::Transducer *t, int levels, bool epsilons)
{
  vector<Gen> pending;
  pending.push_back(Gen(t->root_node()));

  size_t current = 0;
  while (current < pending.size()) {
    // Copy the node pointer out: push_back below may reallocate `pending`.
    SFST::Node *state = pending[current].node;

    if (state->is_final()) {
      VALUE path = rb_ary_new();
      pending[current].print(pending, path, levels, epsilons, t);
      rb_yield(path);
    }

    for (SFST::ArcsIter it(state->arcs()); it; it++) {
      SFST::Arc *arc = it;
      pending.push_back(Gen(arc->target_node(), arc->label(), (SFST::Index)current));
    }

    current++;
  }
}
|
211
222
|
|
212
223
|
static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
|
213
224
|
{
|
214
|
-
Transducer *t;
|
215
|
-
Data_Get_Struct(self, Transducer, t);
|
225
|
+
SFST::Transducer *t;
|
226
|
+
Data_Get_Struct(self, SFST::Transducer, t);
|
216
227
|
|
217
228
|
static ID id_upper = rb_intern("upper");
|
218
229
|
static ID id_lower = rb_intern("lower");
|
@@ -246,8 +257,8 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
|
|
246
257
|
if (!rb_block_given_p())
|
247
258
|
rb_raise(rb_eRuntimeError, "block expected");
|
248
259
|
|
249
|
-
|
250
|
-
|
260
|
+
SFST::Transducer *a2;
|
261
|
+
|
251
262
|
switch (levels) {
|
252
263
|
case UPPER:
|
253
264
|
a2 = &(t->upper_level().minimise());
|
@@ -259,13 +270,13 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
|
|
259
270
|
a2 = t;
|
260
271
|
break;
|
261
272
|
}
|
262
|
-
|
263
|
-
|
273
|
+
|
274
|
+
_generate(a2, levels, epsilons);
|
264
275
|
|
265
276
|
return Qnil;
|
266
277
|
}
|
267
278
|
|
268
|
-
static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
|
279
|
+
static bool _regular_transducer_yield(SFST::Transducer *t, SFST::Node *node, VALUE result_array)
|
269
280
|
{
|
270
281
|
int accepted = 0;
|
271
282
|
|
@@ -284,9 +295,9 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
|
|
284
295
|
accepted = 1;
|
285
296
|
}
|
286
297
|
|
287
|
-
for (ArcsIter i(node->arcs()); i; i++) {
|
288
|
-
Arc *arc = i;
|
289
|
-
Label l = arc->label();
|
298
|
+
for (SFST::ArcsIter i(node->arcs()); i; i++) {
|
299
|
+
SFST::Arc *arc = i;
|
300
|
+
SFST::Label l = arc->label();
|
290
301
|
|
291
302
|
rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));
|
292
303
|
|
@@ -300,10 +311,10 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
|
|
300
311
|
return accepted == 1 ? true : false;
|
301
312
|
}
|
302
313
|
|
303
|
-
static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
|
314
|
+
static VALUE _regular_transducer_analyze_or_generate(SFST::Transducer *t, VALUE string, bool generate)
|
304
315
|
{
|
305
|
-
Transducer *a2, *a3;
|
306
|
-
Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
|
316
|
+
SFST::Transducer *a2, *a3;
|
317
|
+
SFST::Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
|
307
318
|
if (generate) {
|
308
319
|
a2 = &(a1 || *t);
|
309
320
|
a3 = &(a2->upper_level());
|
@@ -324,23 +335,23 @@ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string
|
|
324
335
|
|
325
336
|
static VALUE regular_transducer_generate(VALUE self, VALUE string)
|
326
337
|
{
|
327
|
-
Transducer *t;
|
338
|
+
SFST::Transducer *t;
|
328
339
|
Check_Type(string, T_STRING);
|
329
|
-
Data_Get_Struct(self, Transducer, t);
|
340
|
+
Data_Get_Struct(self, SFST::Transducer, t);
|
330
341
|
return _regular_transducer_analyze_or_generate(t, string, true);
|
331
342
|
}
|
332
343
|
|
333
344
|
static VALUE regular_transducer_analyze(VALUE self, VALUE string)
|
334
345
|
{
|
335
|
-
Transducer *t;
|
346
|
+
SFST::Transducer *t;
|
336
347
|
Check_Type(string, T_STRING);
|
337
|
-
Data_Get_Struct(self, Transducer, t);
|
348
|
+
Data_Get_Struct(self, SFST::Transducer, t);
|
338
349
|
return _regular_transducer_analyze_or_generate(t, string, false);
|
339
350
|
}
|
340
351
|
|
341
352
|
extern "C"
|
342
353
|
|
343
|
-
void
|
354
|
+
void Init_sfst(void)
|
344
355
|
{
|
345
356
|
mSFST = rb_define_module("SFST");
|
346
357
|
|
data/ext/sfst/sgi.h
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* File: sgi.h */
|
4
|
+
/* Author: Helmut Schmid */
|
5
|
+
/* Purpose: */
|
6
|
+
/* Created: Thu Sep 11 15:58:25 2008 */
|
7
|
+
/* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
|
8
|
+
/* Modified: Wed May 26 12:54:00 2010 (hfst) */
|
9
|
+
/*******************************************************************/
|
10
|
+
|
11
|
+
#ifndef _SGI_INCLUDED
|
12
|
+
#define _SGI_INCLUDED
|
13
|
+
|
14
|
+
#if HAVE_CONFIG_H
|
15
|
+
# include <config.h>
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#if HAVE_BACKWARD_HASH_MAP
|
19
|
+
# include <backward/hash_map>
|
20
|
+
#elif HAVE_EXT_HASH_MAP
|
21
|
+
# include <ext/hash_map>
|
22
|
+
#elif HAVE_HASH_MAP
|
23
|
+
# include <hash_map>
|
24
|
+
#elif SGIext
|
25
|
+
# include <ext/hash_map>
|
26
|
+
#elif SGI__gnu_cxx
|
27
|
+
# include <ext/hash_map>
|
28
|
+
#else
|
29
|
+
# warning "unknown hash_map"
|
30
|
+
# include <hash_map>
|
31
|
+
#endif
|
32
|
+
#if HAVE_BACKWARD_HASH_SET
|
33
|
+
# include <backward/hash_set>
|
34
|
+
#elif HAVE_EXT_HASH_SET
|
35
|
+
# include <ext/hash_set>
|
36
|
+
#elif HAVE_HASH_SET
|
37
|
+
# include <hash_set>
|
38
|
+
#elif SGIext
|
39
|
+
# include <ext/hash_set>
|
40
|
+
#elif SGI__gnu_cxx
|
41
|
+
# include <ext/hash_set>
|
42
|
+
#else
|
43
|
+
# warning "missing hash_set"
|
44
|
+
# include <hash_set>
|
45
|
+
#endif
|
46
|
+
|
47
|
+
// Hfst addition
|
48
|
+
namespace SFST
|
49
|
+
{
|
50
|
+
// from <http://gcc.gnu.org/onlinedocs/libstdc++/manual/backwards.html>
|
51
|
+
#ifdef __GNUC__
|
52
|
+
# if __GNUC__ < 3
|
53
|
+
using ::hash_map;
|
54
|
+
using ::hash_set;
|
55
|
+
using ::hash;
|
56
|
+
# elif __GNUC__ == 3 && __GNUC_MINOR__ == 0
|
57
|
+
using std::hash_map;
|
58
|
+
using std::hash_set;
|
59
|
+
using std::hash;
|
60
|
+
# else
|
61
|
+
using __gnu_cxx::hash_map;
|
62
|
+
using __gnu_cxx::hash_set;
|
63
|
+
using __gnu_cxx::hash;
|
64
|
+
# endif
|
65
|
+
#else
|
66
|
+
using std::hash_map;
|
67
|
+
using std::hash_set;
|
68
|
+
using std::hash;
|
69
|
+
#endif
|
70
|
+
}
|
71
|
+
|
72
|
+
#endif
|
data/ext/sfst/utf8.cc
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: utf8.C */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
+
/* Modified: Wed Sep 29 15:08:34 2010 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include "string.h"
|
13
|
+
#include "utf8.h"
|
14
|
+
|
15
|
+
namespace SFST {

  // Masks that keep the N least significant bits of a byte.
  const unsigned char get3LSbits=7;
  const unsigned char get4LSbits=15;
  const unsigned char get5LSbits=31;
  const unsigned char get6LSbits=63;

  // Byte values with the N most significant bits set. They double as the
  // smallest lead byte of a 2-, 3- and 4-byte sequence (set2..set4) and as
  // the smallest continuation byte (set1).
  const unsigned char set1MSbits=128;
  const unsigned char set2MSbits=192;
  const unsigned char set3MSbits=224;
  const unsigned char set4MSbits=240;
  // 11111xxx: no byte in this range starts a sequence of at most 4 bytes.
  const unsigned char set5MSbits=248;



  /*******************************************************************/
  /*                                                                 */
  /*  int2utf8                                                       */
  /*                                                                 */
  /*  Encodes sym as a NUL-terminated UTF-8 byte sequence of 1 to 4  */
  /*  bytes. The result lives in a static buffer that is overwritten */
  /*  by the next call. Returns NULL if sym >= 2097152 (needs more   */
  /*  than 21 bits).                                                 */
  /*                                                                 */
  /*******************************************************************/

  char *int2utf8( unsigned int sym )

  {
    static unsigned char ch[5];

    if (sym < 128) {
      // 1-byte UTF8 symbol, 7 bits
      ch[0] = (unsigned char)sym;
      ch[1] = 0;
    }

    else if (sym < 2048) {
      // 2-byte UTF8 symbol, 5+6 bits
      ch[0] = (unsigned char)((sym >> 6) | set2MSbits);
      ch[1] = (unsigned char)((sym & get6LSbits) | set1MSbits);
      ch[2] = 0;
    }

    else if (sym < 65536) {
      // 3-byte UTF8 symbol, 4+6+6 bits
      ch[0] = (unsigned char)((sym >> 12) | set3MSbits);
      ch[1] = (unsigned char)(((sym >> 6) & get6LSbits) | set1MSbits);
      ch[2] = (unsigned char)((sym & get6LSbits) | set1MSbits);
      ch[3] = 0;
    }

    else if (sym < 2097152) {
      // 4-byte UTF8 symbol, 3+6+6+6 bits
      ch[0] = (unsigned char)((sym >> 18) | set4MSbits);
      ch[1] = (unsigned char)(((sym >> 12) & get6LSbits) | set1MSbits);
      ch[2] = (unsigned char)(((sym >> 6) & get6LSbits) | set1MSbits);
      ch[3] = (unsigned char)((sym & get6LSbits) | set1MSbits);
      ch[4] = 0;
    }

    else
      return NULL;

    return (char*)ch;
  }


  /*******************************************************************/
  /*                                                                 */
  /*  utf8toint (char **)                                            */
  /*                                                                 */
  /*  Decodes the UTF-8 sequence starting at *s and moves *s to the  */
  /*  byte after it. Returns 0 on error:                             */
  /*   - *s at the terminating NUL or at a stray continuation byte   */
  /*     (10xxxxxx): *s is left unchanged;                           */
  /*   - lead byte 11111xxx: *s is moved past that single byte;      */
  /*   - missing or invalid continuation byte: *s is left pointing   */
  /*     at the offending byte.                                      */
  /*  Overlong encodings are not detected.                           */
  /*                                                                 */
  /*******************************************************************/

  unsigned int utf8toint( char **s )

  {
    int bytes_to_come;
    unsigned int result;
    unsigned char c=(unsigned char)**s;

    if (c == 0)
      return 0; // end of string; never step past the terminator

    if (c >= set5MSbits) { // 11111xxx
      // not a valid lead byte; skip it so that the caller makes progress
      (*s)++;
      return 0;
    }

    else if (c >= set4MSbits) { // 11110xxx
      // start of a four-byte symbol
      bytes_to_come = 3;
      result = c & get3LSbits;
    }

    else if (c >= set3MSbits) { // 1110xxxx
      // start of a three-byte symbol
      bytes_to_come = 2;
      result = c & get4LSbits;
    }

    else if (c >= set2MSbits) { // 110xxxxx
      // start of a two-byte symbol
      bytes_to_come = 1;
      result = c & get5LSbits;
    }

    else if (c < set1MSbits) { // 0xxxxxxx
      // one-byte symbol
      bytes_to_come = 0;
      result = c;
    }

    else
      return 0; // error: 10xxxxxx continuation byte without a lead byte

    while (bytes_to_come > 0) {
      bytes_to_come--;
      (*s)++;
      c = (unsigned char)**s;
      if (c < set2MSbits && c >= set1MSbits) // 10xxxxxx
	{
	  result = (result << 6) | (c & get6LSbits);
	}
      else
	return 0;
    }

    (*s)++;
    return result;
  }


  /*******************************************************************/
  /*                                                                 */
  /*  utf8toint (char *)                                             */
  /*                                                                 */
  /*  Decodes a string that must consist of exactly one UTF-8        */
  /*  symbol. Returns 0 if decoding fails or if bytes remain after   */
  /*  the first symbol.                                              */
  /*                                                                 */
  /*******************************************************************/

  unsigned int utf8toint( char *s )

  {
    unsigned int result = utf8toint( &s );
    if (*s == 0) // all bytes converted?
      return result;
    return 0;
  }

}
|