ruby-sfst 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
@@ -14,19 +14,19 @@ VALUE mSFST = Qnil;
|
|
14
14
|
VALUE mCompactTransducer = Qnil;
|
15
15
|
VALUE mRegularTransducer = Qnil;
|
16
16
|
|
17
|
-
static void compact_transducer_free(CompactTransducer *t)
|
17
|
+
static void compact_transducer_free(SFST::CompactTransducer *t)
|
18
18
|
{
|
19
19
|
if (t)
|
20
20
|
delete t;
|
21
21
|
}
|
22
22
|
|
23
|
-
static void compact_transducer_mark(CompactTransducer *t)
|
23
|
+
static void compact_transducer_mark(SFST::CompactTransducer *t)
|
24
24
|
{
|
25
25
|
}
|
26
26
|
|
27
27
|
static VALUE compact_transducer_alloc(VALUE klass)
|
28
28
|
{
|
29
|
-
CompactTransducer *t = NULL;
|
29
|
+
SFST::CompactTransducer *t = NULL;
|
30
30
|
|
31
31
|
return Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, t);
|
32
32
|
}
|
@@ -34,7 +34,7 @@ static VALUE compact_transducer_alloc(VALUE klass)
|
|
34
34
|
static VALUE compact_transducer_init(VALUE obj, VALUE filename)
|
35
35
|
{
|
36
36
|
FILE *file;
|
37
|
-
CompactTransducer *t;
|
37
|
+
SFST::CompactTransducer *t;
|
38
38
|
|
39
39
|
file = fopen(RSTRING_PTR(filename), "rb");
|
40
40
|
|
@@ -43,7 +43,7 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
|
|
43
43
|
}
|
44
44
|
|
45
45
|
try {
|
46
|
-
t = new CompactTransducer(file);
|
46
|
+
t = new SFST::CompactTransducer(file);
|
47
47
|
fclose(file);
|
48
48
|
}
|
49
49
|
catch (const char *p) {
|
@@ -57,13 +57,13 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
|
|
57
57
|
static VALUE compact_transducer_analyze(VALUE self, VALUE string)
|
58
58
|
{
|
59
59
|
VALUE accepted = Qfalse;
|
60
|
-
CompactTransducer *t;
|
60
|
+
SFST::CompactTransducer *t;
|
61
61
|
|
62
62
|
Check_Type(string, T_STRING);
|
63
63
|
|
64
|
-
Data_Get_Struct(self, CompactTransducer, t);
|
64
|
+
Data_Get_Struct(self, SFST::CompactTransducer, t);
|
65
65
|
|
66
|
-
std::vector<CAnalysis> analyses;
|
66
|
+
std::vector<SFST::CAnalysis> analyses;
|
67
67
|
t->analyze_string(RSTRING_PTR(string), analyses);
|
68
68
|
|
69
69
|
for (size_t k = 0; k < analyses.size(); k++) {
|
@@ -79,19 +79,19 @@ static VALUE compact_transducer_analyze(VALUE self, VALUE string)
|
|
79
79
|
return accepted;
|
80
80
|
}
|
81
81
|
|
82
|
-
static void regular_transducer_free(Transducer *t)
|
82
|
+
static void regular_transducer_free(SFST::Transducer *t)
|
83
83
|
{
|
84
84
|
if (t)
|
85
85
|
delete t;
|
86
86
|
}
|
87
87
|
|
88
|
-
static void regular_transducer_mark(Transducer *t)
|
88
|
+
static void regular_transducer_mark(SFST::Transducer *t)
|
89
89
|
{
|
90
90
|
}
|
91
91
|
|
92
92
|
static VALUE regular_transducer_alloc(VALUE klass)
|
93
93
|
{
|
94
|
-
Transducer *t = NULL;
|
94
|
+
SFST::Transducer *t = NULL;
|
95
95
|
|
96
96
|
return Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, t);
|
97
97
|
}
|
@@ -99,7 +99,7 @@ static VALUE regular_transducer_alloc(VALUE klass)
|
|
99
99
|
static VALUE regular_transducer_init(VALUE obj, VALUE filename)
|
100
100
|
{
|
101
101
|
FILE *file;
|
102
|
-
Transducer *t;
|
102
|
+
SFST::Transducer *t;
|
103
103
|
|
104
104
|
file = fopen(RSTRING_PTR(filename), "rb");
|
105
105
|
|
@@ -108,7 +108,7 @@ static VALUE regular_transducer_init(VALUE obj, VALUE filename)
|
|
108
108
|
}
|
109
109
|
|
110
110
|
try {
|
111
|
-
t = new Transducer(file);
|
111
|
+
t = new SFST::Transducer(file);
|
112
112
|
fclose(file);
|
113
113
|
}
|
114
114
|
catch (const char *p) {
|
@@ -123,7 +123,7 @@ using std::vector;
|
|
123
123
|
|
124
124
|
enum { BOTH, LOWER, UPPER };
|
125
125
|
|
126
|
-
static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
|
126
|
+
static VALUE _alphabet_to_rb_str(SFST::Alphabet *a, SFST::Character c)
|
127
127
|
{
|
128
128
|
const char *s = a->code2symbol(c);
|
129
129
|
|
@@ -151,68 +151,79 @@ static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
|
|
151
151
|
return ary;
|
152
152
|
}
|
153
153
|
|
154
|
-
|
155
|
-
|
156
|
-
{
|
157
|
-
if (node->is_final())
|
158
|
-
rb_yield(a);
|
159
|
-
|
160
|
-
visitations[node]++;
|
161
|
-
|
162
|
-
vector<Arc*> arc;
|
163
|
-
for (ArcsIter p(node->arcs()); p; p++) {
|
164
|
-
Arc *a = p;
|
165
|
-
Node *n = a->target_node();
|
166
|
-
size_t i;
|
167
|
-
for (i = 0; i < arc.size(); i++)
|
168
|
-
if (visitations[n] < visitations[arc[i]->target_node()])
|
169
|
-
break;
|
170
|
-
arc.push_back(NULL);
|
171
|
-
for (size_t k = arc.size() - 1; k > i; k--)
|
172
|
-
arc[k] = arc[k - 1];
|
173
|
-
arc[i] = a;
|
174
|
-
}
|
154
|
+
class Gen {
|
155
|
+
public:
|
175
156
|
|
176
|
-
|
177
|
-
|
178
|
-
|
157
|
+
SFST::Node *node;
|
158
|
+
SFST::Index previous;
|
159
|
+
SFST::Label label;
|
179
160
|
|
180
|
-
|
181
|
-
|
182
|
-
lower = _alphabet_to_rb_str(&(t->alphabet), lc);
|
183
|
-
} else
|
184
|
-
lower = Qnil;
|
161
|
+
Gen(SFST::Node *n, SFST::Label l = SFST::Label::epsilon, SFST::Index p = SFST::undef):
|
162
|
+
node(n), previous(p), label(l) {}
|
185
163
|
|
186
|
-
|
187
|
-
if (
|
188
|
-
|
189
|
-
|
190
|
-
|
164
|
+
void print(vector<Gen> &paths, VALUE a, int levels, bool epsilons, SFST::Transducer *t) {
|
165
|
+
if (previous != SFST::undef) {
|
166
|
+
paths[previous].print(paths, a, levels, epsilons, t);
|
167
|
+
|
168
|
+
SFST::Label l = label;
|
169
|
+
|
170
|
+
VALUE lower, upper;
|
171
|
+
|
172
|
+
SFST::Character lc = l.lower_char();
|
173
|
+
if ((levels == BOTH || levels == LOWER) && (epsilons || lc != SFST::Label::epsilon)) {
|
174
|
+
lower = _alphabet_to_rb_str(&(t->alphabet), lc);
|
175
|
+
} else
|
176
|
+
lower = Qnil;
|
177
|
+
|
178
|
+
SFST::Character uc = l.upper_char();
|
179
|
+
if ((levels == BOTH || levels == UPPER) && (epsilons || uc != SFST::Label::epsilon)) {
|
180
|
+
upper = _alphabet_to_rb_str(&(t->alphabet), uc);
|
181
|
+
} else
|
182
|
+
upper = Qnil;
|
191
183
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
184
|
+
switch (levels) {
|
185
|
+
case BOTH:
|
186
|
+
rb_ary_push_pair(a, lower, upper);
|
187
|
+
break;
|
196
188
|
|
197
|
-
|
198
|
-
|
199
|
-
|
189
|
+
case UPPER:
|
190
|
+
rb_ary_push(a, upper);
|
191
|
+
break;
|
200
192
|
|
201
|
-
|
202
|
-
|
203
|
-
|
193
|
+
case LOWER:
|
194
|
+
rb_ary_push(a, lower);
|
195
|
+
break;
|
196
|
+
}
|
204
197
|
}
|
198
|
+
}
|
199
|
+
};
|
200
|
+
|
201
|
+
static void _generate(SFST::Transducer *t, int levels, bool epsilons)
|
202
|
+
{
|
203
|
+
vector<Gen> paths;
|
204
|
+
paths.push_back(Gen(t->root_node()));
|
205
|
+
|
206
|
+
for (size_t i = 0; i < paths.size(); i++) {
|
207
|
+
Gen &gen = paths[i];
|
208
|
+
SFST::Node *node = gen.node;
|
205
209
|
|
206
|
-
|
210
|
+
if (node->is_final()) {
|
211
|
+
VALUE a = rb_ary_new();
|
212
|
+
gen.print(paths, a, levels, epsilons, t);
|
213
|
+
rb_yield(a);
|
214
|
+
}
|
207
215
|
|
208
|
-
|
216
|
+
for (SFST::ArcsIter p(node->arcs()); p; p++) {
|
217
|
+
SFST::Arc *arc = p;
|
218
|
+
paths.push_back(Gen(arc->target_node(), arc->label(), (SFST::Index)i));
|
219
|
+
}
|
209
220
|
}
|
210
221
|
}
|
211
222
|
|
212
223
|
static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
|
213
224
|
{
|
214
|
-
Transducer *t;
|
215
|
-
Data_Get_Struct(self, Transducer, t);
|
225
|
+
SFST::Transducer *t;
|
226
|
+
Data_Get_Struct(self, SFST::Transducer, t);
|
216
227
|
|
217
228
|
static ID id_upper = rb_intern("upper");
|
218
229
|
static ID id_lower = rb_intern("lower");
|
@@ -246,8 +257,8 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
|
|
246
257
|
if (!rb_block_given_p())
|
247
258
|
rb_raise(rb_eRuntimeError, "block expected");
|
248
259
|
|
249
|
-
|
250
|
-
|
260
|
+
SFST::Transducer *a2;
|
261
|
+
|
251
262
|
switch (levels) {
|
252
263
|
case UPPER:
|
253
264
|
a2 = &(t->upper_level().minimise());
|
@@ -259,13 +270,13 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
|
|
259
270
|
a2 = t;
|
260
271
|
break;
|
261
272
|
}
|
262
|
-
|
263
|
-
|
273
|
+
|
274
|
+
_generate(a2, levels, epsilons);
|
264
275
|
|
265
276
|
return Qnil;
|
266
277
|
}
|
267
278
|
|
268
|
-
static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
|
279
|
+
static bool _regular_transducer_yield(SFST::Transducer *t, SFST::Node *node, VALUE result_array)
|
269
280
|
{
|
270
281
|
int accepted = 0;
|
271
282
|
|
@@ -284,9 +295,9 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
|
|
284
295
|
accepted = 1;
|
285
296
|
}
|
286
297
|
|
287
|
-
for (ArcsIter i(node->arcs()); i; i++) {
|
288
|
-
Arc *arc = i;
|
289
|
-
Label l = arc->label();
|
298
|
+
for (SFST::ArcsIter i(node->arcs()); i; i++) {
|
299
|
+
SFST::Arc *arc = i;
|
300
|
+
SFST::Label l = arc->label();
|
290
301
|
|
291
302
|
rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));
|
292
303
|
|
@@ -300,10 +311,10 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
|
|
300
311
|
return accepted == 1 ? true : false;
|
301
312
|
}
|
302
313
|
|
303
|
-
static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
|
314
|
+
static VALUE _regular_transducer_analyze_or_generate(SFST::Transducer *t, VALUE string, bool generate)
|
304
315
|
{
|
305
|
-
Transducer *a2, *a3;
|
306
|
-
Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
|
316
|
+
SFST::Transducer *a2, *a3;
|
317
|
+
SFST::Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
|
307
318
|
if (generate) {
|
308
319
|
a2 = &(a1 || *t);
|
309
320
|
a3 = &(a2->upper_level());
|
@@ -324,23 +335,23 @@ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string
|
|
324
335
|
|
325
336
|
static VALUE regular_transducer_generate(VALUE self, VALUE string)
|
326
337
|
{
|
327
|
-
Transducer *t;
|
338
|
+
SFST::Transducer *t;
|
328
339
|
Check_Type(string, T_STRING);
|
329
|
-
Data_Get_Struct(self, Transducer, t);
|
340
|
+
Data_Get_Struct(self, SFST::Transducer, t);
|
330
341
|
return _regular_transducer_analyze_or_generate(t, string, true);
|
331
342
|
}
|
332
343
|
|
333
344
|
static VALUE regular_transducer_analyze(VALUE self, VALUE string)
|
334
345
|
{
|
335
|
-
Transducer *t;
|
346
|
+
SFST::Transducer *t;
|
336
347
|
Check_Type(string, T_STRING);
|
337
|
-
Data_Get_Struct(self, Transducer, t);
|
348
|
+
Data_Get_Struct(self, SFST::Transducer, t);
|
338
349
|
return _regular_transducer_analyze_or_generate(t, string, false);
|
339
350
|
}
|
340
351
|
|
341
352
|
extern "C"
|
342
353
|
|
343
|
-
void
|
354
|
+
void Init_sfst(void)
|
344
355
|
{
|
345
356
|
mSFST = rb_define_module("SFST");
|
346
357
|
|
data/ext/sfst/sgi.h
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* File: sgi.h */
|
4
|
+
/* Author: Helmut Schmid */
|
5
|
+
/* Purpose: */
|
6
|
+
/* Created: Thu Sep 11 15:58:25 2008 */
|
7
|
+
/* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
|
8
|
+
/* Modified: Wed May 26 12:54:00 2010 (hfst) */
|
9
|
+
/*******************************************************************/
|
10
|
+
|
11
|
+
#ifndef _SGI_INCLUDED
|
12
|
+
#define _SGI_INCLUDED
|
13
|
+
|
14
|
+
#if HAVE_CONFIG_H
|
15
|
+
# include <config.h>
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#if HAVE_BACKWARD_HASH_MAP
|
19
|
+
# include <backward/hash_map>
|
20
|
+
#elif HAVE_EXT_HASH_MAP
|
21
|
+
# include <ext/hash_map>
|
22
|
+
#elif HAVE_HASH_MAP
|
23
|
+
# include <hash_map>
|
24
|
+
#elif SGIext
|
25
|
+
# include <ext/hash_map>
|
26
|
+
#elif SGI__gnu_cxx
|
27
|
+
# include <ext/hash_map>
|
28
|
+
#else
|
29
|
+
# warning "unknown hash_map"
|
30
|
+
# include <hash_map>
|
31
|
+
#endif
|
32
|
+
#if HAVE_BACKWARD_HASH_SET
|
33
|
+
# include <backward/hash_set>
|
34
|
+
#elif HAVE_EXT_HASH_SET
|
35
|
+
# include <ext/hash_set>
|
36
|
+
#elif HAVE_HASH_SET
|
37
|
+
# include <hash_set>
|
38
|
+
#elif SGIext
|
39
|
+
# include <ext/hash_set>
|
40
|
+
#elif SGI__gnu_cxx
|
41
|
+
# include <ext/hash_set>
|
42
|
+
#else
|
43
|
+
# warning "missing hash_set"
|
44
|
+
# include <hash_set>
|
45
|
+
#endif
|
46
|
+
|
47
|
+
// Hfst addition
|
48
|
+
namespace SFST
|
49
|
+
{
|
50
|
+
// from <http://gcc.gnu.org/onlinedocs/libstdc++/manual/backwards.html>
|
51
|
+
#ifdef __GNUC__
|
52
|
+
# if __GNUC__ < 3
|
53
|
+
using ::hash_map;
|
54
|
+
using ::hash_set;
|
55
|
+
using ::hash;
|
56
|
+
# elif __GNUC__ == 3 && __GNUC_MINOR__ == 0
|
57
|
+
using std::hash_map;
|
58
|
+
using std::hash_set;
|
59
|
+
using std::hash;
|
60
|
+
# else
|
61
|
+
using __gnu_cxx::hash_map;
|
62
|
+
using __gnu_cxx::hash_set;
|
63
|
+
using __gnu_cxx::hash;
|
64
|
+
# endif
|
65
|
+
#else
|
66
|
+
using std::hash_map;
|
67
|
+
using std::hash_set;
|
68
|
+
using std::hash;
|
69
|
+
#endif
|
70
|
+
}
|
71
|
+
|
72
|
+
#endif
|
data/ext/sfst/utf8.cc
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: utf8.C */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
+
/* Modified: Wed Sep 29 15:08:34 2010 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include "string.h"
|
13
|
+
#include "utf8.h"
|
14
|
+
|
15
|
+
namespace SFST {

  // Masks selecting the N least significant (payload) bits of a UTF-8 byte.
  const unsigned char get3LSbits=7;
  const unsigned char get4LSbits=15;
  const unsigned char get5LSbits=31;
  const unsigned char get6LSbits=63;

  // Bit patterns with the N most significant bits set (UTF-8 byte markers).
  const unsigned char set1MSbits=128;  // 10000000
  const unsigned char set2MSbits=192;  // 11000000
  const unsigned char set3MSbits=224;  // 11100000
  const unsigned char set4MSbits=240;  // 11110000



  /*******************************************************************/
  /*                                                                 */
  /*  int2utf8                                                       */
  /*                                                                 */
  /*  Encodes the code sym as a NUL-terminated UTF-8 byte sequence.  */
  /*  Returns a pointer to a static buffer which is overwritten by   */
  /*  the next call, or NULL if sym needs more than 21 bits.         */
  /*                                                                 */
  /*******************************************************************/

  char *int2utf8( unsigned int sym )

  {
    static unsigned char ch[5];

    if (sym < 128) {
      // 1-byte UTF8 symbol, 7 bits
      ch[0] = (unsigned char)sym;
      ch[1] = 0;
    }

    else if (sym < 2048) {
      // 2-byte UTF8 symbol, 5+6 bits
      ch[0] = (unsigned char)((sym >> 6) | set2MSbits);
      ch[1] = (unsigned char)((sym & get6LSbits) | set1MSbits);
      ch[2] = 0;
    }

    else if (sym < 65536) {
      // 3-byte UTF8 symbol, 4+6+6 bits
      ch[0] = (unsigned char)((sym >> 12) | set3MSbits);
      ch[1] = (unsigned char)(((sym >> 6) & get6LSbits) | set1MSbits);
      ch[2] = (unsigned char)((sym & get6LSbits) | set1MSbits);
      ch[3] = 0;
    }

    else if (sym < 2097152) {
      // 4-byte UTF8 symbol, 3+6+6+6 bits
      ch[0] = (unsigned char)((sym >> 18) | set4MSbits);
      ch[1] = (unsigned char)(((sym >> 12) & get6LSbits) | set1MSbits);
      ch[2] = (unsigned char)(((sym >> 6) & get6LSbits) | set1MSbits);
      ch[3] = (unsigned char)((sym & get6LSbits) | set1MSbits);
      ch[4] = 0;
    }

    else
      return NULL;  // does not fit into four UTF-8 bytes

    return (char*)ch;
  }


  /*******************************************************************/
  /*                                                                 */
  /*  utf8toint (cursor version)                                     */
  /*                                                                 */
  /*  Decodes the UTF-8 symbol starting at *s and returns its code.  */
  /*  On success *s is advanced past the decoded symbol.             */
  /*  On error 0 is returned and *s is left at the offending byte.   */
  /*  A NUL byte decodes to 0 as well and is also stepped over, so   */
  /*  callers must not call this function at the end of the string.  */
  /*                                                                 */
  /*******************************************************************/

  unsigned int utf8toint( char **s )

  {
    int bytes_to_come;
    unsigned int result;
    unsigned char c=(unsigned char)**s;

    if (c >= set4MSbits) { // 1111xxxx
      // start of a four-byte symbol
      // (the lead bytes 0xF8-0xFF are not rejected here)
      bytes_to_come = 3;
      result = c & get3LSbits;
    }

    else if (c >= set3MSbits) { // 1110xxxx
      // start of a three-byte symbol
      bytes_to_come = 2;
      result = c & get4LSbits;
    }

    else if (c >= set2MSbits) { // 110xxxxx
      // start of a two-byte symbol
      bytes_to_come = 1;
      result = c & get5LSbits;
    }

    else if (c < set1MSbits) { // 0xxxxxxx
      // one-byte symbol
      bytes_to_come = 0;
      result = c;
    }

    else
      return 0; // error: 10xxxxxx is a continuation byte, not a start byte

    while (bytes_to_come > 0) {
      bytes_to_come--;
      (*s)++;
      c = (unsigned char)**s;
      if (c < set2MSbits && c >= set1MSbits) // 10xxxxxx
	{
	  result = (result << 6) | (c & get6LSbits);
	}
      else
	return 0; // error: missing continuation byte (or end of string)
    }

    (*s)++;
    return result;
  }


  /*******************************************************************/
  /*                                                                 */
  /*  utf8toint (whole-string version)                               */
  /*                                                                 */
  /*  Returns the code of the UTF-8 symbol stored in s if s consists */
  /*  of exactly one symbol, and 0 otherwise (empty string, invalid  */
  /*  encoding, or trailing bytes after the first symbol).           */
  /*                                                                 */
  /*******************************************************************/

  unsigned int utf8toint( char *s )

  {
    // The cursor version steps over a NUL byte, so testing *s after
    // decoding an empty string would read past the terminator.
    if (*s == 0)
      return 0;

    unsigned int result = utf8toint( &s );
    if (*s == 0) // all bytes converted?
      return result;
    return 0;
  }

}
|