ferret 0.10.11 → 0.10.12
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -0
- data/Rakefile +1 -1
- data/ext/analysis.c +62 -11
- data/ext/analysis.h +11 -0
- data/ext/bitvector.c +29 -18
- data/ext/{defines.h → config.h} +0 -0
- data/ext/except.h +1 -1
- data/ext/extconf.rb +2 -1
- data/ext/fs_store.c +4 -2
- data/ext/global.h +1 -1
- data/ext/hash.c +15 -12
- data/ext/hash.h +1 -0
- data/ext/helper.c +2 -2
- data/ext/helper.h +1 -1
- data/ext/index.c +4 -2
- data/ext/index.h +2 -2
- data/ext/{mem_pool.c → mempool.c} +1 -1
- data/ext/{mem_pool.h → mempool.h} +0 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/r_analysis.c +200 -22
- data/ext/r_search.c +125 -15
- data/ext/search.c +1 -1
- data/ext/sort.c +1 -1
- data/ext/stopwords.c +2 -3
- data/lib/ferret/index.rb +2 -1
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +62 -0
- data/test/unit/index/tc_index.rb +19 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +7 -0
- metadata +9 -7
data/CHANGELOG
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
|
2
|
+
Fri Oct 13 09:18:31 JST 2006
|
3
|
+
* Changed documentation to state truthfully that FULL_ENGLISH_STOP_WORDS is
|
4
|
+
being used by default in StandardAnalyzer and StopwordFilter.
|
5
|
+
* Removed 'will', 's' and 't' from ENGLISH_STOP_WORDS so that all words in
|
6
|
+
ENGLISH_STOP_WORDS can be found in FULL_ENGLISH_STOP_WORDS, that is
|
7
|
+
ENGLISH_STOP_WORDS is a subset of FULL_ENGLISH_STOP_WORDS.
|
8
|
+
|
9
|
+
Thu Oct 12 23:04:19 JST 2006
|
10
|
+
* Fixed adding SortField to Sort object in Ruby. Garbage collection wasn't
|
11
|
+
working.
|
12
|
+
* Can now set :sort => SortField#new
|
13
|
+
|
1
14
|
Tue Oct 10 14:42:17 JST 2006
|
2
15
|
* Fixed MultiTermDocEnum bug introduced in version 0.10.10 during
|
3
16
|
performance enhancements.
|
data/Rakefile
CHANGED
@@ -41,7 +41,7 @@ SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
|
41
41
|
|
42
42
|
CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles',
|
43
43
|
'.config', 'ext/cferret.c'])
|
44
|
-
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
44
|
+
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', 'ext/mem_pool.*', 'ext/defines.h', EXT_SRC_DEST)
|
45
45
|
POLISH = Rake::FileList.new.include(FileList['**/*.so'], 'ext/Makefile')
|
46
46
|
|
47
47
|
desc "Clean specifically for the release."
|
data/ext/analysis.c
CHANGED
@@ -1078,19 +1078,10 @@ static void sf_destroy_i(TokenStream *ts)
|
|
1078
1078
|
filter_destroy_i(ts);
|
1079
1079
|
}
|
1080
1080
|
|
1081
|
-
static void sf_clone_i_i(void *key, void *value, void *arg)
|
1082
|
-
{
|
1083
|
-
HashTable *word_table = (HashTable *)arg;
|
1084
|
-
char *word = estrdup(key);
|
1085
|
-
(void)value;
|
1086
|
-
h_set(word_table, word, word);
|
1087
|
-
}
|
1088
|
-
|
1089
1081
|
static TokenStream *sf_clone_i(TokenStream *orig_ts)
|
1090
1082
|
{
|
1091
|
-
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(
|
1092
|
-
StopFilt(new_ts)->words
|
1093
|
-
h_each(StopFilt(orig_ts)->words, &sf_clone_i_i, StopFilt(new_ts)->words);
|
1083
|
+
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
|
1084
|
+
REF(StopFilt(new_ts)->words);
|
1094
1085
|
return new_ts;
|
1095
1086
|
}
|
1096
1087
|
|
@@ -1157,6 +1148,66 @@ TokenStream *stop_filter_new(TokenStream *ts)
|
|
1157
1148
|
return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
|
1158
1149
|
}
|
1159
1150
|
|
1151
|
+
/****************************************************************************
|
1152
|
+
* MappingFilter
|
1153
|
+
****************************************************************************/
|
1154
|
+
|
1155
|
+
#define MFilt(filter) ((MappingFilter *)(filter))
|
1156
|
+
|
1157
|
+
static void mf_destroy_i(TokenStream *ts)
|
1158
|
+
{
|
1159
|
+
mulmap_destroy(MFilt(ts)->mapper);
|
1160
|
+
filter_destroy_i(ts);
|
1161
|
+
}
|
1162
|
+
|
1163
|
+
static TokenStream *mf_clone_i(TokenStream *orig_ts)
|
1164
|
+
{
|
1165
|
+
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
|
1166
|
+
REF(MFilt(new_ts)->mapper);
|
1167
|
+
return new_ts;
|
1168
|
+
}
|
1169
|
+
|
1170
|
+
static Token *mf_next(TokenStream *ts)
|
1171
|
+
{
|
1172
|
+
char buf[MAX_WORD_SIZE];
|
1173
|
+
MultiMapper *mapper = MFilt(ts)->mapper;
|
1174
|
+
TokenFilter *tf = TkFilt(ts);
|
1175
|
+
Token *tk = tf->sub_ts->next(tf->sub_ts);
|
1176
|
+
if (tk != NULL) {
|
1177
|
+
tk->len = mulmap_map_len(mapper, buf, tk->text, MAX_WORD_SIZE);
|
1178
|
+
memcpy(tk->text, buf, tk->len + 1);
|
1179
|
+
}
|
1180
|
+
return tk;
|
1181
|
+
}
|
1182
|
+
|
1183
|
+
static TokenStream *mf_reset(TokenStream *ts, char *text)
|
1184
|
+
{
|
1185
|
+
MultiMapper *mm = MFilt(ts)->mapper;
|
1186
|
+
if (mm->d_size == 0) {
|
1187
|
+
mulmap_compile(MFilt(ts)->mapper);
|
1188
|
+
}
|
1189
|
+
filter_reset(ts, text);
|
1190
|
+
return ts;
|
1191
|
+
}
|
1192
|
+
|
1193
|
+
TokenStream *mapping_filter_new(TokenStream *sub_ts)
|
1194
|
+
{
|
1195
|
+
TokenStream *ts = tf_new(MappingFilter, sub_ts);
|
1196
|
+
MFilt(ts)->mapper = mulmap_new();
|
1197
|
+
ts->next = &mf_next;
|
1198
|
+
ts->destroy_i = &mf_destroy_i;
|
1199
|
+
ts->clone_i = &mf_clone_i;
|
1200
|
+
ts->reset = &mf_reset;
|
1201
|
+
return ts;
|
1202
|
+
}
|
1203
|
+
|
1204
|
+
TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
|
1205
|
+
const char *replacement)
|
1206
|
+
{
|
1207
|
+
mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
|
1208
|
+
return ts;
|
1209
|
+
}
|
1210
|
+
|
1160
1211
|
/****************************************************************************
|
1161
1212
|
* HyphenFilter
|
1162
1213
|
****************************************************************************/
|
data/ext/analysis.h
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
|
4
4
|
#include "global.h"
|
5
5
|
#include "hash.h"
|
6
|
+
#include "multimapper.h"
|
6
7
|
#include <wchar.h>
|
7
8
|
|
8
9
|
/****************************************************************************
|
@@ -89,6 +90,12 @@ typedef struct StopFilter
|
|
89
90
|
HashTable *words;
|
90
91
|
} StopFilter;
|
91
92
|
|
93
|
+
typedef struct MappingFilter
|
94
|
+
{
|
95
|
+
TokenFilter super;
|
96
|
+
MultiMapper *mapper;
|
97
|
+
} MappingFilter;
|
98
|
+
|
92
99
|
typedef struct HyphenFilter
|
93
100
|
{
|
94
101
|
TokenFilter super;
|
@@ -150,6 +157,10 @@ extern TokenStream *stop_filter_new(TokenStream *ts);
|
|
150
157
|
extern TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
|
151
158
|
const char *charenc);
|
152
159
|
|
160
|
+
extern TokenStream *mapping_filter_new(TokenStream *ts);
|
161
|
+
extern TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
|
162
|
+
const char *replacement);
|
163
|
+
|
153
164
|
/****************************************************************************
|
154
165
|
*
|
155
166
|
* Analyzer
|
data/ext/bitvector.c
CHANGED
@@ -360,32 +360,53 @@ unsigned long bv_hash(BitVector *bv)
|
|
360
360
|
return hash;
|
361
361
|
}
|
362
362
|
|
363
|
+
static __inline void bv_recapa(BitVector *bv, int new_capa)
|
364
|
+
{
|
365
|
+
if (bv->capa < new_capa) {
|
366
|
+
REALLOC_N(bv->bits, f_u32, new_capa);
|
367
|
+
memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
|
368
|
+
sizeof(f_u32) * (new_capa - bv->capa));
|
369
|
+
bv->capa = new_capa;
|
370
|
+
}
|
371
|
+
}
|
372
|
+
|
363
373
|
static BitVector *bv_and_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
364
374
|
{
|
365
375
|
int i;
|
366
|
-
int
|
367
|
-
int word_size
|
376
|
+
int size;
|
377
|
+
int word_size;
|
368
378
|
int capa = 4;
|
369
|
-
while (capa < word_size) {
|
370
|
-
capa <<= 1;
|
371
|
-
}
|
372
|
-
REALLOC_N(bv->bits, f_u32, capa);
|
373
|
-
bv->capa = capa;
|
374
|
-
bv->size = min_size;
|
375
379
|
|
376
380
|
if (bv1->extends_as_ones && bv2->extends_as_ones) {
|
381
|
+
size = max2(bv1->size, bv2->size);
|
377
382
|
bv->extends_as_ones = true;
|
378
383
|
}
|
384
|
+
else if (bv1->extends_as_ones || bv2->extends_as_ones) {
|
385
|
+
size = max2(bv1->size, bv2->size);
|
386
|
+
bv->extends_as_ones = false;
|
387
|
+
}
|
379
388
|
else {
|
389
|
+
size = min2(bv1->size, bv2->size);
|
380
390
|
bv->extends_as_ones = false;
|
381
391
|
}
|
382
392
|
|
393
|
+
word_size = (size >> 5) + 1;
|
394
|
+
while (capa < word_size) {
|
395
|
+
capa <<= 1;
|
396
|
+
}
|
397
|
+
bv_recapa(bv1, capa);
|
398
|
+
bv_recapa(bv2, capa);
|
399
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
400
|
+
bv->capa = capa;
|
401
|
+
bv->size = size;
|
402
|
+
|
383
403
|
memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
|
384
404
|
sizeof(f_u32) * (capa - word_size));
|
385
405
|
|
386
406
|
for (i = 0; i < word_size; i++) {
|
387
407
|
bv->bits[i] = bv1->bits[i] & bv2->bits[i];
|
388
408
|
}
|
409
|
+
|
389
410
|
bv_recount(bv);
|
390
411
|
return bv;
|
391
412
|
}
|
@@ -400,16 +421,6 @@ BitVector *bv_and_x(BitVector *bv1, BitVector *bv2)
|
|
400
421
|
return bv_and_i(bv1, bv1, bv2);
|
401
422
|
}
|
402
423
|
|
403
|
-
static __inline void bv_recapa(BitVector *bv, int new_capa)
|
404
|
-
{
|
405
|
-
if (bv->capa < new_capa) {
|
406
|
-
REALLOC_N(bv->bits, f_u32, new_capa);
|
407
|
-
memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
|
408
|
-
sizeof(f_u32) * (new_capa - bv->capa));
|
409
|
-
bv->capa = new_capa;
|
410
|
-
}
|
411
|
-
}
|
412
|
-
|
413
424
|
static BitVector *bv_or_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
414
425
|
{
|
415
426
|
int i;
|
data/ext/{defines.h → config.h}
RENAMED
File without changes
|
data/ext/except.h
CHANGED
data/ext/extconf.rb
CHANGED
data/ext/fs_store.c
CHANGED
@@ -223,7 +223,8 @@ static off_t fs_length(Store *store, char *filename)
|
|
223
223
|
static void fso_flush_i(OutStream *os, uchar *src, int len)
|
224
224
|
{
|
225
225
|
if (len != write(os->file.fd, src, len)) {
|
226
|
-
RAISE(IO_ERROR, "flushing src of length %d", len
|
226
|
+
RAISE(IO_ERROR, "flushing src of length %d, <%s>", len,
|
227
|
+
strerror(errno));
|
227
228
|
}
|
228
229
|
}
|
229
230
|
|
@@ -268,7 +269,7 @@ static OutStream *fs_new_output(Store *store, const char *filename)
|
|
268
269
|
static void fsi_read_i(InStream *is, uchar *path, int len)
|
269
270
|
{
|
270
271
|
int fd = is->file.fd;
|
271
|
-
|
272
|
+
off_t pos = is_pos(is);
|
272
273
|
if (pos != lseek(fd, 0, SEEK_CUR)) {
|
273
274
|
lseek(fd, pos, SEEK_SET);
|
274
275
|
}
|
@@ -409,6 +410,7 @@ static HashTable stores = {
|
|
409
410
|
/* fill */ 0,
|
410
411
|
/* used */ 0,
|
411
412
|
/* mask */ HASH_MINSIZE - 1,
|
413
|
+
/* ref_cnt */ 1,
|
412
414
|
/* table */ stores.smalltable,
|
413
415
|
/* smalltable */ {{0, NULL, NULL}},
|
414
416
|
/* lookup */ (h_lookup_ft)&h_lookup_str,
|
data/ext/global.h
CHANGED
data/ext/hash.c
CHANGED
@@ -238,6 +238,7 @@ HashTable *h_new_str(free_ft free_key, free_ft free_value)
|
|
238
238
|
|
239
239
|
ht->free_key_i = free_key != NULL ? free_key : &dummy_free;
|
240
240
|
ht->free_value_i = free_value != NULL ? free_value : &dummy_free;
|
241
|
+
ht->ref_cnt = 1;
|
241
242
|
return ht;
|
242
243
|
}
|
243
244
|
|
@@ -285,23 +286,25 @@ void h_clear(HashTable *ht)
|
|
285
286
|
|
286
287
|
void h_destroy(HashTable *ht)
|
287
288
|
{
|
288
|
-
|
289
|
+
if (--(ht->ref_cnt) <= 0) {
|
290
|
+
h_clear(ht);
|
289
291
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
292
|
+
/* if a new table was created, be sure to free it */
|
293
|
+
if (ht->table != ht->smalltable) {
|
294
|
+
free(ht->table);
|
295
|
+
}
|
294
296
|
|
295
297
|
#ifdef DEBUG
|
296
|
-
free(ht);
|
297
|
-
#else
|
298
|
-
if (num_free_hts < MAX_FREE_HASH_TABLES) {
|
299
|
-
free_hts[num_free_hts++] = ht;
|
300
|
-
}
|
301
|
-
else {
|
302
298
|
free(ht);
|
303
|
-
|
299
|
+
#else
|
300
|
+
if (num_free_hts < MAX_FREE_HASH_TABLES) {
|
301
|
+
free_hts[num_free_hts++] = ht;
|
302
|
+
}
|
303
|
+
else {
|
304
|
+
free(ht);
|
305
|
+
}
|
304
306
|
#endif
|
307
|
+
}
|
305
308
|
}
|
306
309
|
|
307
310
|
void *h_get(HashTable *ht, const void *key)
|
data/ext/hash.h
CHANGED
@@ -46,6 +46,7 @@ typedef struct HashTable
|
|
46
46
|
int fill; /* num Active + num Dummy */
|
47
47
|
int size; /* num Active ie, num keys set */
|
48
48
|
int mask; /* capacity_of_table - 1 */
|
49
|
+
int ref_cnt;
|
49
50
|
|
50
51
|
/* table points to smalltable initially. If the table grows beyond 2/3 of
|
51
52
|
* HASH_MINSIZE it will point to newly malloced memory as it grows. */
|
data/ext/helper.c
CHANGED
@@ -14,13 +14,13 @@ f_i32 float2int(float f)
|
|
14
14
|
{
|
15
15
|
union { f_i32 i; float f; } tmp;
|
16
16
|
tmp.f = f;
|
17
|
-
return
|
17
|
+
return tmp.i;
|
18
18
|
}
|
19
19
|
|
20
20
|
float int2float(f_i32 i32)
|
21
21
|
{
|
22
22
|
union { f_i32 i; float f; } tmp;
|
23
|
-
tmp.i =
|
23
|
+
tmp.i = i32;
|
24
24
|
return tmp.f;
|
25
25
|
}
|
26
26
|
|
data/ext/helper.h
CHANGED
data/ext/index.c
CHANGED
@@ -5350,7 +5350,8 @@ void iw_close(IndexWriter *iw)
|
|
5350
5350
|
free(iw);
|
5351
5351
|
}
|
5352
5352
|
|
5353
|
-
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
5353
|
+
IndexWriter *iw_open(Store *store, volatile Analyzer *analyzer,
|
5354
|
+
const Config *config)
|
5354
5355
|
{
|
5355
5356
|
IndexWriter *iw = ALLOC_AND_ZERO(IndexWriter);
|
5356
5357
|
mutex_init(&iw->mutex, NULL);
|
@@ -5381,7 +5382,8 @@ IndexWriter *iw_open(Store *store, Analyzer *analyzer, const Config *config)
|
|
5381
5382
|
XENDTRY
|
5382
5383
|
|
5383
5384
|
iw->similarity = sim_create_default();
|
5384
|
-
iw->analyzer = analyzer ?
|
5385
|
+
iw->analyzer = analyzer ? (Analyzer *)analyzer
|
5386
|
+
: mb_standard_analyzer_new(true);
|
5385
5387
|
|
5386
5388
|
REF(store);
|
5387
5389
|
return iw;
|
data/ext/index.h
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
#include "hash.h"
|
8
8
|
#include "hashset.h"
|
9
9
|
#include "store.h"
|
10
|
-
#include "
|
10
|
+
#include "mempool.h"
|
11
11
|
#include "similarity.h"
|
12
12
|
#include "bitvector.h"
|
13
13
|
#include "priorityqueue.h"
|
@@ -874,7 +874,7 @@ struct IndexWriter
|
|
874
874
|
};
|
875
875
|
|
876
876
|
extern void index_create(Store *store, FieldInfos *fis);
|
877
|
-
extern IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
877
|
+
extern IndexWriter *iw_open(Store *store, volatile Analyzer *analyzer,
|
878
878
|
const Config *config);
|
879
879
|
extern void iw_delete_term(IndexWriter *iw, const char *field,
|
880
880
|
const char *term);
|
File without changes
|
data/ext/multimapper.c
ADDED
@@ -0,0 +1,310 @@
|
|
1
|
+
#include "multimapper.h"
|
2
|
+
#include "array.h"
|
3
|
+
#include "bitvector.h"
|
4
|
+
#include <string.h>
|
5
|
+
|
6
|
+
#define St(state) ((State *)(state))
|
7
|
+
#define UCtoI(val) ((int)(unsigned char)(val))
|
8
|
+
|
9
|
+
static void state_destroy(State *state)
|
10
|
+
{
|
11
|
+
state->destroy_i(state);
|
12
|
+
}
|
13
|
+
|
14
|
+
typedef struct LetterState
|
15
|
+
{
|
16
|
+
State super;
|
17
|
+
int c;
|
18
|
+
int val;
|
19
|
+
char *mapping;
|
20
|
+
} LetterState;
|
21
|
+
#define LSt(state) ((LetterState *)(state))
|
22
|
+
|
23
|
+
|
24
|
+
static int lstate_next(LetterState *self, int c, int *states)
|
25
|
+
{
|
26
|
+
if (c == self->c) {
|
27
|
+
states[0] = self->val;
|
28
|
+
return 1;
|
29
|
+
}
|
30
|
+
else {
|
31
|
+
return 0;
|
32
|
+
}
|
33
|
+
}
|
34
|
+
|
35
|
+
static int lstate_is_match(LetterState *self, char **mapping)
|
36
|
+
{
|
37
|
+
if (self->val < 0) {
|
38
|
+
*mapping = self->mapping;
|
39
|
+
return self->val;
|
40
|
+
}
|
41
|
+
else {
|
42
|
+
return 0;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
static LetterState *lstate_new(int c, int val)
|
47
|
+
{
|
48
|
+
LetterState *self = ALLOC(LetterState);
|
49
|
+
self->c = c;
|
50
|
+
self->val = val;
|
51
|
+
self->mapping = NULL;
|
52
|
+
St(self)->next = (int (*)(State *, int, int *))&lstate_next;
|
53
|
+
St(self)->destroy_i = (void (*)(State *))&free;
|
54
|
+
St(self)->is_match = (int (*)(State *, char **))&lstate_is_match;
|
55
|
+
return self;
|
56
|
+
}
|
57
|
+
|
58
|
+
typedef struct NonDeterministicState
|
59
|
+
{
|
60
|
+
State super;
|
61
|
+
int *states[256];
|
62
|
+
int size[256];
|
63
|
+
int capa[256];
|
64
|
+
} NonDeterministicState;
|
65
|
+
|
66
|
+
static int ndstate_next(NonDeterministicState *self, int c, int *states)
|
67
|
+
{
|
68
|
+
int size = self->size[c];
|
69
|
+
memcpy(states, self->states[c], size * sizeof(int));
|
70
|
+
return size;
|
71
|
+
}
|
72
|
+
|
73
|
+
static void ndstate_add(NonDeterministicState *self, int c, int state)
|
74
|
+
{
|
75
|
+
if (self->capa[c] <= self->size[c]) {
|
76
|
+
if (self->capa[c] == 0) {
|
77
|
+
self->capa[c] = 4;
|
78
|
+
}
|
79
|
+
else {
|
80
|
+
self->capa[c] <<= 1;
|
81
|
+
}
|
82
|
+
REALLOC_N(self->states[c], int, self->capa[c]);
|
83
|
+
}
|
84
|
+
self->states[c][self->size[c]++] = state;
|
85
|
+
}
|
86
|
+
|
87
|
+
static void ndstate_destroy_i(NonDeterministicState *self)
|
88
|
+
{
|
89
|
+
int i;
|
90
|
+
for (i = 0; i < 256; i++) {
|
91
|
+
free(self->states[i]);
|
92
|
+
}
|
93
|
+
free(self);
|
94
|
+
}
|
95
|
+
|
96
|
+
static int ndstate_is_match(State *self, char **mapping)
|
97
|
+
{
|
98
|
+
(void)self; (void)mapping;
|
99
|
+
return 0;
|
100
|
+
}
|
101
|
+
|
102
|
+
static NonDeterministicState *ndstate_new()
|
103
|
+
{
|
104
|
+
NonDeterministicState *self = ALLOC_AND_ZERO(NonDeterministicState);
|
105
|
+
St(self)->next = (int (*)(State *, int, int *))&ndstate_next;
|
106
|
+
St(self)->destroy_i = (void (*)(State *))&ndstate_destroy_i;
|
107
|
+
St(self)->is_match = &ndstate_is_match;
|
108
|
+
return self;
|
109
|
+
}
|
110
|
+
|
111
|
+
MultiMapper *mulmap_new()
|
112
|
+
{
|
113
|
+
MultiMapper *self = ALLOC_AND_ZERO(MultiMapper);
|
114
|
+
self->capa = 128;
|
115
|
+
self->mappings = ALLOC_N(Mapping *, 128);
|
116
|
+
self->d_capa = 128;
|
117
|
+
self->dstates = ALLOC_N(DeterministicState *, 128);
|
118
|
+
self->dstates_map = NULL;
|
119
|
+
self->nstates = NULL;
|
120
|
+
self->ref_cnt = 1;
|
121
|
+
return self;
|
122
|
+
}
|
123
|
+
|
124
|
+
static __inline void mulmap_free_dstates(MultiMapper *self)
|
125
|
+
{
|
126
|
+
if (self->d_size > 0) {
|
127
|
+
int i;
|
128
|
+
for (i = self->d_size - 1; i >= 0; i--) {
|
129
|
+
free(self->dstates[i]);
|
130
|
+
}
|
131
|
+
self->d_size = 0;
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
|
136
|
+
{
|
137
|
+
if (pattern == NULL || pattern[0] == '\0') {
|
138
|
+
RAISE(ARG_ERROR, "Tried to add empty pattern to multi_mapper");
|
139
|
+
}
|
140
|
+
else {
|
141
|
+
Mapping *mapping = ALLOC(Mapping);
|
142
|
+
if (self->size >= self->capa) {
|
143
|
+
self->capa <<= 1;
|
144
|
+
REALLOC_N(self->mappings, Mapping *, self->capa);
|
145
|
+
}
|
146
|
+
mapping->pattern = estrdup(pattern);
|
147
|
+
mapping->replacement = estrdup(rep);
|
148
|
+
self->mappings[self->size++] = mapping;
|
149
|
+
mulmap_free_dstates(self);
|
150
|
+
}
|
151
|
+
}
|
152
|
+
|
153
|
+
|
154
|
+
static __inline void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
|
155
|
+
{
|
156
|
+
int i;
|
157
|
+
for (i = cnt - 1; i >= 0; i--) {
|
158
|
+
bv_set(bv, states[i]);
|
159
|
+
}
|
160
|
+
}
|
161
|
+
|
162
|
+
static DeterministicState *mulmap_process_state(MultiMapper *self, BitVector *bv)
|
163
|
+
{
|
164
|
+
DeterministicState *current_state = h_get(self->dstates_map, bv);
|
165
|
+
if (current_state == NULL) {
|
166
|
+
int bit, i;
|
167
|
+
int match_len = 0, max_match_len = 0;
|
168
|
+
State *start = self->nstates[0];
|
169
|
+
DeterministicState *start_ds;
|
170
|
+
current_state = ALLOC_AND_ZERO(DeterministicState);
|
171
|
+
h_set(self->dstates_map, bv, current_state);
|
172
|
+
if (self->d_size >= self->d_capa) {
|
173
|
+
self->d_capa <<= 1;
|
174
|
+
REALLOC_N(self->dstates, DeterministicState *, self->d_capa);
|
175
|
+
}
|
176
|
+
self->dstates[self->d_size++] = current_state;
|
177
|
+
start_ds = self->dstates[0];
|
178
|
+
for (i = 0; i <= 256; i++) {
|
179
|
+
current_state->next[i] = start_ds;
|
180
|
+
}
|
181
|
+
while ((bit = bv_scan_next(bv)) >= 0) {
|
182
|
+
char *mapping;
|
183
|
+
State *st = self->nstates[bit];
|
184
|
+
if ((match_len = -st->is_match(st, &mapping)) > max_match_len) {
|
185
|
+
current_state->longest_match = max_match_len = match_len;
|
186
|
+
current_state->mapping = mapping;
|
187
|
+
current_state->mapping_len = strlen(mapping);
|
188
|
+
}
|
189
|
+
}
|
190
|
+
for (i = self->a_size - 1; i >= 0; i--) {
|
191
|
+
unsigned char c = self->alphabet[i];
|
192
|
+
BitVector *nxt_bv = bv_new_capa(self->nsize);
|
193
|
+
mulmap_bv_set_states(nxt_bv, self->next_states,
|
194
|
+
start->next(start, (int)c, self->next_states));
|
195
|
+
bv_scan_reset(bv);
|
196
|
+
while ((bit = bv_scan_next(bv)) >= 0) {
|
197
|
+
State *state = self->nstates[bit];
|
198
|
+
mulmap_bv_set_states(nxt_bv, self->next_states,
|
199
|
+
state->next(state, (int)c, self->next_states));
|
200
|
+
}
|
201
|
+
current_state->next[(int)c] = mulmap_process_state(self, nxt_bv);
|
202
|
+
}
|
203
|
+
}
|
204
|
+
else {
|
205
|
+
bv_destroy(bv);
|
206
|
+
}
|
207
|
+
return current_state;
|
208
|
+
}
|
209
|
+
|
210
|
+
void mulmap_compile(MultiMapper *self)
|
211
|
+
{
|
212
|
+
NonDeterministicState *start = ndstate_new();
|
213
|
+
int i, j;
|
214
|
+
int size = 1;
|
215
|
+
int capa = 128;
|
216
|
+
LetterState *ls;
|
217
|
+
State **nstates = ALLOC_N(State *, capa);
|
218
|
+
Mapping **mappings = self->mappings;
|
219
|
+
unsigned char alphabet[256];
|
220
|
+
nstates[0] = (State *)start;
|
221
|
+
memset(alphabet, 0, 256);
|
222
|
+
|
223
|
+
for (i = self->size - 1; i >= 0; i--) {
|
224
|
+
const char *pattern = mappings[i]->pattern;
|
225
|
+
const int plen = (int)strlen(pattern);
|
226
|
+
ndstate_add(start, UCtoI(pattern[0]), size);
|
227
|
+
if (size + plen + 1 >= capa) {
|
228
|
+
capa <<= 2;
|
229
|
+
REALLOC_N(nstates, State *, capa);
|
230
|
+
}
|
231
|
+
for (j = 0; j < plen; j++) {
|
232
|
+
alphabet[UCtoI(pattern[j])] = 1;
|
233
|
+
size += 1;
|
234
|
+
nstates[size-1] = (State *)lstate_new(UCtoI(pattern[j+1]), size);
|
235
|
+
}
|
236
|
+
ls = LSt(nstates[size-1]);
|
237
|
+
ls->mapping = mappings[i]->replacement;
|
238
|
+
ls->val = -plen;
|
239
|
+
ls->c = -1;
|
240
|
+
}
|
241
|
+
for (i = j = 0; i < 256; i++) {
|
242
|
+
if (alphabet[i]) self->alphabet[j++] = i;
|
243
|
+
}
|
244
|
+
self->a_size = j;
|
245
|
+
mulmap_free_dstates(self);
|
246
|
+
self->nstates = nstates;
|
247
|
+
self->nsize = size;
|
248
|
+
self->next_states = ALLOC_N(int, size);
|
249
|
+
self->dstates_map = h_new((hash_ft)&bv_hash, (eq_ft)&bv_eq,
|
250
|
+
(free_ft)&bv_destroy, (free_ft)NULL);
|
251
|
+
mulmap_process_state(self, bv_new_capa(0));
|
252
|
+
h_destroy(self->dstates_map);
|
253
|
+
for (i = size - 1; i >= 0; i--) {
|
254
|
+
state_destroy(nstates[i]);
|
255
|
+
}
|
256
|
+
free(self->next_states);
|
257
|
+
free(nstates);
|
258
|
+
}
|
259
|
+
|
260
|
+
int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa)
|
261
|
+
{
|
262
|
+
DeterministicState *start = self->dstates[0];
|
263
|
+
DeterministicState *state = start;
|
264
|
+
char *s = from, *d = to, *end = to + capa - 1;
|
265
|
+
if (self->d_size == 0) {
|
266
|
+
RAISE(STATE_ERROR, "You forgot to compile your MultiMapper");
|
267
|
+
}
|
268
|
+
while (*s && d < end) {
|
269
|
+
state = state->next[UCtoI(*s)];
|
270
|
+
if (state->mapping) {
|
271
|
+
int len = state->mapping_len;
|
272
|
+
d -= (state->longest_match - 1);
|
273
|
+
if ((d + len) > end) {
|
274
|
+
len = end - d;
|
275
|
+
}
|
276
|
+
memcpy(d, state->mapping, len);
|
277
|
+
d += len;
|
278
|
+
state = start;
|
279
|
+
}
|
280
|
+
else {
|
281
|
+
*(d++) = *s;
|
282
|
+
}
|
283
|
+
s++;
|
284
|
+
}
|
285
|
+
*d = '\0';
|
286
|
+
return d - to;
|
287
|
+
}
|
288
|
+
|
289
|
+
char *mulmap_map(MultiMapper *self, char *to, char *from, int capa)
|
290
|
+
{
|
291
|
+
mulmap_map_len(self, to, from, capa);
|
292
|
+
return to;
|
293
|
+
}
|
294
|
+
|
295
|
+
void mulmap_destroy(MultiMapper *self)
|
296
|
+
{
|
297
|
+
if (--(self->ref_cnt) <= 0) {
|
298
|
+
int i;
|
299
|
+
mulmap_free_dstates(self);
|
300
|
+
for (i = self->size - 1; i >= 0; i--) {
|
301
|
+
Mapping *mapping = self->mappings[i];
|
302
|
+
free(mapping->pattern);
|
303
|
+
free(mapping->replacement);
|
304
|
+
free(mapping);
|
305
|
+
}
|
306
|
+
free(self->mappings);
|
307
|
+
free(self->dstates);
|
308
|
+
free(self);
|
309
|
+
}
|
310
|
+
}
|