ferret 0.10.11 → 0.10.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -0
- data/Rakefile +1 -1
- data/ext/analysis.c +62 -11
- data/ext/analysis.h +11 -0
- data/ext/bitvector.c +29 -18
- data/ext/{defines.h → config.h} +0 -0
- data/ext/except.h +1 -1
- data/ext/extconf.rb +2 -1
- data/ext/fs_store.c +4 -2
- data/ext/global.h +1 -1
- data/ext/hash.c +15 -12
- data/ext/hash.h +1 -0
- data/ext/helper.c +2 -2
- data/ext/helper.h +1 -1
- data/ext/index.c +4 -2
- data/ext/index.h +2 -2
- data/ext/{mem_pool.c → mempool.c} +1 -1
- data/ext/{mem_pool.h → mempool.h} +0 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/r_analysis.c +200 -22
- data/ext/r_search.c +125 -15
- data/ext/search.c +1 -1
- data/ext/sort.c +1 -1
- data/ext/stopwords.c +2 -3
- data/lib/ferret/index.rb +2 -1
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +62 -0
- data/test/unit/index/tc_index.rb +19 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +7 -0
- metadata +9 -7
data/CHANGELOG
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
|
2
|
+
Fri Oct 13 09:18:31 JST 2006
|
3
|
+
* Changed documentation to state truthfully that FULL_ENGLISH_STOP_WORDS is
|
4
|
+
being used by default in StandardAnalyzer and StopwordFilter.
|
5
|
+
* Removed 'will', 's' and 't' from ENGLISH_STOP_WORDS so that all words in
|
6
|
+
ENGLISH_STOP_WORDS can be found in FULL_ENGLISH_STOP_WORDS, that is
|
7
|
+
ENGLISH_STOP_WORDS is a subset of FULL_ENGLISH_STOP_WORDS.
|
8
|
+
|
9
|
+
Thu Oct 12 23:04:19 JST 2006
|
10
|
+
* Fixed adding SortField to Sort object in Ruby. Garbage collection wasn't
|
11
|
+
working.
|
12
|
+
* Can now set :sort => SortField#new
|
13
|
+
|
1
14
|
Tue Oct 10 14:42:17 JST 2006
|
2
15
|
* Fixed MultiTermDocEnum bug introduced in version 0.10.10 during
|
3
16
|
performance enhancements.
|
data/Rakefile
CHANGED
@@ -41,7 +41,7 @@ SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
|
41
41
|
|
42
42
|
CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles',
|
43
43
|
'.config', 'ext/cferret.c'])
|
44
|
-
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
44
|
+
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', 'ext/mem_pool.*', 'ext/defines.h', EXT_SRC_DEST)
|
45
45
|
POLISH = Rake::FileList.new.include(FileList['**/*.so'], 'ext/Makefile')
|
46
46
|
|
47
47
|
desc "Clean specifically for the release."
|
data/ext/analysis.c
CHANGED
@@ -1078,19 +1078,10 @@ static void sf_destroy_i(TokenStream *ts)
|
|
1078
1078
|
filter_destroy_i(ts);
|
1079
1079
|
}
|
1080
1080
|
|
1081
|
-
static void sf_clone_i_i(void *key, void *value, void *arg)
|
1082
|
-
{
|
1083
|
-
HashTable *word_table = (HashTable *)arg;
|
1084
|
-
char *word = estrdup(key);
|
1085
|
-
(void)value;
|
1086
|
-
h_set(word_table, word, word);
|
1087
|
-
}
|
1088
|
-
|
1089
1081
|
static TokenStream *sf_clone_i(TokenStream *orig_ts)
|
1090
1082
|
{
|
1091
|
-
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(
|
1092
|
-
StopFilt(new_ts)->words
|
1093
|
-
h_each(StopFilt(orig_ts)->words, &sf_clone_i_i, StopFilt(new_ts)->words);
|
1083
|
+
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
|
1084
|
+
REF(StopFilt(new_ts)->words);
|
1094
1085
|
return new_ts;
|
1095
1086
|
}
|
1096
1087
|
|
@@ -1157,6 +1148,66 @@ TokenStream *stop_filter_new(TokenStream *ts)
|
|
1157
1148
|
return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
|
1158
1149
|
}
|
1159
1150
|
|
1151
|
+
/****************************************************************************
|
1152
|
+
* MappingFilter
|
1153
|
+
****************************************************************************/
|
1154
|
+
|
1155
|
+
#define MFilt(filter) ((MappingFilter *)(filter))
|
1156
|
+
|
1157
|
+
/* Destroy hook for a MappingFilter stream: passes the filter's mapper to
 * mulmap_destroy() and then runs the generic filter teardown. */
static void mf_destroy_i(TokenStream *ts)
{
    MappingFilter *mf = MFilt(ts);

    mulmap_destroy(mf->mapper);
    filter_destroy_i(ts);
}
|
1162
|
+
|
1163
|
+
/* Clone hook for a MappingFilter stream. The clone is produced by
 * filter_clone_size() and keeps pointing at the same mapper, so the mapper
 * is REF'd once more on behalf of the clone. */
static TokenStream *mf_clone_i(TokenStream *orig_ts)
{
    TokenStream *clone = filter_clone_size(orig_ts, sizeof(MappingFilter));

    REF(MFilt(clone)->mapper);
    return clone;
}
|
1169
|
+
|
1170
|
+
/* Fetch the next token from the wrapped stream and rewrite its text through
 * the filter's MultiMapper. Returns NULL when the sub-stream is exhausted.
 *
 * The mapped text is built in a stack buffer of MAX_WORD_SIZE bytes and then
 * copied back over tk->text together with its terminating NUL. */
static Token *mf_next(TokenStream *ts)
{
    char mapped[MAX_WORD_SIZE];
    MultiMapper *mapper = MFilt(ts)->mapper;
    TokenFilter *filter = TkFilt(ts);
    Token *tk = filter->sub_ts->next(filter->sub_ts);

    if (tk == NULL) {
        return tk;
    }

    tk->len = mulmap_map_len(mapper, mapped, tk->text, MAX_WORD_SIZE);
    memcpy(tk->text, mapped, tk->len + 1); /* +1 copies the NUL as well */
    return tk;
}
|
1182
|
+
|
1183
|
+
/* Reset hook: compiles the mapper if it currently has no deterministic
 * states (d_size == 0), then resets the underlying filter with the new
 * text. Returns ts. */
static TokenStream *mf_reset(TokenStream *ts, char *text)
{
    MultiMapper *mapper = MFilt(ts)->mapper;

    if (0 == mapper->d_size) {
        mulmap_compile(mapper);
    }
    filter_reset(ts, text);
    return ts;
}
|
1192
|
+
|
1193
|
+
/* Create a MappingFilter wrapping sub_ts. The filter starts with a fresh,
 * empty MultiMapper; mappings are added afterwards with
 * mapping_filter_add(). */
TokenStream *mapping_filter_new(TokenStream *sub_ts)
{
    TokenStream *ts = tf_new(MappingFilter, sub_ts);

    /* install the MappingFilter-specific stream hooks */
    ts->reset     = &mf_reset;
    ts->next      = &mf_next;
    ts->clone_i   = &mf_clone_i;
    ts->destroy_i = &mf_destroy_i;

    MFilt(ts)->mapper = mulmap_new();
    return ts;
}
|
1203
|
+
|
1204
|
+
/* Register a pattern -> replacement mapping on the filter's MultiMapper.
 * Returns ts so that calls can be chained. */
TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
                                const char *replacement)
{
    MultiMapper *mapper = MFilt(ts)->mapper;

    mulmap_add_mapping(mapper, pattern, replacement);
    return ts;
}
|
1210
|
+
|
1160
1211
|
/****************************************************************************
|
1161
1212
|
* HyphenFilter
|
1162
1213
|
****************************************************************************/
|
data/ext/analysis.h
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
|
4
4
|
#include "global.h"
|
5
5
|
#include "hash.h"
|
6
|
+
#include "multimapper.h"
|
6
7
|
#include <wchar.h>
|
7
8
|
|
8
9
|
/****************************************************************************
|
@@ -89,6 +90,12 @@ typedef struct StopFilter
|
|
89
90
|
HashTable *words;
|
90
91
|
} StopFilter;
|
91
92
|
|
93
|
+
typedef struct MappingFilter
|
94
|
+
{
|
95
|
+
TokenFilter super;
|
96
|
+
MultiMapper *mapper;
|
97
|
+
} MappingFilter;
|
98
|
+
|
92
99
|
typedef struct HyphenFilter
|
93
100
|
{
|
94
101
|
TokenFilter super;
|
@@ -150,6 +157,10 @@ extern TokenStream *stop_filter_new(TokenStream *ts);
|
|
150
157
|
extern TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
|
151
158
|
const char *charenc);
|
152
159
|
|
160
|
+
extern TokenStream *mapping_filter_new(TokenStream *ts);
|
161
|
+
extern TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
|
162
|
+
const char *replacement);
|
163
|
+
|
153
164
|
/****************************************************************************
|
154
165
|
*
|
155
166
|
* Analyzer
|
data/ext/bitvector.c
CHANGED
@@ -360,32 +360,53 @@ unsigned long bv_hash(BitVector *bv)
|
|
360
360
|
return hash;
|
361
361
|
}
|
362
362
|
|
363
|
+
/* Grow bv's word array to new_capa words if it is currently smaller. The
 * newly added words are filled with all-ones bytes when the vector
 * "extends as ones", otherwise with zeros. Does nothing when the capacity is
 * already sufficient. */
static __inline void bv_recapa(BitVector *bv, int new_capa)
{
    int fill;

    if (bv->capa >= new_capa) {
        return;
    }

    fill = bv->extends_as_ones ? 0xFF : 0;
    REALLOC_N(bv->bits, f_u32, new_capa);
    memset(bv->bits + bv->capa, fill, sizeof(f_u32) * (new_capa - bv->capa));
    bv->capa = new_capa;
}
|
372
|
+
|
363
373
|
static BitVector *bv_and_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
364
374
|
{
|
365
375
|
int i;
|
366
|
-
int
|
367
|
-
int word_size
|
376
|
+
int size;
|
377
|
+
int word_size;
|
368
378
|
int capa = 4;
|
369
|
-
while (capa < word_size) {
|
370
|
-
capa <<= 1;
|
371
|
-
}
|
372
|
-
REALLOC_N(bv->bits, f_u32, capa);
|
373
|
-
bv->capa = capa;
|
374
|
-
bv->size = min_size;
|
375
379
|
|
376
380
|
if (bv1->extends_as_ones && bv2->extends_as_ones) {
|
381
|
+
size = max2(bv1->size, bv2->size);
|
377
382
|
bv->extends_as_ones = true;
|
378
383
|
}
|
384
|
+
else if (bv1->extends_as_ones || bv2->extends_as_ones) {
|
385
|
+
size = max2(bv1->size, bv2->size);
|
386
|
+
bv->extends_as_ones = false;
|
387
|
+
}
|
379
388
|
else {
|
389
|
+
size = min2(bv1->size, bv2->size);
|
380
390
|
bv->extends_as_ones = false;
|
381
391
|
}
|
382
392
|
|
393
|
+
word_size = (size >> 5) + 1;
|
394
|
+
while (capa < word_size) {
|
395
|
+
capa <<= 1;
|
396
|
+
}
|
397
|
+
bv_recapa(bv1, capa);
|
398
|
+
bv_recapa(bv2, capa);
|
399
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
400
|
+
bv->capa = capa;
|
401
|
+
bv->size = size;
|
402
|
+
|
383
403
|
memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
|
384
404
|
sizeof(f_u32) * (capa - word_size));
|
385
405
|
|
386
406
|
for (i = 0; i < word_size; i++) {
|
387
407
|
bv->bits[i] = bv1->bits[i] & bv2->bits[i];
|
388
408
|
}
|
409
|
+
|
389
410
|
bv_recount(bv);
|
390
411
|
return bv;
|
391
412
|
}
|
@@ -400,16 +421,6 @@ BitVector *bv_and_x(BitVector *bv1, BitVector *bv2)
|
|
400
421
|
return bv_and_i(bv1, bv1, bv2);
|
401
422
|
}
|
402
423
|
|
403
|
-
static __inline void bv_recapa(BitVector *bv, int new_capa)
|
404
|
-
{
|
405
|
-
if (bv->capa < new_capa) {
|
406
|
-
REALLOC_N(bv->bits, f_u32, new_capa);
|
407
|
-
memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
|
408
|
-
sizeof(f_u32) * (new_capa - bv->capa));
|
409
|
-
bv->capa = new_capa;
|
410
|
-
}
|
411
|
-
}
|
412
|
-
|
413
424
|
static BitVector *bv_or_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
414
425
|
{
|
415
426
|
int i;
|
data/ext/{defines.h → config.h}
RENAMED
File without changes
|
data/ext/except.h
CHANGED
data/ext/extconf.rb
CHANGED
data/ext/fs_store.c
CHANGED
@@ -223,7 +223,8 @@ static off_t fs_length(Store *store, char *filename)
|
|
223
223
|
static void fso_flush_i(OutStream *os, uchar *src, int len)
|
224
224
|
{
|
225
225
|
if (len != write(os->file.fd, src, len)) {
|
226
|
-
RAISE(IO_ERROR, "flushing src of length %d", len
|
226
|
+
RAISE(IO_ERROR, "flushing src of length %d, <%s>", len,
|
227
|
+
strerror(errno));
|
227
228
|
}
|
228
229
|
}
|
229
230
|
|
@@ -268,7 +269,7 @@ static OutStream *fs_new_output(Store *store, const char *filename)
|
|
268
269
|
static void fsi_read_i(InStream *is, uchar *path, int len)
|
269
270
|
{
|
270
271
|
int fd = is->file.fd;
|
271
|
-
|
272
|
+
off_t pos = is_pos(is);
|
272
273
|
if (pos != lseek(fd, 0, SEEK_CUR)) {
|
273
274
|
lseek(fd, pos, SEEK_SET);
|
274
275
|
}
|
@@ -409,6 +410,7 @@ static HashTable stores = {
|
|
409
410
|
/* fill */ 0,
|
410
411
|
/* used */ 0,
|
411
412
|
/* mask */ HASH_MINSIZE - 1,
|
413
|
+
/* ref_cnt */ 1,
|
412
414
|
/* table */ stores.smalltable,
|
413
415
|
/* smalltable */ {{0, NULL, NULL}},
|
414
416
|
/* lookup */ (h_lookup_ft)&h_lookup_str,
|
data/ext/global.h
CHANGED
data/ext/hash.c
CHANGED
@@ -238,6 +238,7 @@ HashTable *h_new_str(free_ft free_key, free_ft free_value)
|
|
238
238
|
|
239
239
|
ht->free_key_i = free_key != NULL ? free_key : &dummy_free;
|
240
240
|
ht->free_value_i = free_value != NULL ? free_value : &dummy_free;
|
241
|
+
ht->ref_cnt = 1;
|
241
242
|
return ht;
|
242
243
|
}
|
243
244
|
|
@@ -285,23 +286,25 @@ void h_clear(HashTable *ht)
|
|
285
286
|
|
286
287
|
void h_destroy(HashTable *ht)
|
287
288
|
{
|
288
|
-
|
289
|
+
if (--(ht->ref_cnt) <= 0) {
|
290
|
+
h_clear(ht);
|
289
291
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
292
|
+
/* if a new table was created, be sure to free it */
|
293
|
+
if (ht->table != ht->smalltable) {
|
294
|
+
free(ht->table);
|
295
|
+
}
|
294
296
|
|
295
297
|
#ifdef DEBUG
|
296
|
-
free(ht);
|
297
|
-
#else
|
298
|
-
if (num_free_hts < MAX_FREE_HASH_TABLES) {
|
299
|
-
free_hts[num_free_hts++] = ht;
|
300
|
-
}
|
301
|
-
else {
|
302
298
|
free(ht);
|
303
|
-
|
299
|
+
#else
|
300
|
+
if (num_free_hts < MAX_FREE_HASH_TABLES) {
|
301
|
+
free_hts[num_free_hts++] = ht;
|
302
|
+
}
|
303
|
+
else {
|
304
|
+
free(ht);
|
305
|
+
}
|
304
306
|
#endif
|
307
|
+
}
|
305
308
|
}
|
306
309
|
|
307
310
|
void *h_get(HashTable *ht, const void *key)
|
data/ext/hash.h
CHANGED
@@ -46,6 +46,7 @@ typedef struct HashTable
|
|
46
46
|
int fill; /* num Active + num Dummy */
|
47
47
|
int size; /* num Active ie, num keys set */
|
48
48
|
int mask; /* capacity_of_table - 1 */
|
49
|
+
int ref_cnt;
|
49
50
|
|
50
51
|
/* table points to smalltable initially. If the table grows beyond 2/3 of
|
51
52
|
* HASH_MINSIZE it will point to newly malloced memory as it grows. */
|
data/ext/helper.c
CHANGED
@@ -14,13 +14,13 @@ f_i32 float2int(float f)
|
|
14
14
|
{
|
15
15
|
union { f_i32 i; float f; } tmp;
|
16
16
|
tmp.f = f;
|
17
|
-
return
|
17
|
+
return tmp.i;
|
18
18
|
}
|
19
19
|
|
20
20
|
float int2float(f_i32 i32)
|
21
21
|
{
|
22
22
|
union { f_i32 i; float f; } tmp;
|
23
|
-
tmp.i =
|
23
|
+
tmp.i = i32;
|
24
24
|
return tmp.f;
|
25
25
|
}
|
26
26
|
|
data/ext/helper.h
CHANGED
data/ext/index.c
CHANGED
@@ -5350,7 +5350,8 @@ void iw_close(IndexWriter *iw)
|
|
5350
5350
|
free(iw);
|
5351
5351
|
}
|
5352
5352
|
|
5353
|
-
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
5353
|
+
IndexWriter *iw_open(Store *store, volatile Analyzer *analyzer,
|
5354
|
+
const Config *config)
|
5354
5355
|
{
|
5355
5356
|
IndexWriter *iw = ALLOC_AND_ZERO(IndexWriter);
|
5356
5357
|
mutex_init(&iw->mutex, NULL);
|
@@ -5381,7 +5382,8 @@ IndexWriter *iw_open(Store *store, Analyzer *analyzer, const Config *config)
|
|
5381
5382
|
XENDTRY
|
5382
5383
|
|
5383
5384
|
iw->similarity = sim_create_default();
|
5384
|
-
iw->analyzer = analyzer ?
|
5385
|
+
iw->analyzer = analyzer ? (Analyzer *)analyzer
|
5386
|
+
: mb_standard_analyzer_new(true);
|
5385
5387
|
|
5386
5388
|
REF(store);
|
5387
5389
|
return iw;
|
data/ext/index.h
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
#include "hash.h"
|
8
8
|
#include "hashset.h"
|
9
9
|
#include "store.h"
|
10
|
-
#include "
|
10
|
+
#include "mempool.h"
|
11
11
|
#include "similarity.h"
|
12
12
|
#include "bitvector.h"
|
13
13
|
#include "priorityqueue.h"
|
@@ -874,7 +874,7 @@ struct IndexWriter
|
|
874
874
|
};
|
875
875
|
|
876
876
|
extern void index_create(Store *store, FieldInfos *fis);
|
877
|
-
extern IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
877
|
+
extern IndexWriter *iw_open(Store *store, volatile Analyzer *analyzer,
|
878
878
|
const Config *config);
|
879
879
|
extern void iw_delete_term(IndexWriter *iw, const char *field,
|
880
880
|
const char *term);
|
File without changes
|
data/ext/multimapper.c
ADDED
@@ -0,0 +1,310 @@
|
|
1
|
+
#include "multimapper.h"
|
2
|
+
#include "array.h"
|
3
|
+
#include "bitvector.h"
|
4
|
+
#include <string.h>
|
5
|
+
|
6
|
+
#define St(state) ((State *)(state))
|
7
|
+
#define UCtoI(val) ((int)(unsigned char)(val))
|
8
|
+
|
9
|
+
/* Destroy a state by dispatching to its own destroy_i hook. */
static void state_destroy(State *state)
{
    (*state->destroy_i)(state);
}
|
13
|
+
|
14
|
+
typedef struct LetterState
|
15
|
+
{
|
16
|
+
State super;
|
17
|
+
int c;
|
18
|
+
int val;
|
19
|
+
char *mapping;
|
20
|
+
} LetterState;
|
21
|
+
#define LSt(state) ((LetterState *)(state))
|
22
|
+
|
23
|
+
|
24
|
+
/* Transition function for a LetterState. If c is this state's character,
 * stores the single successor (self->val) in states[0] and returns 1;
 * otherwise returns 0 and leaves states untouched. */
static int lstate_next(LetterState *self, int c, int *states)
{
    if (c != self->c) {
        return 0;
    }
    states[0] = self->val;
    return 1;
}
|
34
|
+
|
35
|
+
/* Match test for a LetterState. A negative val marks a match: in that case
 * *mapping is set to the state's mapping and val (negative) is returned.
 * Otherwise returns 0 and *mapping is left untouched. */
static int lstate_is_match(LetterState *self, char **mapping)
{
    if (self->val >= 0) {
        return 0;
    }
    *mapping = self->mapping;
    return self->val;
}
|
45
|
+
|
46
|
+
static LetterState *lstate_new(int c, int val)
|
47
|
+
{
|
48
|
+
LetterState *self = ALLOC(LetterState);
|
49
|
+
self->c = c;
|
50
|
+
self->val = val;
|
51
|
+
self->mapping = NULL;
|
52
|
+
St(self)->next = (int (*)(State *, int, int *))&lstate_next;
|
53
|
+
St(self)->destroy_i = (void (*)(State *))&free;
|
54
|
+
St(self)->is_match = (int (*)(State *, char **))&lstate_is_match;
|
55
|
+
return self;
|
56
|
+
}
|
57
|
+
|
58
|
+
typedef struct NonDeterministicState
|
59
|
+
{
|
60
|
+
State super;
|
61
|
+
int *states[256];
|
62
|
+
int size[256];
|
63
|
+
int capa[256];
|
64
|
+
} NonDeterministicState;
|
65
|
+
|
66
|
+
/* Transition function for a NonDeterministicState: copies every successor
 * state index registered for character c into states and returns how many
 * were copied.
 *
 * When no successor was ever added for c, self->states[c] is still NULL
 * (the struct is zero-allocated by ndstate_new). Passing a NULL source to
 * memcpy is undefined even for a length of zero, so the copy is skipped in
 * that case. */
static int ndstate_next(NonDeterministicState *self, int c, int *states)
{
    int size = self->size[c];

    if (size > 0) {
        memcpy(states, self->states[c], size * sizeof(int));
    }
    return size;
}
|
72
|
+
|
73
|
+
/* Append state to the successor list for character c, growing the list when
 * it is full (first allocation holds 4 entries, afterwards it doubles). */
static void ndstate_add(NonDeterministicState *self, int c, int state)
{
    if (self->size[c] >= self->capa[c]) {
        self->capa[c] = (self->capa[c] == 0) ? 4 : (self->capa[c] << 1);
        REALLOC_N(self->states[c], int, self->capa[c]);
    }
    self->states[c][self->size[c]] = state;
    self->size[c]++;
}
|
86
|
+
|
87
|
+
/* Free all 256 per-character successor lists and then the state itself. */
static void ndstate_destroy_i(NonDeterministicState *self)
{
    int c = 0;

    while (c < 256) {
        free(self->states[c]);
        c++;
    }
    free(self);
}
|
95
|
+
|
96
|
+
/* A NonDeterministicState never reports a match: always returns 0 and never
 * writes to *mapping. */
static int ndstate_is_match(State *self, char **mapping)
{
    (void)mapping;
    (void)self;
    return 0;
}
|
101
|
+
|
102
|
+
static NonDeterministicState *ndstate_new()
|
103
|
+
{
|
104
|
+
NonDeterministicState *self = ALLOC_AND_ZERO(NonDeterministicState);
|
105
|
+
St(self)->next = (int (*)(State *, int, int *))&ndstate_next;
|
106
|
+
St(self)->destroy_i = (void (*)(State *))&ndstate_destroy_i;
|
107
|
+
St(self)->is_match = &ndstate_is_match;
|
108
|
+
return self;
|
109
|
+
}
|
110
|
+
|
111
|
+
MultiMapper *mulmap_new()
|
112
|
+
{
|
113
|
+
MultiMapper *self = ALLOC_AND_ZERO(MultiMapper);
|
114
|
+
self->capa = 128;
|
115
|
+
self->mappings = ALLOC_N(Mapping *, 128);
|
116
|
+
self->d_capa = 128;
|
117
|
+
self->dstates = ALLOC_N(DeterministicState *, 128);
|
118
|
+
self->dstates_map = NULL;
|
119
|
+
self->nstates = NULL;
|
120
|
+
self->ref_cnt = 1;
|
121
|
+
return self;
|
122
|
+
}
|
123
|
+
|
124
|
+
/* Free every compiled deterministic state (last to first) and reset d_size
 * to 0. The dstates array itself is kept for reuse. No-op when there are no
 * compiled states. */
static __inline void mulmap_free_dstates(MultiMapper *self)
{
    int idx = self->d_size;

    if (idx <= 0) {
        return;
    }
    while (--idx >= 0) {
        free(self->dstates[idx]);
    }
    self->d_size = 0;
}
|
134
|
+
|
135
|
+
/* Add a pattern -> rep mapping to the mapper. Both strings are duplicated.
 * A NULL or empty pattern raises ARG_ERROR. Any previously compiled
 * deterministic states are discarded, leaving d_size at 0 until the mapper
 * is compiled again. */
void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
{
    if (pattern != NULL && pattern[0] != '\0') {
        Mapping *entry = ALLOC(Mapping);

        /* double the mappings array when it is full */
        if (self->size >= self->capa) {
            self->capa <<= 1;
            REALLOC_N(self->mappings, Mapping *, self->capa);
        }
        entry->pattern = estrdup(pattern);
        entry->replacement = estrdup(rep);
        self->mappings[self->size] = entry;
        self->size++;

        mulmap_free_dstates(self);
    }
    else {
        RAISE(ARG_ERROR, "Tried to add empty pattern to multi_mapper");
    }
}
|
152
|
+
|
153
|
+
|
154
|
+
/* Set the bit for each of the cnt state indices in states, walking the
 * array from its last entry to its first. */
static __inline void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
{
    int idx = cnt;

    while (--idx >= 0) {
        bv_set(bv, states[idx]);
    }
}
|
161
|
+
|
162
|
+
/* Subset construction step: return the deterministic state that corresponds
 * to the set of NFA states whose bits are set in bv, creating it (and,
 * recursively, every state reachable from it) if it does not exist yet.
 *
 * Ownership of bv passes to this function: it is either stored as the key in
 * self->dstates_map or destroyed when an equal set was already processed. */
static DeterministicState *mulmap_process_state(MultiMapper *self, BitVector *bv)
{
    DeterministicState *current_state = h_get(self->dstates_map, bv);
    if (current_state == NULL) {
        int bit, i;
        int match_len = 0, max_match_len = 0;
        State *start = self->nstates[0];
        DeterministicState *start_ds;
        current_state = ALLOC_AND_ZERO(DeterministicState);
        /* register before recursing so that cycles find this state */
        h_set(self->dstates_map, bv, current_state);
        if (self->d_size >= self->d_capa) {
            self->d_capa <<= 1;
            REALLOC_N(self->dstates, DeterministicState *, self->d_capa);
        }
        self->dstates[self->d_size++] = current_state;
        start_ds = self->dstates[0];
        /* Default every transition to the start state. Transitions are only
         * ever looked up with a byte value (0-255) in this file, so exactly
         * 256 slots are initialised.
         * NOTE(review): this loop previously ran to i <= 256 (257 slots);
         * confirm the declared length of next[] in multimapper.h. */
        for (i = 0; i < 256; i++) {
            current_state->next[i] = start_ds;
        }
        /* record the longest match among the NFA states in this set */
        while ((bit = bv_scan_next(bv)) >= 0) {
            char *mapping;
            State *st = self->nstates[bit];
            /* is_match returns the negated match length, or 0 */
            if ((match_len = -st->is_match(st, &mapping)) > max_match_len) {
                current_state->longest_match = max_match_len = match_len;
                current_state->mapping = mapping;
                current_state->mapping_len = strlen(mapping);
            }
        }
        /* build the successor set for every character used by a pattern */
        for (i = self->a_size - 1; i >= 0; i--) {
            unsigned char c = self->alphabet[i];
            BitVector *nxt_bv = bv_new_capa(self->nsize);
            /* the NFA start state is always active */
            mulmap_bv_set_states(nxt_bv, self->next_states,
                                 start->next(start, (int)c, self->next_states));
            bv_scan_reset(bv);
            while ((bit = bv_scan_next(bv)) >= 0) {
                State *state = self->nstates[bit];
                mulmap_bv_set_states(nxt_bv, self->next_states,
                                     state->next(state, (int)c, self->next_states));
            }
            current_state->next[(int)c] = mulmap_process_state(self, nxt_bv);
        }
    }
    else {
        /* an equal state set already exists; this copy is not needed */
        bv_destroy(bv);
    }
    return current_state;
}
|
209
|
+
|
210
|
+
/* Compile the registered mappings into deterministic states.
 *
 * A temporary NFA is built first: state 0 is a NonDeterministicState with a
 * transition on the first character of every pattern, followed by one
 * LetterState per pattern character. The last LetterState of each pattern is
 * turned into a match state (val = -pattern length, c = -1) carrying the
 * replacement. The DFA is then derived with mulmap_process_state() and all
 * temporary NFA structures are released again.
 *
 * Fixes over the previous version:
 *  - the nstates array is grown until the whole pattern fits; growing only
 *    once overflowed the array for a pattern longer than the added room;
 *  - the pointers to the freed temporaries are reset to NULL so the mapper
 *    does not keep dangling references. */
void mulmap_compile(MultiMapper *self)
{
    NonDeterministicState *start = ndstate_new();
    int i, j;
    int size = 1;
    int capa = 128;
    LetterState *ls;
    State **nstates = ALLOC_N(State *, capa);
    Mapping **mappings = self->mappings;
    unsigned char alphabet[256];
    nstates[0] = (State *)start;
    memset(alphabet, 0, 256);

    for (i = self->size - 1; i >= 0; i--) {
        const char *pattern = mappings[i]->pattern;
        const int plen = (int)strlen(pattern);
        /* the first letter state of this pattern will live at index size */
        ndstate_add(start, UCtoI(pattern[0]), size);
        if (size + plen + 1 >= capa) {
            while (size + plen + 1 >= capa) {
                capa <<= 2;
            }
            REALLOC_N(nstates, State *, capa);
        }
        for (j = 0; j < plen; j++) {
            alphabet[UCtoI(pattern[j])] = 1;
            size += 1;
            /* state for pattern[j]: moves to index size on pattern[j+1] */
            nstates[size-1] = (State *)lstate_new(UCtoI(pattern[j+1]), size);
        }
        /* convert the final state of the chain into a match state */
        ls = LSt(nstates[size-1]);
        ls->mapping = mappings[i]->replacement;
        ls->val = -plen;
        ls->c = -1;
    }
    /* collect the distinct characters that occur in any pattern */
    for (i = j = 0; i < 256; i++) {
        if (alphabet[i]) self->alphabet[j++] = i;
    }
    self->a_size = j;
    mulmap_free_dstates(self);
    self->nstates = nstates;
    self->nsize = size;
    self->next_states = ALLOC_N(int, size);
    self->dstates_map = h_new((hash_ft)&bv_hash, (eq_ft)&bv_eq,
                              (free_ft)&bv_destroy, (free_ft)NULL);
    mulmap_process_state(self, bv_new_capa(0));

    /* the NFA and its helpers are only needed during compilation */
    h_destroy(self->dstates_map);
    for (i = size - 1; i >= 0; i--) {
        state_destroy(nstates[i]);
    }
    free(self->next_states);
    free(nstates);
    self->dstates_map = NULL;
    self->next_states = NULL;
    self->nstates = NULL;
}
|
259
|
+
|
260
|
+
/* Map the NUL-terminated string from into to, replacing every matched
 * pattern with its replacement. At most capa - 1 bytes are written before
 * the terminating NUL. Returns the length of the result (excluding the NUL).
 *
 * Raises STATE_ERROR when the mapper has no compiled states. The start state
 * is fetched only after that check, so a freed or never-assigned dstates[0]
 * slot is not read on an uncompiled mapper. */
int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa)
{
    DeterministicState *start;
    DeterministicState *state;
    char *s = from, *d = to, *end = to + capa - 1;
    if (self->d_size == 0) {
        RAISE(STATE_ERROR, "You forgot to compile your MultiMapper");
    }
    start = self->dstates[0];
    state = start;
    while (*s && d < end) {
        state = state->next[UCtoI(*s)];
        if (state->mapping) {
            int len = state->mapping_len;
            /* rewind over the already-copied prefix of the matched pattern */
            d -= (state->longest_match - 1);
            /* truncate the replacement if it would not fit */
            if ((d + len) > end) {
                len = end - d;
            }
            memcpy(d, state->mapping, len);
            d += len;
            state = start;
        }
        else {
            *(d++) = *s;
        }
        s++;
    }
    *d = '\0';
    return d - to;
}
|
288
|
+
|
289
|
+
/* Convenience wrapper around mulmap_map_len() that discards the length and
 * returns the destination buffer instead. */
char *mulmap_map(MultiMapper *self, char *to, char *from, int capa)
{
    (void)mulmap_map_len(self, to, from, capa);
    return to;
}
|
294
|
+
|
295
|
+
/* Drop one reference to the mapper. When the count reaches zero the compiled
 * states, every mapping (pattern, replacement and the Mapping itself), the
 * two arrays and the mapper are freed. */
void mulmap_destroy(MultiMapper *self)
{
    int idx;

    self->ref_cnt--;
    if (self->ref_cnt > 0) {
        return;
    }

    mulmap_free_dstates(self);
    idx = self->size;
    while (--idx >= 0) {
        Mapping *entry = self->mappings[idx];
        free(entry->pattern);
        free(entry->replacement);
        free(entry);
    }
    free(self->mappings);
    free(self->dstates);
    free(self);
}
|