ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/ext/lang.c
ADDED
data/ext/lang.h
CHANGED
@@ -10,39 +10,59 @@
|
|
10
10
|
#undef rename
|
11
11
|
#undef read
|
12
12
|
|
13
|
-
#define
|
14
|
-
#define
|
15
|
-
#define
|
13
|
+
#define frt_emalloc xmalloc
|
14
|
+
#define frt_ecalloc(n) xcalloc(n, 1)
|
15
|
+
#define frt_erealloc xrealloc
|
16
|
+
/* FIXME: should eventually delete this */
|
17
|
+
#define FRT_REALLOC_N REALLOC_N
|
16
18
|
|
17
19
|
|
18
20
|
#ifdef FRT_HAS_ISO_VARARGS
|
19
21
|
/* C99-compliant compiler */
|
20
22
|
|
21
|
-
# define
|
22
|
-
extern void
|
23
|
+
# define FRT_XEXIT(...) frb_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
|
24
|
+
extern void frb_rb_raise(const char *file, int line_num, const char *func,
|
23
25
|
const char *err_type, const char *fmt, ...);
|
24
26
|
|
25
|
-
# define
|
26
|
-
|
27
|
-
extern void
|
27
|
+
# define FRT_VEXIT(err_type, fmt, args) \
|
28
|
+
frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
|
29
|
+
extern void frb_vrb_raise(const char *file, int line_num, const char *func,
|
28
30
|
const char *err_type, const char *fmt, va_list args);
|
29
31
|
|
30
32
|
#elif defined(FRT_HAS_GNUC_VARARGS)
|
31
33
|
/* gcc has an extension */
|
32
34
|
|
33
|
-
# define
|
34
|
-
extern void
|
35
|
+
# define FRT_XEXIT(args...) frb_rb_raise(__FILE__, __LINE__, __func__, ##args)
|
36
|
+
extern void frb_rb_raise(const char *file, int line_num, const char *func,
|
35
37
|
const char *err_type, const char *fmt, ...);
|
36
38
|
|
37
|
-
# define
|
38
|
-
|
39
|
-
extern void
|
39
|
+
# define FRT_VEXIT(err_type, fmt, args) \
|
40
|
+
frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
|
41
|
+
extern void frb_vrb_raise(const char *file, int line_num, const char *func,
|
40
42
|
const char *err_type, const char *fmt, va_list args);
|
41
43
|
#else
|
42
44
|
/* Can't do VARARGS */
|
43
45
|
|
44
|
-
extern void
|
45
|
-
extern void
|
46
|
+
extern void FRT_XEXIT(const char *err_type, const char *fmt, ...);
|
47
|
+
extern void FRT_VEXIT(const char *err_type, const char *fmt, va_list args);
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#ifdef RUBY_RUBY_H
|
51
|
+
# define FRT_RUBY_VERSION_1_9
|
52
|
+
#endif
|
53
|
+
|
54
|
+
// ruby 1.8 compat with 1.9 to avoid ifdefs
|
55
|
+
#if !defined RSTRING_LEN
|
56
|
+
#define RSTRING_LEN(a) RSTRING(a)->len
|
57
|
+
#endif
|
58
|
+
#if !defined RSTRING_PTR
|
59
|
+
#define RSTRING_PTR(a) RSTRING(a)->ptr
|
60
|
+
#endif
|
61
|
+
#if !defined RARRAY_LEN
|
62
|
+
#define RARRAY_LEN(a) RARRAY(a)->len
|
63
|
+
#endif
|
64
|
+
#if !defined RARRAY_PTR
|
65
|
+
#define RARRAY_PTR(a) RARRAY(a)->ptr
|
46
66
|
#endif
|
47
67
|
|
48
68
|
#endif
|
data/ext/mempool.c
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#include "global.h"
|
2
2
|
#include "mempool.h"
|
3
3
|
#include <string.h>
|
4
|
+
#include "internal.h"
|
4
5
|
|
5
6
|
MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
|
6
7
|
{
|
@@ -9,7 +10,7 @@ MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
|
|
9
10
|
mp->buf_capa = init_buf_capa;
|
10
11
|
mp->buffers = ALLOC_N(char *, init_buf_capa);
|
11
12
|
|
12
|
-
mp->buffers[0] = mp->curr_buffer = emalloc(mp->chunk_size);
|
13
|
+
mp->buffers[0] = mp->curr_buffer = (char *)emalloc(mp->chunk_size);
|
13
14
|
mp->buf_alloc = 1;
|
14
15
|
mp->buf_pointer = 0;
|
15
16
|
mp->pointer = 0;
|
@@ -38,7 +39,7 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
|
|
38
39
|
mp->buf_capa <<= 1;
|
39
40
|
REALLOC_N(mp->buffers, char *, mp->buf_capa);
|
40
41
|
}
|
41
|
-
mp->buffers[mp->buf_pointer] = emalloc(mp->chunk_size);
|
42
|
+
mp->buffers[mp->buf_pointer] = (char *)emalloc(mp->chunk_size);
|
42
43
|
}
|
43
44
|
p = mp->curr_buffer = mp->buffers[mp->buf_pointer];
|
44
45
|
mp->pointer = size;
|
@@ -49,12 +50,12 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
|
|
49
50
|
char *mp_strdup(MemoryPool *mp, const char *str)
|
50
51
|
{
|
51
52
|
int len = strlen(str) + 1;
|
52
|
-
return memcpy(mp_alloc(mp, len), str, len);
|
53
|
+
return (char *)memcpy(mp_alloc(mp, len), str, len);
|
53
54
|
}
|
54
55
|
|
55
56
|
char *mp_strndup(MemoryPool *mp, const char *str, int len)
|
56
57
|
{
|
57
|
-
char *s = memcpy(mp_alloc(mp, len + 1), str, len);
|
58
|
+
char *s = (char *)memcpy(mp_alloc(mp, len + 1), str, len);
|
58
59
|
s[len] = '\0';
|
59
60
|
return s;
|
60
61
|
}
|
data/ext/mempool.h
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
#ifndef FRT_MEM_POOL_H
|
2
2
|
#define FRT_MEM_POOL_H
|
3
3
|
|
4
|
-
#
|
5
|
-
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#define FRT_MP_BUF_SIZE 65536
|
9
|
+
#define FRT_MP_INIT_CAPA 4
|
6
10
|
|
7
|
-
typedef struct
|
11
|
+
typedef struct FrtMemoryPool {
|
8
12
|
int buf_alloc;
|
9
13
|
int buf_capa;
|
10
14
|
int buf_pointer;
|
@@ -12,24 +16,28 @@ typedef struct MemoryPool {
|
|
12
16
|
int chunk_size;
|
13
17
|
char *curr_buffer;
|
14
18
|
char **buffers;
|
15
|
-
}
|
16
|
-
|
17
|
-
extern
|
18
|
-
extern
|
19
|
-
extern
|
20
|
-
extern void
|
21
|
-
extern void
|
22
|
-
extern char *
|
23
|
-
extern char *
|
24
|
-
extern void *
|
25
|
-
extern int
|
26
|
-
|
27
|
-
#define
|
28
|
-
#define
|
29
|
-
|
30
|
-
#define
|
31
|
-
(type*)memset(
|
32
|
-
#define
|
33
|
-
(type*)
|
19
|
+
} FrtMemoryPool;
|
20
|
+
|
21
|
+
extern FrtMemoryPool *frt_mp_new();
|
22
|
+
extern FrtMemoryPool *frt_mp_new_capa(int chunk_size, int init_capa);
|
23
|
+
extern FRT_INLINE void *frt_mp_alloc(FrtMemoryPool *mp, int size);
|
24
|
+
extern void frt_mp_reset(FrtMemoryPool *mp);
|
25
|
+
extern void frt_mp_destroy(FrtMemoryPool *mp);
|
26
|
+
extern char *frt_mp_strdup(FrtMemoryPool *mp, const char *str);
|
27
|
+
extern char *frt_mp_strndup(FrtMemoryPool *mp, const char *str, int len);
|
28
|
+
extern void *frt_mp_memdup(FrtMemoryPool *mp, const void *p, int len);
|
29
|
+
extern int frt_mp_used(FrtMemoryPool *mp);
|
30
|
+
|
31
|
+
/*
 * Typed allocation helpers layered on frt_mp_alloc().
 *
 *   FRT_MP_ALLOC_N(mp, type, n)          - room for n objects of `type`
 *   FRT_MP_ALLOC(mp, type)               - room for one object of `type`
 *   FRT_MP_ALLOC_AND_ZERO(mp, type)      - as FRT_MP_ALLOC, then memset to 0
 *   FRT_MP_ALLOC_AND_ZERO_N(mp, type, n) - as FRT_MP_ALLOC_N, then cleared
 *                                          through FRT_ZEROSET_N
 *
 * Each expansion is wrapped in parentheses so the cast applies to the
 * allocation result even when the macro is followed by a postfix operator
 * (e.g. FRT_MP_ALLOC(mp, T)->field); the `mp` argument is parenthesized so
 * an arbitrary expression can be passed. FRT_MP_ALLOC_AND_ZERO calls memset,
 * so <string.h> must be visible wherever it is expanded.
 */
#define FRT_MP_ALLOC_N(mp,type,n) ((type *)frt_mp_alloc((mp), sizeof(type)*(n)))
#define FRT_MP_ALLOC(mp,type) ((type *)frt_mp_alloc((mp), sizeof(type)))

#define FRT_MP_ALLOC_AND_ZERO(mp,type)\
    ((type *)memset(frt_mp_alloc((mp), sizeof(type)), 0, sizeof(type)))
#define FRT_MP_ALLOC_AND_ZERO_N(mp,type,n)\
    ((type *)FRT_ZEROSET_N(frt_mp_alloc((mp), sizeof(type)*(n)), type, n))
|
38
|
+
|
39
|
+
#ifdef __cplusplus
|
40
|
+
} // extern "C"
|
41
|
+
#endif
|
34
42
|
|
35
43
|
#endif
|
data/ext/modules.h
CHANGED
@@ -4,7 +4,8 @@
|
|
4
4
|
* Do not edit manually.
|
5
5
|
*
|
6
6
|
* Modules included by this file are: danish, dutch, english, finnish, french,
|
7
|
-
* german, italian, norwegian, porter, portuguese,
|
7
|
+
* german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
8
|
+
* russian, spanish, swedish, turkish
|
8
9
|
*/
|
9
10
|
|
10
11
|
#include "stem_ISO_8859_1_danish.h"
|
@@ -19,6 +20,8 @@
|
|
19
20
|
#include "stem_UTF_8_french.h"
|
20
21
|
#include "stem_ISO_8859_1_german.h"
|
21
22
|
#include "stem_UTF_8_german.h"
|
23
|
+
#include "stem_ISO_8859_1_hungarian.h"
|
24
|
+
#include "stem_UTF_8_hungarian.h"
|
22
25
|
#include "stem_ISO_8859_1_italian.h"
|
23
26
|
#include "stem_UTF_8_italian.h"
|
24
27
|
#include "stem_ISO_8859_1_norwegian.h"
|
@@ -27,34 +30,39 @@
|
|
27
30
|
#include "stem_UTF_8_porter.h"
|
28
31
|
#include "stem_ISO_8859_1_portuguese.h"
|
29
32
|
#include "stem_UTF_8_portuguese.h"
|
33
|
+
#include "stem_ISO_8859_2_romanian.h"
|
34
|
+
#include "stem_UTF_8_romanian.h"
|
30
35
|
#include "stem_KOI8_R_russian.h"
|
31
36
|
#include "stem_UTF_8_russian.h"
|
32
37
|
#include "stem_ISO_8859_1_spanish.h"
|
33
38
|
#include "stem_UTF_8_spanish.h"
|
34
39
|
#include "stem_ISO_8859_1_swedish.h"
|
35
40
|
#include "stem_UTF_8_swedish.h"
|
41
|
+
#include "stem_UTF_8_turkish.h"
|
36
42
|
|
37
43
|
typedef enum {
|
38
|
-
ENC_UNKNOWN,
|
44
|
+
ENC_UNKNOWN=0,
|
39
45
|
ENC_ISO_8859_1,
|
46
|
+
ENC_ISO_8859_2,
|
40
47
|
ENC_KOI8_R,
|
41
48
|
ENC_UTF_8
|
42
|
-
}
|
49
|
+
} stemmer_encoding_t;
|
43
50
|
|
44
51
|
struct stemmer_encoding {
|
45
52
|
const char * name;
|
46
|
-
|
53
|
+
stemmer_encoding_t enc;
|
47
54
|
};
|
48
55
|
static struct stemmer_encoding encodings[] = {
|
49
56
|
{"ISO_8859_1", ENC_ISO_8859_1},
|
57
|
+
{"ISO_8859_2", ENC_ISO_8859_2},
|
50
58
|
{"KOI8_R", ENC_KOI8_R},
|
51
59
|
{"UTF_8", ENC_UTF_8},
|
52
|
-
{0,
|
60
|
+
{0,ENC_UNKNOWN}
|
53
61
|
};
|
54
62
|
|
55
63
|
struct stemmer_modules {
|
56
64
|
const char * name;
|
57
|
-
|
65
|
+
stemmer_encoding_t enc;
|
58
66
|
struct SN_env * (*create)(void);
|
59
67
|
void (*close)(struct SN_env *);
|
60
68
|
int (*stem)(struct SN_env *);
|
@@ -102,6 +110,12 @@ static struct stemmer_modules modules[] = {
|
|
102
110
|
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
103
111
|
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
104
112
|
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
113
|
+
{"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
114
|
+
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
115
|
+
{"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
116
|
+
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
117
|
+
{"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
118
|
+
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
105
119
|
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
106
120
|
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
107
121
|
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
@@ -126,8 +140,16 @@ static struct stemmer_modules modules[] = {
|
|
126
140
|
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
127
141
|
{"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
128
142
|
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
143
|
+
{"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
144
|
+
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
145
|
+
{"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
146
|
+
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
147
|
+
{"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
148
|
+
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
129
149
|
{"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
130
150
|
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
151
|
+
{"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
152
|
+
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
131
153
|
{"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
132
154
|
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
133
155
|
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
@@ -142,7 +164,10 @@ static struct stemmer_modules modules[] = {
|
|
142
164
|
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
143
165
|
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
144
166
|
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
145
|
-
{
|
167
|
+
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
168
|
+
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
169
|
+
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
170
|
+
{0,ENC_UNKNOWN,0,0,0}
|
146
171
|
};
|
147
172
|
static const char * algorithm_names[] = {
|
148
173
|
"danish",
|
@@ -151,12 +176,15 @@ static const char * algorithm_names[] = {
|
|
151
176
|
"finnish",
|
152
177
|
"french",
|
153
178
|
"german",
|
179
|
+
"hungarian",
|
154
180
|
"italian",
|
155
181
|
"norwegian",
|
156
182
|
"porter",
|
157
183
|
"portuguese",
|
184
|
+
"romanian",
|
158
185
|
"russian",
|
159
186
|
"spanish",
|
160
187
|
"swedish",
|
188
|
+
"turkish",
|
161
189
|
0
|
162
190
|
};
|
data/ext/multimapper.c
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
#include "array.h"
|
3
3
|
#include "bitvector.h"
|
4
4
|
#include <string.h>
|
5
|
+
#include "internal.h"
|
5
6
|
|
6
7
|
#define St(state) ((State *)(state))
|
7
8
|
#define UCtoI(val) ((int)(unsigned char)(val))
|
@@ -161,7 +162,8 @@ static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
|
|
161
162
|
|
162
163
|
static DeterministicState *mulmap_process_state(MultiMapper *self, BitVector *bv)
|
163
164
|
{
|
164
|
-
DeterministicState *current_state
|
165
|
+
DeterministicState *current_state
|
166
|
+
= (DeterministicState *)h_get(self->dstates_map, bv);
|
165
167
|
if (current_state == NULL) {
|
166
168
|
int bit, i;
|
167
169
|
int match_len = 0, max_match_len = 0;
|
@@ -263,7 +265,7 @@ int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa)
|
|
263
265
|
DeterministicState *state = start;
|
264
266
|
char *s = from, *d = to, *end = to + capa - 1;
|
265
267
|
if (self->d_size == 0) {
|
266
|
-
|
268
|
+
mulmap_compile(self);
|
267
269
|
}
|
268
270
|
while (*s && d < end) {
|
269
271
|
state = state->next[UCtoI(*s)];
|
@@ -292,6 +294,45 @@ char *mulmap_map(MultiMapper *self, char *to, char *from, int capa)
|
|
292
294
|
return to;
|
293
295
|
}
|
294
296
|
|
297
|
+
/* Maps a string to a dynamically allocated string */
|
298
|
+
char *mulmap_dynamic_map(MultiMapper *self, char *from)
|
299
|
+
{
|
300
|
+
DeterministicState *start = self->dstates[0];
|
301
|
+
DeterministicState *state = start;
|
302
|
+
int capa = strlen(from);
|
303
|
+
char *to = (char *)ecalloc(capa);
|
304
|
+
char *s = from, *d = to, *end = to + capa - 1;
|
305
|
+
if (self->d_size == 0) {
|
306
|
+
mulmap_compile(self);
|
307
|
+
}
|
308
|
+
do {
|
309
|
+
while (*s && d < end) {
|
310
|
+
state = state->next[UCtoI(*s)];
|
311
|
+
if (state->mapping) {
|
312
|
+
int len = state->mapping_len;
|
313
|
+
d -= (state->longest_match - 1);
|
314
|
+
if ((d + len) > end) {
|
315
|
+
len = end - d;
|
316
|
+
}
|
317
|
+
memcpy(d, state->mapping, len);
|
318
|
+
d += len;
|
319
|
+
state = start;
|
320
|
+
}
|
321
|
+
else {
|
322
|
+
*(d++) = *s;
|
323
|
+
}
|
324
|
+
s++;
|
325
|
+
}
|
326
|
+
if (*s) {
|
327
|
+
capa += 1024;
|
328
|
+
erealloc(to, capa);
|
329
|
+
end = to + capa - 1;
|
330
|
+
}
|
331
|
+
} while(*s);
|
332
|
+
*d = '\0';
|
333
|
+
return to;
|
334
|
+
}
|
335
|
+
|
295
336
|
void mulmap_destroy(MultiMapper *self)
|
296
337
|
{
|
297
338
|
if (--(self->ref_cnt) <= 0) {
|
data/ext/multimapper.h
CHANGED
@@ -1,51 +1,60 @@
|
|
1
1
|
#ifndef FRT_MAPPER_H
|
2
2
|
#define FRT_MAPPER_H
|
3
3
|
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
4
8
|
#include "hash.h"
|
5
9
|
|
6
|
-
typedef struct
|
10
|
+
typedef struct FrtState
|
7
11
|
{
|
8
|
-
int (*next)(struct
|
9
|
-
void (*destroy_i)(struct
|
10
|
-
int (*is_match)(struct
|
11
|
-
}
|
12
|
+
int (*next)(struct FrtState *self, int c, int *states);
|
13
|
+
void (*destroy_i)(struct FrtState *self);
|
14
|
+
int (*is_match)(struct FrtState *self, char **mapping);
|
15
|
+
} FrtState;
|
12
16
|
|
13
|
-
typedef struct
|
17
|
+
typedef struct FrtDeterministicState
|
14
18
|
{
|
15
|
-
struct
|
19
|
+
struct FrtDeterministicState *next[256];
|
16
20
|
int longest_match;
|
17
21
|
char *mapping;
|
18
22
|
int mapping_len;
|
19
|
-
}
|
23
|
+
} FrtDeterministicState;
|
20
24
|
|
21
|
-
typedef struct
|
25
|
+
typedef struct FrtMapping
|
22
26
|
{
|
23
27
|
char *pattern;
|
24
28
|
char *replacement;
|
25
|
-
}
|
29
|
+
} FrtMapping;
|
26
30
|
|
27
|
-
typedef struct
|
31
|
+
typedef struct FrtMultiMapper
|
28
32
|
{
|
29
|
-
|
33
|
+
FrtMapping **mappings;
|
30
34
|
int size;
|
31
35
|
int capa;
|
32
|
-
|
36
|
+
FrtDeterministicState **dstates;
|
33
37
|
int d_size;
|
34
38
|
int d_capa;
|
35
39
|
unsigned char alphabet[256];
|
36
40
|
int a_size;
|
37
|
-
|
38
|
-
|
41
|
+
FrtHash *dstates_map;
|
42
|
+
FrtState **nstates;
|
39
43
|
int nsize;
|
40
44
|
int *next_states;
|
41
45
|
int ref_cnt;
|
42
|
-
}
|
43
|
-
|
44
|
-
extern
|
45
|
-
extern void
|
46
|
-
extern void
|
47
|
-
extern char *
|
48
|
-
extern
|
49
|
-
extern
|
46
|
+
} FrtMultiMapper;
|
47
|
+
|
48
|
+
extern FrtMultiMapper *frt_mulmap_new();
|
49
|
+
extern void frt_mulmap_add_mapping(FrtMultiMapper *self, const char *p, const char *r);
|
50
|
+
extern void frt_mulmap_compile(FrtMultiMapper *self);
|
51
|
+
extern char *frt_mulmap_map(FrtMultiMapper *self, char *to, char *from, int capa);
|
52
|
+
extern char *frt_mulmap_dynamic_map(FrtMultiMapper *self, char *from);
|
53
|
+
extern int frt_mulmap_map_len(FrtMultiMapper *self, char *to, char *from, int capa);
|
54
|
+
extern void frt_mulmap_destroy(FrtMultiMapper *self);
|
55
|
+
|
56
|
+
#ifdef __cplusplus
|
57
|
+
} // extern "C"
|
58
|
+
#endif
|
50
59
|
|
51
60
|
#endif
|