ferret 0.11.6 → 0.11.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/ext/lang.c
ADDED
data/ext/lang.h
CHANGED
@@ -10,39 +10,59 @@
|
|
10
10
|
#undef rename
|
11
11
|
#undef read
|
12
12
|
|
13
|
-
#define
|
14
|
-
#define
|
15
|
-
#define
|
13
|
+
#define frt_emalloc xmalloc
|
14
|
+
#define frt_ecalloc(n) xcalloc(n, 1)
|
15
|
+
#define frt_erealloc xrealloc
|
16
|
+
/* FIXME: should eventually delete this */
|
17
|
+
#define FRT_REALLOC_N REALLOC_N
|
16
18
|
|
17
19
|
|
18
20
|
#ifdef FRT_HAS_ISO_VARARGS
|
19
21
|
/* C99-compliant compiler */
|
20
22
|
|
21
|
-
# define
|
22
|
-
extern void
|
23
|
+
# define FRT_XEXIT(...) frb_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
|
24
|
+
extern void frb_rb_raise(const char *file, int line_num, const char *func,
|
23
25
|
const char *err_type, const char *fmt, ...);
|
24
26
|
|
25
|
-
# define
|
26
|
-
|
27
|
-
extern void
|
27
|
+
# define FRT_VEXIT(err_type, fmt, args) \
|
28
|
+
frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
|
29
|
+
extern void frb_vrb_raise(const char *file, int line_num, const char *func,
|
28
30
|
const char *err_type, const char *fmt, va_list args);
|
29
31
|
|
30
32
|
#elif defined(FRT_HAS_GNUC_VARARGS)
|
31
33
|
/* gcc has an extension */
|
32
34
|
|
33
|
-
# define
|
34
|
-
extern void
|
35
|
+
# define FRT_XEXIT(args...) frb_rb_raise(__FILE__, __LINE__, __func__, ##args)
|
36
|
+
extern void frb_rb_raise(const char *file, int line_num, const char *func,
|
35
37
|
const char *err_type, const char *fmt, ...);
|
36
38
|
|
37
|
-
# define
|
38
|
-
|
39
|
-
extern void
|
39
|
+
# define FRT_VEXIT(err_type, fmt, args) \
|
40
|
+
frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
|
41
|
+
extern void frb_vrb_raise(const char *file, int line_num, const char *func,
|
40
42
|
const char *err_type, const char *fmt, va_list args);
|
41
43
|
#else
|
42
44
|
/* Can't do VARARGS */
|
43
45
|
|
44
|
-
extern void
|
45
|
-
extern void
|
46
|
+
extern void FRT_XEXIT(const char *err_type, const char *fmt, ...);
|
47
|
+
extern void FRT_VEXIT(const char *err_type, const char *fmt, va_list args);
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#ifdef RUBY_RUBY_H
|
51
|
+
# define FRT_RUBY_VERSION_1_9
|
52
|
+
#endif
|
53
|
+
|
54
|
+
// ruby 1.8 compat with 1.9 to avoid ifdefs
|
55
|
+
#if !defined RSTRING_LEN
|
56
|
+
#define RSTRING_LEN(a) RSTRING(a)->len
|
57
|
+
#endif
|
58
|
+
#if !defined RSTRING_PTR
|
59
|
+
#define RSTRING_PTR(a) RSTRING(a)->ptr
|
60
|
+
#endif
|
61
|
+
#if !defined RARRAY_LEN
|
62
|
+
#define RARRAY_LEN(a) RARRAY(a)->len
|
63
|
+
#endif
|
64
|
+
#if !defined RARRAY_PTR
|
65
|
+
#define RARRAY_PTR(a) RARRAY(a)->ptr
|
46
66
|
#endif
|
47
67
|
|
48
68
|
#endif
|
data/ext/mempool.c
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#include "global.h"
|
2
2
|
#include "mempool.h"
|
3
3
|
#include <string.h>
|
4
|
+
#include "internal.h"
|
4
5
|
|
5
6
|
MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
|
6
7
|
{
|
@@ -9,7 +10,7 @@ MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
|
|
9
10
|
mp->buf_capa = init_buf_capa;
|
10
11
|
mp->buffers = ALLOC_N(char *, init_buf_capa);
|
11
12
|
|
12
|
-
mp->buffers[0] = mp->curr_buffer = emalloc(mp->chunk_size);
|
13
|
+
mp->buffers[0] = mp->curr_buffer = (char *)emalloc(mp->chunk_size);
|
13
14
|
mp->buf_alloc = 1;
|
14
15
|
mp->buf_pointer = 0;
|
15
16
|
mp->pointer = 0;
|
@@ -38,7 +39,7 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
|
|
38
39
|
mp->buf_capa <<= 1;
|
39
40
|
REALLOC_N(mp->buffers, char *, mp->buf_capa);
|
40
41
|
}
|
41
|
-
mp->buffers[mp->buf_pointer] = emalloc(mp->chunk_size);
|
42
|
+
mp->buffers[mp->buf_pointer] = (char *)emalloc(mp->chunk_size);
|
42
43
|
}
|
43
44
|
p = mp->curr_buffer = mp->buffers[mp->buf_pointer];
|
44
45
|
mp->pointer = size;
|
@@ -49,12 +50,12 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
|
|
49
50
|
char *mp_strdup(MemoryPool *mp, const char *str)
|
50
51
|
{
|
51
52
|
int len = strlen(str) + 1;
|
52
|
-
return memcpy(mp_alloc(mp, len), str, len);
|
53
|
+
return (char *)memcpy(mp_alloc(mp, len), str, len);
|
53
54
|
}
|
54
55
|
|
55
56
|
char *mp_strndup(MemoryPool *mp, const char *str, int len)
|
56
57
|
{
|
57
|
-
char *s = memcpy(mp_alloc(mp, len + 1), str, len);
|
58
|
+
char *s = (char *)memcpy(mp_alloc(mp, len + 1), str, len);
|
58
59
|
s[len] = '\0';
|
59
60
|
return s;
|
60
61
|
}
|
data/ext/mempool.h
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
#ifndef FRT_MEM_POOL_H
|
2
2
|
#define FRT_MEM_POOL_H
|
3
3
|
|
4
|
-
#
|
5
|
-
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#define FRT_MP_BUF_SIZE 65536
|
9
|
+
#define FRT_MP_INIT_CAPA 4
|
6
10
|
|
7
|
-
typedef struct
|
11
|
+
typedef struct FrtMemoryPool {
|
8
12
|
int buf_alloc;
|
9
13
|
int buf_capa;
|
10
14
|
int buf_pointer;
|
@@ -12,24 +16,28 @@ typedef struct MemoryPool {
|
|
12
16
|
int chunk_size;
|
13
17
|
char *curr_buffer;
|
14
18
|
char **buffers;
|
15
|
-
}
|
16
|
-
|
17
|
-
extern
|
18
|
-
extern
|
19
|
-
extern
|
20
|
-
extern void
|
21
|
-
extern void
|
22
|
-
extern char *
|
23
|
-
extern char *
|
24
|
-
extern void *
|
25
|
-
extern int
|
26
|
-
|
27
|
-
#define
|
28
|
-
#define
|
29
|
-
|
30
|
-
#define
|
31
|
-
(type*)memset(
|
32
|
-
#define
|
33
|
-
(type*)
|
19
|
+
} FrtMemoryPool;
|
20
|
+
|
21
|
+
extern FrtMemoryPool *frt_mp_new();
|
22
|
+
extern FrtMemoryPool *frt_mp_new_capa(int chunk_size, int init_capa);
|
23
|
+
extern FRT_INLINE void *frt_mp_alloc(FrtMemoryPool *mp, int size);
|
24
|
+
extern void frt_mp_reset(FrtMemoryPool *mp);
|
25
|
+
extern void frt_mp_destroy(FrtMemoryPool *mp);
|
26
|
+
extern char *frt_mp_strdup(FrtMemoryPool *mp, const char *str);
|
27
|
+
extern char *frt_mp_strndup(FrtMemoryPool *mp, const char *str, int len);
|
28
|
+
extern void *frt_mp_memdup(FrtMemoryPool *mp, const void *p, int len);
|
29
|
+
extern int frt_mp_used(FrtMemoryPool *mp);
|
30
|
+
|
31
|
+
#define FRT_MP_ALLOC_N(mp,type,n) (type *)frt_mp_alloc(mp, sizeof(type)*(n))
|
32
|
+
#define FRT_MP_ALLOC(mp,type) (type *)frt_mp_alloc(mp, sizeof(type))
|
33
|
+
|
34
|
+
#define FRT_MP_ALLOC_AND_ZERO(mp,type)\
|
35
|
+
(type*)memset(frt_mp_alloc(mp, sizeof(type)), 0, sizeof(type))
|
36
|
+
#define FRT_MP_ALLOC_AND_ZERO_N(mp,type,n)\
|
37
|
+
(type*)FRT_ZEROSET_N(frt_mp_alloc(mp, sizeof(type)*(n)), type, n)
|
38
|
+
|
39
|
+
#ifdef __cplusplus
|
40
|
+
} // extern "C"
|
41
|
+
#endif
|
34
42
|
|
35
43
|
#endif
|
data/ext/modules.h
CHANGED
@@ -4,7 +4,8 @@
|
|
4
4
|
* Do not edit manually.
|
5
5
|
*
|
6
6
|
* Modules included by this file are: danish, dutch, english, finnish, french,
|
7
|
-
* german, italian, norwegian, porter, portuguese,
|
7
|
+
* german, hungarian, italian, norwegian, porter, portuguese, romanian,
|
8
|
+
* russian, spanish, swedish, turkish
|
8
9
|
*/
|
9
10
|
|
10
11
|
#include "stem_ISO_8859_1_danish.h"
|
@@ -19,6 +20,8 @@
|
|
19
20
|
#include "stem_UTF_8_french.h"
|
20
21
|
#include "stem_ISO_8859_1_german.h"
|
21
22
|
#include "stem_UTF_8_german.h"
|
23
|
+
#include "stem_ISO_8859_1_hungarian.h"
|
24
|
+
#include "stem_UTF_8_hungarian.h"
|
22
25
|
#include "stem_ISO_8859_1_italian.h"
|
23
26
|
#include "stem_UTF_8_italian.h"
|
24
27
|
#include "stem_ISO_8859_1_norwegian.h"
|
@@ -27,34 +30,39 @@
|
|
27
30
|
#include "stem_UTF_8_porter.h"
|
28
31
|
#include "stem_ISO_8859_1_portuguese.h"
|
29
32
|
#include "stem_UTF_8_portuguese.h"
|
33
|
+
#include "stem_ISO_8859_2_romanian.h"
|
34
|
+
#include "stem_UTF_8_romanian.h"
|
30
35
|
#include "stem_KOI8_R_russian.h"
|
31
36
|
#include "stem_UTF_8_russian.h"
|
32
37
|
#include "stem_ISO_8859_1_spanish.h"
|
33
38
|
#include "stem_UTF_8_spanish.h"
|
34
39
|
#include "stem_ISO_8859_1_swedish.h"
|
35
40
|
#include "stem_UTF_8_swedish.h"
|
41
|
+
#include "stem_UTF_8_turkish.h"
|
36
42
|
|
37
43
|
typedef enum {
|
38
|
-
ENC_UNKNOWN,
|
44
|
+
ENC_UNKNOWN=0,
|
39
45
|
ENC_ISO_8859_1,
|
46
|
+
ENC_ISO_8859_2,
|
40
47
|
ENC_KOI8_R,
|
41
48
|
ENC_UTF_8
|
42
|
-
}
|
49
|
+
} stemmer_encoding_t;
|
43
50
|
|
44
51
|
struct stemmer_encoding {
|
45
52
|
const char * name;
|
46
|
-
|
53
|
+
stemmer_encoding_t enc;
|
47
54
|
};
|
48
55
|
static struct stemmer_encoding encodings[] = {
|
49
56
|
{"ISO_8859_1", ENC_ISO_8859_1},
|
57
|
+
{"ISO_8859_2", ENC_ISO_8859_2},
|
50
58
|
{"KOI8_R", ENC_KOI8_R},
|
51
59
|
{"UTF_8", ENC_UTF_8},
|
52
|
-
{0,
|
60
|
+
{0,ENC_UNKNOWN}
|
53
61
|
};
|
54
62
|
|
55
63
|
struct stemmer_modules {
|
56
64
|
const char * name;
|
57
|
-
|
65
|
+
stemmer_encoding_t enc;
|
58
66
|
struct SN_env * (*create)(void);
|
59
67
|
void (*close)(struct SN_env *);
|
60
68
|
int (*stem)(struct SN_env *);
|
@@ -102,6 +110,12 @@ static struct stemmer_modules modules[] = {
|
|
102
110
|
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
103
111
|
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
104
112
|
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
113
|
+
{"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
114
|
+
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
115
|
+
{"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
116
|
+
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
117
|
+
{"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
|
118
|
+
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
105
119
|
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
106
120
|
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
107
121
|
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
@@ -126,8 +140,16 @@ static struct stemmer_modules modules[] = {
|
|
126
140
|
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
127
141
|
{"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
128
142
|
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
143
|
+
{"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
144
|
+
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
145
|
+
{"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
146
|
+
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
147
|
+
{"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
148
|
+
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
129
149
|
{"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
130
150
|
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
151
|
+
{"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
152
|
+
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
131
153
|
{"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
132
154
|
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
133
155
|
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
@@ -142,7 +164,10 @@ static struct stemmer_modules modules[] = {
|
|
142
164
|
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
143
165
|
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
144
166
|
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
145
|
-
{
|
167
|
+
{"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
168
|
+
{"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
169
|
+
{"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
|
170
|
+
{0,ENC_UNKNOWN,0,0,0}
|
146
171
|
};
|
147
172
|
static const char * algorithm_names[] = {
|
148
173
|
"danish",
|
@@ -151,12 +176,15 @@ static const char * algorithm_names[] = {
|
|
151
176
|
"finnish",
|
152
177
|
"french",
|
153
178
|
"german",
|
179
|
+
"hungarian",
|
154
180
|
"italian",
|
155
181
|
"norwegian",
|
156
182
|
"porter",
|
157
183
|
"portuguese",
|
184
|
+
"romanian",
|
158
185
|
"russian",
|
159
186
|
"spanish",
|
160
187
|
"swedish",
|
188
|
+
"turkish",
|
161
189
|
0
|
162
190
|
};
|
data/ext/multimapper.c
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
#include "array.h"
|
3
3
|
#include "bitvector.h"
|
4
4
|
#include <string.h>
|
5
|
+
#include "internal.h"
|
5
6
|
|
6
7
|
#define St(state) ((State *)(state))
|
7
8
|
#define UCtoI(val) ((int)(unsigned char)(val))
|
@@ -161,7 +162,8 @@ static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
|
|
161
162
|
|
162
163
|
static DeterministicState *mulmap_process_state(MultiMapper *self, BitVector *bv)
|
163
164
|
{
|
164
|
-
DeterministicState *current_state
|
165
|
+
DeterministicState *current_state
|
166
|
+
= (DeterministicState *)h_get(self->dstates_map, bv);
|
165
167
|
if (current_state == NULL) {
|
166
168
|
int bit, i;
|
167
169
|
int match_len = 0, max_match_len = 0;
|
@@ -263,7 +265,7 @@ int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa)
|
|
263
265
|
DeterministicState *state = start;
|
264
266
|
char *s = from, *d = to, *end = to + capa - 1;
|
265
267
|
if (self->d_size == 0) {
|
266
|
-
|
268
|
+
mulmap_compile(self);
|
267
269
|
}
|
268
270
|
while (*s && d < end) {
|
269
271
|
state = state->next[UCtoI(*s)];
|
@@ -292,6 +294,45 @@ char *mulmap_map(MultiMapper *self, char *to, char *from, int capa)
|
|
292
294
|
return to;
|
293
295
|
}
|
294
296
|
|
297
|
+
/* Maps a string to a dynamically allocated string */
|
298
|
+
char *mulmap_dynamic_map(MultiMapper *self, char *from)
|
299
|
+
{
|
300
|
+
DeterministicState *start = self->dstates[0];
|
301
|
+
DeterministicState *state = start;
|
302
|
+
int capa = strlen(from);
|
303
|
+
char *to = (char *)ecalloc(capa);
|
304
|
+
char *s = from, *d = to, *end = to + capa - 1;
|
305
|
+
if (self->d_size == 0) {
|
306
|
+
mulmap_compile(self);
|
307
|
+
}
|
308
|
+
do {
|
309
|
+
while (*s && d < end) {
|
310
|
+
state = state->next[UCtoI(*s)];
|
311
|
+
if (state->mapping) {
|
312
|
+
int len = state->mapping_len;
|
313
|
+
d -= (state->longest_match - 1);
|
314
|
+
if ((d + len) > end) {
|
315
|
+
len = end - d;
|
316
|
+
}
|
317
|
+
memcpy(d, state->mapping, len);
|
318
|
+
d += len;
|
319
|
+
state = start;
|
320
|
+
}
|
321
|
+
else {
|
322
|
+
*(d++) = *s;
|
323
|
+
}
|
324
|
+
s++;
|
325
|
+
}
|
326
|
+
if (*s) {
|
327
|
+
capa += 1024;
|
328
|
+
erealloc(to, capa);
|
329
|
+
end = to + capa - 1;
|
330
|
+
}
|
331
|
+
} while(*s);
|
332
|
+
*d = '\0';
|
333
|
+
return to;
|
334
|
+
}
|
335
|
+
|
295
336
|
void mulmap_destroy(MultiMapper *self)
|
296
337
|
{
|
297
338
|
if (--(self->ref_cnt) <= 0) {
|
data/ext/multimapper.h
CHANGED
@@ -1,51 +1,60 @@
|
|
1
1
|
#ifndef FRT_MAPPER_H
|
2
2
|
#define FRT_MAPPER_H
|
3
3
|
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
4
8
|
#include "hash.h"
|
5
9
|
|
6
|
-
typedef struct
|
10
|
+
typedef struct FrtState
|
7
11
|
{
|
8
|
-
int (*next)(struct
|
9
|
-
void (*destroy_i)(struct
|
10
|
-
int (*is_match)(struct
|
11
|
-
}
|
12
|
+
int (*next)(struct FrtState *self, int c, int *states);
|
13
|
+
void (*destroy_i)(struct FrtState *self);
|
14
|
+
int (*is_match)(struct FrtState *self, char **mapping);
|
15
|
+
} FrtState;
|
12
16
|
|
13
|
-
typedef struct
|
17
|
+
typedef struct FrtDeterministicState
|
14
18
|
{
|
15
|
-
struct
|
19
|
+
struct FrtDeterministicState *next[256];
|
16
20
|
int longest_match;
|
17
21
|
char *mapping;
|
18
22
|
int mapping_len;
|
19
|
-
}
|
23
|
+
} FrtDeterministicState;
|
20
24
|
|
21
|
-
typedef struct
|
25
|
+
typedef struct FrtMapping
|
22
26
|
{
|
23
27
|
char *pattern;
|
24
28
|
char *replacement;
|
25
|
-
}
|
29
|
+
} FrtMapping;
|
26
30
|
|
27
|
-
typedef struct
|
31
|
+
typedef struct FrtMultiMapper
|
28
32
|
{
|
29
|
-
|
33
|
+
FrtMapping **mappings;
|
30
34
|
int size;
|
31
35
|
int capa;
|
32
|
-
|
36
|
+
FrtDeterministicState **dstates;
|
33
37
|
int d_size;
|
34
38
|
int d_capa;
|
35
39
|
unsigned char alphabet[256];
|
36
40
|
int a_size;
|
37
|
-
|
38
|
-
|
41
|
+
FrtHash *dstates_map;
|
42
|
+
FrtState **nstates;
|
39
43
|
int nsize;
|
40
44
|
int *next_states;
|
41
45
|
int ref_cnt;
|
42
|
-
}
|
43
|
-
|
44
|
-
extern
|
45
|
-
extern void
|
46
|
-
extern void
|
47
|
-
extern char *
|
48
|
-
extern
|
49
|
-
extern
|
46
|
+
} FrtMultiMapper;
|
47
|
+
|
48
|
+
extern FrtMultiMapper *frt_mulmap_new();
|
49
|
+
extern void frt_mulmap_add_mapping(FrtMultiMapper *self, const char *p, const char *r);
|
50
|
+
extern void frt_mulmap_compile(FrtMultiMapper *self);
|
51
|
+
extern char *frt_mulmap_map(FrtMultiMapper *self, char *to, char *from, int capa);
|
52
|
+
extern char *frt_mulmap_dynamic_map(FrtMultiMapper *self, char *from);
|
53
|
+
extern int frt_mulmap_map_len(FrtMultiMapper *self, char *to, char *from, int capa);
|
54
|
+
extern void frt_mulmap_destroy(FrtMultiMapper *self);
|
55
|
+
|
56
|
+
#ifdef __cplusplus
|
57
|
+
} // extern "C"
|
58
|
+
#endif
|
50
59
|
|
51
60
|
#endif
|