ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -0,0 +1,10 @@
1
+ #include "lang.h"
2
+ #include "internal.h"
3
+
4
+
5
+ struct timeval rb_time_interval _((VALUE));
6
+ extern void micro_sleep(const int micro_seconds) /* Waits for micro_seconds microseconds via rb_thread_wait_for. NOTE(review): presumably other Ruby threads keep running meanwhile - confirm against the Ruby C API. */
7
+ {
8
+ rb_thread_wait_for(rb_time_interval(rb_float_new((double)micro_seconds/1000000.0))); /* scale microseconds to seconds, wrap as a Ruby float, convert to the struct timeval that rb_time_interval is declared to return */
9
+ }
10
+
data/ext/lang.h CHANGED
@@ -10,39 +10,59 @@
10
10
  #undef rename
11
11
  #undef read
12
12
 
13
- #define frt_malloc xmalloc
14
- #define frt_calloc(n) xcalloc(n, 1)
15
- #define frt_realloc xrealloc
13
+ #define frt_emalloc xmalloc
14
+ #define frt_ecalloc(n) xcalloc(n, 1)
15
+ #define frt_erealloc xrealloc
16
+ /* FIXME: should eventually delete this */
17
+ #define FRT_REALLOC_N REALLOC_N
16
18
 
17
19
 
18
20
  #ifdef FRT_HAS_ISO_VARARGS
19
21
  /* C99-compliant compiler */
20
22
 
21
- # define FRT_EXIT(...) frt_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
22
- extern void frt_rb_raise(const char *file, int line_num, const char *func,
23
+ # define FRT_XEXIT(...) frb_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
24
+ extern void frb_rb_raise(const char *file, int line_num, const char *func,
23
25
  const char *err_type, const char *fmt, ...);
24
26
 
25
- # define V_FRT_EXIT(err_type, fmt, args) \
26
- vfrt_rb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
27
- extern void vfrt_rb_raise(const char *file, int line_num, const char *func,
27
+ # define FRT_VEXIT(err_type, fmt, args) \
28
+ frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
29
+ extern void frb_vrb_raise(const char *file, int line_num, const char *func,
28
30
  const char *err_type, const char *fmt, va_list args);
29
31
 
30
32
  #elif defined(FRT_HAS_GNUC_VARARGS)
31
33
  /* gcc has an extension */
32
34
 
33
- # define FRT_EXIT(args...) frt_rb_raise(__FILE__, __LINE__, __func__, ##args)
34
- extern void frt_rb_raise(const char *file, int line_num, const char *func,
35
+ # define FRT_XEXIT(args...) frb_rb_raise(__FILE__, __LINE__, __func__, ##args)
36
+ extern void frb_rb_raise(const char *file, int line_num, const char *func,
35
37
  const char *err_type, const char *fmt, ...);
36
38
 
37
- # define V_FRT_EXIT(err_type, fmt, args) \
38
- vfrt_rb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
39
- extern void vfrt_rb_raise(const char *file, int line_num, const char *func,
39
+ # define FRT_VEXIT(err_type, fmt, args) \
40
+ frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
41
+ extern void frb_vrb_raise(const char *file, int line_num, const char *func,
40
42
  const char *err_type, const char *fmt, va_list args);
41
43
  #else
42
44
  /* Can't do VARARGS */
43
45
 
44
- extern void FRT_EXIT(const char *err_type, const char *fmt, ...);
45
- extern void V_FRT_EXIT(const char *err_type, const char *fmt, va_list args);
46
+ extern void FRT_XEXIT(const char *err_type, const char *fmt, ...);
47
+ extern void FRT_VEXIT(const char *err_type, const char *fmt, va_list args);
48
+ #endif
49
+
50
+ #ifdef RUBY_RUBY_H
51
+ # define FRT_RUBY_VERSION_1_9
52
+ #endif
53
+
54
+ // Ruby 1.8 compatibility: define the Ruby 1.9 accessor macros when they are missing, to avoid #ifdefs elsewhere
55
+ #if !defined RSTRING_LEN
56
+ #define RSTRING_LEN(a) RSTRING(a)->len
57
+ #endif
58
+ #if !defined RSTRING_PTR
59
+ #define RSTRING_PTR(a) RSTRING(a)->ptr
60
+ #endif
61
+ #if !defined RARRAY_LEN
62
+ #define RARRAY_LEN(a) RARRAY(a)->len
63
+ #endif
64
+ #if !defined RARRAY_PTR
65
+ #define RARRAY_PTR(a) RARRAY(a)->ptr
46
66
  #endif
47
67
 
48
68
  #endif
@@ -1,6 +1,7 @@
1
1
  #include "global.h"
2
2
  #include "mempool.h"
3
3
  #include <string.h>
4
+ #include "internal.h"
4
5
 
5
6
  MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
6
7
  {
@@ -9,7 +10,7 @@ MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
9
10
  mp->buf_capa = init_buf_capa;
10
11
  mp->buffers = ALLOC_N(char *, init_buf_capa);
11
12
 
12
- mp->buffers[0] = mp->curr_buffer = emalloc(mp->chunk_size);
13
+ mp->buffers[0] = mp->curr_buffer = (char *)emalloc(mp->chunk_size);
13
14
  mp->buf_alloc = 1;
14
15
  mp->buf_pointer = 0;
15
16
  mp->pointer = 0;
@@ -38,7 +39,7 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
38
39
  mp->buf_capa <<= 1;
39
40
  REALLOC_N(mp->buffers, char *, mp->buf_capa);
40
41
  }
41
- mp->buffers[mp->buf_pointer] = emalloc(mp->chunk_size);
42
+ mp->buffers[mp->buf_pointer] = (char *)emalloc(mp->chunk_size);
42
43
  }
43
44
  p = mp->curr_buffer = mp->buffers[mp->buf_pointer];
44
45
  mp->pointer = size;
@@ -49,12 +50,12 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
49
50
  char *mp_strdup(MemoryPool *mp, const char *str)
50
51
  {
51
52
  int len = strlen(str) + 1;
52
- return memcpy(mp_alloc(mp, len), str, len);
53
+ return (char *)memcpy(mp_alloc(mp, len), str, len);
53
54
  }
54
55
 
55
56
  char *mp_strndup(MemoryPool *mp, const char *str, int len)
56
57
  {
57
- char *s = memcpy(mp_alloc(mp, len + 1), str, len);
58
+ char *s = (char *)memcpy(mp_alloc(mp, len + 1), str, len);
58
59
  s[len] = '\0';
59
60
  return s;
60
61
  }
@@ -1,10 +1,14 @@
1
1
  #ifndef FRT_MEM_POOL_H
2
2
  #define FRT_MEM_POOL_H
3
3
 
4
- #define MP_BUF_SIZE 65536
5
- #define MP_INIT_CAPA 4
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #define FRT_MP_BUF_SIZE 65536
9
+ #define FRT_MP_INIT_CAPA 4
6
10
 
7
- typedef struct MemoryPool {
11
+ typedef struct FrtMemoryPool {
8
12
  int buf_alloc;
9
13
  int buf_capa;
10
14
  int buf_pointer;
@@ -12,24 +16,28 @@ typedef struct MemoryPool {
12
16
  int chunk_size;
13
17
  char *curr_buffer;
14
18
  char **buffers;
15
- } MemoryPool;
16
-
17
- extern MemoryPool *mp_new();
18
- extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
19
- extern INLINE void *mp_alloc(MemoryPool *mp, int size);
20
- extern void mp_reset(MemoryPool *mp);
21
- extern void mp_destroy(MemoryPool *mp);
22
- extern char *mp_strdup(MemoryPool *mp, const char *str);
23
- extern char *mp_strndup(MemoryPool *mp, const char *str, int len);
24
- extern void *mp_memdup(MemoryPool *mp, const void *p, int len);
25
- extern int mp_used(MemoryPool *mp);
26
-
27
- #define MP_ALLOC_N(mp,type,n) (type *)mp_alloc(mp, sizeof(type)*(n))
28
- #define MP_ALLOC(mp,type) (type *)mp_alloc(mp, sizeof(type))
29
-
30
- #define MP_ALLOC_AND_ZERO(mp,type)\
31
- (type*)memset(mp_alloc(mp, sizeof(type)), 0, sizeof(type))
32
- #define MP_ALLOC_AND_ZERO_N(mp,type,n)\
33
- (type*)ZEROSET_N(mp_alloc(mp, sizeof(type)*(n)), type, n)
19
+ } FrtMemoryPool;
20
+
21
+ extern FrtMemoryPool *frt_mp_new();
22
+ extern FrtMemoryPool *frt_mp_new_capa(int chunk_size, int init_capa);
23
+ extern FRT_INLINE void *frt_mp_alloc(FrtMemoryPool *mp, int size);
24
+ extern void frt_mp_reset(FrtMemoryPool *mp);
25
+ extern void frt_mp_destroy(FrtMemoryPool *mp);
26
+ extern char *frt_mp_strdup(FrtMemoryPool *mp, const char *str);
27
+ extern char *frt_mp_strndup(FrtMemoryPool *mp, const char *str, int len);
28
+ extern void *frt_mp_memdup(FrtMemoryPool *mp, const void *p, int len);
29
+ extern int frt_mp_used(FrtMemoryPool *mp);
30
+
31
+ #define FRT_MP_ALLOC_N(mp,type,n) (type *)frt_mp_alloc(mp, sizeof(type)*(n))
32
+ #define FRT_MP_ALLOC(mp,type) (type *)frt_mp_alloc(mp, sizeof(type))
33
+
34
+ #define FRT_MP_ALLOC_AND_ZERO(mp,type)\
35
+ (type*)memset(frt_mp_alloc(mp, sizeof(type)), 0, sizeof(type))
36
+ #define FRT_MP_ALLOC_AND_ZERO_N(mp,type,n)\
37
+ (type*)FRT_ZEROSET_N(frt_mp_alloc(mp, sizeof(type)*(n)), type, n)
38
+
39
+ #ifdef __cplusplus
40
+ } // extern "C"
41
+ #endif
34
42
 
35
43
  #endif
@@ -4,7 +4,8 @@
4
4
  * Do not edit manually.
5
5
  *
6
6
  * Modules included by this file are: danish, dutch, english, finnish, french,
7
- * german, italian, norwegian, porter, portuguese, russian, spanish, swedish
7
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ * russian, spanish, swedish, turkish
8
9
  */
9
10
 
10
11
  #include "stem_ISO_8859_1_danish.h"
@@ -19,6 +20,8 @@
19
20
  #include "stem_UTF_8_french.h"
20
21
  #include "stem_ISO_8859_1_german.h"
21
22
  #include "stem_UTF_8_german.h"
23
+ #include "stem_ISO_8859_1_hungarian.h"
24
+ #include "stem_UTF_8_hungarian.h"
22
25
  #include "stem_ISO_8859_1_italian.h"
23
26
  #include "stem_UTF_8_italian.h"
24
27
  #include "stem_ISO_8859_1_norwegian.h"
@@ -27,34 +30,39 @@
27
30
  #include "stem_UTF_8_porter.h"
28
31
  #include "stem_ISO_8859_1_portuguese.h"
29
32
  #include "stem_UTF_8_portuguese.h"
33
+ #include "stem_ISO_8859_2_romanian.h"
34
+ #include "stem_UTF_8_romanian.h"
30
35
  #include "stem_KOI8_R_russian.h"
31
36
  #include "stem_UTF_8_russian.h"
32
37
  #include "stem_ISO_8859_1_spanish.h"
33
38
  #include "stem_UTF_8_spanish.h"
34
39
  #include "stem_ISO_8859_1_swedish.h"
35
40
  #include "stem_UTF_8_swedish.h"
41
+ #include "stem_UTF_8_turkish.h"
36
42
 
37
43
  typedef enum {
38
- ENC_UNKNOWN,
44
+ ENC_UNKNOWN=0,
39
45
  ENC_ISO_8859_1,
46
+ ENC_ISO_8859_2,
40
47
  ENC_KOI8_R,
41
48
  ENC_UTF_8
42
- } stemmer_encoding;
49
+ } stemmer_encoding_t;
43
50
 
44
51
  struct stemmer_encoding {
45
52
  const char * name;
46
- stemmer_encoding enc;
53
+ stemmer_encoding_t enc;
47
54
  };
48
55
  static struct stemmer_encoding encodings[] = {
49
56
  {"ISO_8859_1", ENC_ISO_8859_1},
57
+ {"ISO_8859_2", ENC_ISO_8859_2},
50
58
  {"KOI8_R", ENC_KOI8_R},
51
59
  {"UTF_8", ENC_UTF_8},
52
- {0,0}
60
+ {0,ENC_UNKNOWN}
53
61
  };
54
62
 
55
63
  struct stemmer_modules {
56
64
  const char * name;
57
- stemmer_encoding enc;
65
+ stemmer_encoding_t enc;
58
66
  struct SN_env * (*create)(void);
59
67
  void (*close)(struct SN_env *);
60
68
  int (*stem)(struct SN_env *);
@@ -102,6 +110,12 @@ static struct stemmer_modules modules[] = {
102
110
  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
103
111
  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
104
112
  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
113
+ {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
114
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
115
+ {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
116
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
117
+ {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
118
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
105
119
  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
106
120
  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
107
121
  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
@@ -126,8 +140,16 @@ static struct stemmer_modules modules[] = {
126
140
  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
127
141
  {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
128
142
  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
143
+ {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
144
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
145
+ {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
146
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
147
+ {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
148
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
129
149
  {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
130
150
  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
151
+ {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
152
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
131
153
  {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
132
154
  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
133
155
  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
@@ -142,7 +164,10 @@ static struct stemmer_modules modules[] = {
142
164
  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
143
165
  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
144
166
  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
145
- {0,0,0,0,0}
167
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
168
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
169
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
170
+ {0,ENC_UNKNOWN,0,0,0}
146
171
  };
147
172
  static const char * algorithm_names[] = {
148
173
  "danish",
@@ -151,12 +176,15 @@ static const char * algorithm_names[] = {
151
176
  "finnish",
152
177
  "french",
153
178
  "german",
179
+ "hungarian",
154
180
  "italian",
155
181
  "norwegian",
156
182
  "porter",
157
183
  "portuguese",
184
+ "romanian",
158
185
  "russian",
159
186
  "spanish",
160
187
  "swedish",
188
+ "turkish",
161
189
  0
162
190
  };
@@ -2,6 +2,7 @@
2
2
  #include "array.h"
3
3
  #include "bitvector.h"
4
4
  #include <string.h>
5
+ #include "internal.h"
5
6
 
6
7
  #define St(state) ((State *)(state))
7
8
  #define UCtoI(val) ((int)(unsigned char)(val))
@@ -161,7 +162,8 @@ static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
161
162
 
162
163
  static DeterministicState *mulmap_process_state(MultiMapper *self, BitVector *bv)
163
164
  {
164
- DeterministicState *current_state = h_get(self->dstates_map, bv);
165
+ DeterministicState *current_state
166
+ = (DeterministicState *)h_get(self->dstates_map, bv);
165
167
  if (current_state == NULL) {
166
168
  int bit, i;
167
169
  int match_len = 0, max_match_len = 0;
@@ -263,7 +265,7 @@ int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa)
263
265
  DeterministicState *state = start;
264
266
  char *s = from, *d = to, *end = to + capa - 1;
265
267
  if (self->d_size == 0) {
266
- RAISE(STATE_ERROR, "You forgot to compile your MultiMapper");
268
+ mulmap_compile(self);
267
269
  }
268
270
  while (*s && d < end) {
269
271
  state = state->next[UCtoI(*s)];
@@ -292,6 +294,45 @@ char *mulmap_map(MultiMapper *self, char *to, char *from, int capa)
292
294
  return to;
293
295
  }
294
296
 
297
+ /* Maps a string to a dynamically allocated, NUL-terminated string; nothing
298
+  * here frees it, so the caller must. Compiles the mapper first if needed. */
299
+ char *mulmap_dynamic_map(MultiMapper *self, char *from)
300
+ {
301
+     DeterministicState *start, *state;
302
+     int capa = strlen(from) + 1; /* +1: room for the terminator even if empty */
303
+     char *to = (char *)ecalloc(capa);
304
+     char *s = from, *d = to, *end = to + capa - 1;
305
+     if (self->d_size == 0) {
306
+         mulmap_compile(self);
307
+     }
308
+     state = start = self->dstates[0]; /* read only after a possible compile */
309
+     for (; *s; s++) {
310
+         int len = 1; /* bytes this input byte is about to add to the output */
311
+         state = state->next[UCtoI(*s)];
312
+         if (state->mapping) {
313
+             len = state->mapping_len;
314
+             d -= (state->longest_match - 1); /* rewind over match bytes already copied */
315
+         }
316
+         if (d + len > end) { /* grow before writing so no mapping is truncated */
317
+             int used = d - to;
318
+             capa = used + len + 1024;
319
+             to = (char *)erealloc(to, capa); /* keep the result: the block may move */
320
+             d = to + used;
321
+             end = to + capa - 1;
322
+         }
323
+         if (state->mapping) {
324
+             memcpy(d, state->mapping, len);
325
+             d += len;
326
+             state = start;
327
+         }
328
+         else {
329
+             *(d++) = *s;
330
+         }
331
+     }
332
+     *d = '\0';
333
+     return to;
334
+ }
334
+ }
335
+
295
336
  void mulmap_destroy(MultiMapper *self)
296
337
  {
297
338
  if (--(self->ref_cnt) <= 0) {
@@ -1,51 +1,60 @@
1
1
  #ifndef FRT_MAPPER_H
2
2
  #define FRT_MAPPER_H
3
3
 
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
4
8
  #include "hash.h"
5
9
 
6
- typedef struct State
10
+ typedef struct FrtState
7
11
  {
8
- int (*next)(struct State *self, int c, int *states);
9
- void (*destroy_i)(struct State *self);
10
- int (*is_match)(struct State *self, char **mapping);
11
- } State;
12
+ int (*next)(struct FrtState *self, int c, int *states);
13
+ void (*destroy_i)(struct FrtState *self);
14
+ int (*is_match)(struct FrtState *self, char **mapping);
15
+ } FrtState;
12
16
 
13
- typedef struct DeterministicState
17
+ typedef struct FrtDeterministicState
14
18
  {
15
- struct DeterministicState *next[256];
19
+ struct FrtDeterministicState *next[256];
16
20
  int longest_match;
17
21
  char *mapping;
18
22
  int mapping_len;
19
- } DeterministicState;
23
+ } FrtDeterministicState;
20
24
 
21
- typedef struct Mapping
25
+ typedef struct FrtMapping
22
26
  {
23
27
  char *pattern;
24
28
  char *replacement;
25
- } Mapping;
29
+ } FrtMapping;
26
30
 
27
- typedef struct MultiMapper
31
+ typedef struct FrtMultiMapper
28
32
  {
29
- Mapping **mappings;
33
+ FrtMapping **mappings;
30
34
  int size;
31
35
  int capa;
32
- DeterministicState **dstates;
36
+ FrtDeterministicState **dstates;
33
37
  int d_size;
34
38
  int d_capa;
35
39
  unsigned char alphabet[256];
36
40
  int a_size;
37
- HashTable *dstates_map;
38
- State **nstates;
41
+ FrtHash *dstates_map;
42
+ FrtState **nstates;
39
43
  int nsize;
40
44
  int *next_states;
41
45
  int ref_cnt;
42
- } MultiMapper;
43
-
44
- extern MultiMapper *mulmap_new();
45
- extern void mulmap_add_mapping(MultiMapper *self, const char *p, const char *r);
46
- extern void mulmap_compile(MultiMapper *self);
47
- extern char *mulmap_map(MultiMapper *self, char *to, char *from, int capa);
48
- extern int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa);
49
- extern void mulmap_destroy(MultiMapper *self);
46
+ } FrtMultiMapper;
47
+
48
+ extern FrtMultiMapper *frt_mulmap_new();
49
+ extern void frt_mulmap_add_mapping(FrtMultiMapper *self, const char *p, const char *r);
50
+ extern void frt_mulmap_compile(FrtMultiMapper *self);
51
+ extern char *frt_mulmap_map(FrtMultiMapper *self, char *to, char *from, int capa);
52
+ extern char *frt_mulmap_dynamic_map(FrtMultiMapper *self, char *from);
53
+ extern int frt_mulmap_map_len(FrtMultiMapper *self, char *to, char *from, int capa);
54
+ extern void frt_mulmap_destroy(FrtMultiMapper *self);
55
+
56
+ #ifdef __cplusplus
57
+ } // extern "C"
58
+ #endif
50
59
 
51
60
  #endif