ferret 0.11.6 → 0.11.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
data/ext/lang.c ADDED
@@ -0,0 +1,10 @@
1
+ #include "lang.h"
2
+ #include "internal.h"
3
+
4
+
5
+ struct timeval rb_time_interval _((VALUE));
6
+ extern void micro_sleep(const int micro_seconds)
7
+ {
8
+ rb_thread_wait_for(rb_time_interval(rb_float_new((double)micro_seconds/1000000.0)));
9
+ }
10
+
data/ext/lang.h CHANGED
@@ -10,39 +10,59 @@
10
10
  #undef rename
11
11
  #undef read
12
12
 
13
- #define frt_malloc xmalloc
14
- #define frt_calloc(n) xcalloc(n, 1)
15
- #define frt_realloc xrealloc
13
+ #define frt_emalloc xmalloc
14
+ #define frt_ecalloc(n) xcalloc(n, 1)
15
+ #define frt_erealloc xrealloc
16
+ /* FIXME: should eventually delete this */
17
+ #define FRT_REALLOC_N REALLOC_N
16
18
 
17
19
 
18
20
  #ifdef FRT_HAS_ISO_VARARGS
19
21
  /* C99-compliant compiler */
20
22
 
21
- # define FRT_EXIT(...) frt_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
22
- extern void frt_rb_raise(const char *file, int line_num, const char *func,
23
+ # define FRT_XEXIT(...) frb_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
24
+ extern void frb_rb_raise(const char *file, int line_num, const char *func,
23
25
  const char *err_type, const char *fmt, ...);
24
26
 
25
- # define V_FRT_EXIT(err_type, fmt, args) \
26
- vfrt_rb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
27
- extern void vfrt_rb_raise(const char *file, int line_num, const char *func,
27
+ # define FRT_VEXIT(err_type, fmt, args) \
28
+ frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
29
+ extern void frb_vrb_raise(const char *file, int line_num, const char *func,
28
30
  const char *err_type, const char *fmt, va_list args);
29
31
 
30
32
  #elif defined(FRT_HAS_GNUC_VARARGS)
31
33
  /* gcc has an extension */
32
34
 
33
- # define FRT_EXIT(args...) frt_rb_raise(__FILE__, __LINE__, __func__, ##args)
34
- extern void frt_rb_raise(const char *file, int line_num, const char *func,
35
+ # define FRT_XEXIT(args...) frb_rb_raise(__FILE__, __LINE__, __func__, ##args)
36
+ extern void frb_rb_raise(const char *file, int line_num, const char *func,
35
37
  const char *err_type, const char *fmt, ...);
36
38
 
37
- # define V_FRT_EXIT(err_type, fmt, args) \
38
- vfrt_rb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
39
- extern void vfrt_rb_raise(const char *file, int line_num, const char *func,
39
+ # define FRT_VEXIT(err_type, fmt, args) \
40
+ frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
41
+ extern void frb_vrb_raise(const char *file, int line_num, const char *func,
40
42
  const char *err_type, const char *fmt, va_list args);
41
43
  #else
42
44
  /* Can't do VARARGS */
43
45
 
44
- extern void FRT_EXIT(const char *err_type, const char *fmt, ...);
45
- extern void V_FRT_EXIT(const char *err_type, const char *fmt, va_list args);
46
+ extern void FRT_XEXIT(const char *err_type, const char *fmt, ...);
47
+ extern void FRT_VEXIT(const char *err_type, const char *fmt, va_list args);
48
+ #endif
49
+
50
+ #ifdef RUBY_RUBY_H
51
+ # define FRT_RUBY_VERSION_1_9
52
+ #endif
53
+
54
+ // ruby 1.8 compat with 1.9 to avoid ifdefs
55
+ #if !defined RSTRING_LEN
56
+ #define RSTRING_LEN(a) RSTRING(a)->len
57
+ #endif
58
+ #if !defined RSTRING_PTR
59
+ #define RSTRING_PTR(a) RSTRING(a)->ptr
60
+ #endif
61
+ #if !defined RARRAY_LEN
62
+ #define RARRAY_LEN(a) RARRAY(a)->len
63
+ #endif
64
+ #if !defined RARRAY_PTR
65
+ #define RARRAY_PTR(a) RARRAY(a)->ptr
46
66
  #endif
47
67
 
48
68
  #endif
data/ext/mempool.c CHANGED
@@ -1,6 +1,7 @@
1
1
  #include "global.h"
2
2
  #include "mempool.h"
3
3
  #include <string.h>
4
+ #include "internal.h"
4
5
 
5
6
  MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
6
7
  {
@@ -9,7 +10,7 @@ MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
9
10
  mp->buf_capa = init_buf_capa;
10
11
  mp->buffers = ALLOC_N(char *, init_buf_capa);
11
12
 
12
- mp->buffers[0] = mp->curr_buffer = emalloc(mp->chunk_size);
13
+ mp->buffers[0] = mp->curr_buffer = (char *)emalloc(mp->chunk_size);
13
14
  mp->buf_alloc = 1;
14
15
  mp->buf_pointer = 0;
15
16
  mp->pointer = 0;
@@ -38,7 +39,7 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
38
39
  mp->buf_capa <<= 1;
39
40
  REALLOC_N(mp->buffers, char *, mp->buf_capa);
40
41
  }
41
- mp->buffers[mp->buf_pointer] = emalloc(mp->chunk_size);
42
+ mp->buffers[mp->buf_pointer] = (char *)emalloc(mp->chunk_size);
42
43
  }
43
44
  p = mp->curr_buffer = mp->buffers[mp->buf_pointer];
44
45
  mp->pointer = size;
@@ -49,12 +50,12 @@ INLINE void *mp_alloc(MemoryPool *mp, int size)
49
50
  char *mp_strdup(MemoryPool *mp, const char *str)
50
51
  {
51
52
  int len = strlen(str) + 1;
52
- return memcpy(mp_alloc(mp, len), str, len);
53
+ return (char *)memcpy(mp_alloc(mp, len), str, len);
53
54
  }
54
55
 
55
56
  char *mp_strndup(MemoryPool *mp, const char *str, int len)
56
57
  {
57
- char *s = memcpy(mp_alloc(mp, len + 1), str, len);
58
+ char *s = (char *)memcpy(mp_alloc(mp, len + 1), str, len);
58
59
  s[len] = '\0';
59
60
  return s;
60
61
  }
data/ext/mempool.h CHANGED
@@ -1,10 +1,14 @@
1
1
  #ifndef FRT_MEM_POOL_H
2
2
  #define FRT_MEM_POOL_H
3
3
 
4
- #define MP_BUF_SIZE 65536
5
- #define MP_INIT_CAPA 4
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #define FRT_MP_BUF_SIZE 65536
9
+ #define FRT_MP_INIT_CAPA 4
6
10
 
7
- typedef struct MemoryPool {
11
+ typedef struct FrtMemoryPool {
8
12
  int buf_alloc;
9
13
  int buf_capa;
10
14
  int buf_pointer;
@@ -12,24 +16,28 @@ typedef struct MemoryPool {
12
16
  int chunk_size;
13
17
  char *curr_buffer;
14
18
  char **buffers;
15
- } MemoryPool;
16
-
17
- extern MemoryPool *mp_new();
18
- extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
19
- extern INLINE void *mp_alloc(MemoryPool *mp, int size);
20
- extern void mp_reset(MemoryPool *mp);
21
- extern void mp_destroy(MemoryPool *mp);
22
- extern char *mp_strdup(MemoryPool *mp, const char *str);
23
- extern char *mp_strndup(MemoryPool *mp, const char *str, int len);
24
- extern void *mp_memdup(MemoryPool *mp, const void *p, int len);
25
- extern int mp_used(MemoryPool *mp);
26
-
27
- #define MP_ALLOC_N(mp,type,n) (type *)mp_alloc(mp, sizeof(type)*(n))
28
- #define MP_ALLOC(mp,type) (type *)mp_alloc(mp, sizeof(type))
29
-
30
- #define MP_ALLOC_AND_ZERO(mp,type)\
31
- (type*)memset(mp_alloc(mp, sizeof(type)), 0, sizeof(type))
32
- #define MP_ALLOC_AND_ZERO_N(mp,type,n)\
33
- (type*)ZEROSET_N(mp_alloc(mp, sizeof(type)*(n)), type, n)
19
+ } FrtMemoryPool;
20
+
21
+ extern FrtMemoryPool *frt_mp_new();
22
+ extern FrtMemoryPool *frt_mp_new_capa(int chunk_size, int init_capa);
23
+ extern FRT_INLINE void *frt_mp_alloc(FrtMemoryPool *mp, int size);
24
+ extern void frt_mp_reset(FrtMemoryPool *mp);
25
+ extern void frt_mp_destroy(FrtMemoryPool *mp);
26
+ extern char *frt_mp_strdup(FrtMemoryPool *mp, const char *str);
27
+ extern char *frt_mp_strndup(FrtMemoryPool *mp, const char *str, int len);
28
+ extern void *frt_mp_memdup(FrtMemoryPool *mp, const void *p, int len);
29
+ extern int frt_mp_used(FrtMemoryPool *mp);
30
+
31
+ #define FRT_MP_ALLOC_N(mp,type,n) (type *)frt_mp_alloc(mp, sizeof(type)*(n))
32
+ #define FRT_MP_ALLOC(mp,type) (type *)frt_mp_alloc(mp, sizeof(type))
33
+
34
+ #define FRT_MP_ALLOC_AND_ZERO(mp,type)\
35
+ (type*)memset(frt_mp_alloc(mp, sizeof(type)), 0, sizeof(type))
36
+ #define FRT_MP_ALLOC_AND_ZERO_N(mp,type,n)\
37
+ (type*)FRT_ZEROSET_N(frt_mp_alloc(mp, sizeof(type)*(n)), type, n)
38
+
39
+ #ifdef __cplusplus
40
+ } // extern "C"
41
+ #endif
34
42
 
35
43
  #endif
data/ext/modules.h CHANGED
@@ -4,7 +4,8 @@
4
4
  * Do not edit manually.
5
5
  *
6
6
  * Modules included by this file are: danish, dutch, english, finnish, french,
7
- * german, italian, norwegian, porter, portuguese, russian, spanish, swedish
7
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ * russian, spanish, swedish, turkish
8
9
  */
9
10
 
10
11
  #include "stem_ISO_8859_1_danish.h"
@@ -19,6 +20,8 @@
19
20
  #include "stem_UTF_8_french.h"
20
21
  #include "stem_ISO_8859_1_german.h"
21
22
  #include "stem_UTF_8_german.h"
23
+ #include "stem_ISO_8859_1_hungarian.h"
24
+ #include "stem_UTF_8_hungarian.h"
22
25
  #include "stem_ISO_8859_1_italian.h"
23
26
  #include "stem_UTF_8_italian.h"
24
27
  #include "stem_ISO_8859_1_norwegian.h"
@@ -27,34 +30,39 @@
27
30
  #include "stem_UTF_8_porter.h"
28
31
  #include "stem_ISO_8859_1_portuguese.h"
29
32
  #include "stem_UTF_8_portuguese.h"
33
+ #include "stem_ISO_8859_2_romanian.h"
34
+ #include "stem_UTF_8_romanian.h"
30
35
  #include "stem_KOI8_R_russian.h"
31
36
  #include "stem_UTF_8_russian.h"
32
37
  #include "stem_ISO_8859_1_spanish.h"
33
38
  #include "stem_UTF_8_spanish.h"
34
39
  #include "stem_ISO_8859_1_swedish.h"
35
40
  #include "stem_UTF_8_swedish.h"
41
+ #include "stem_UTF_8_turkish.h"
36
42
 
37
43
  typedef enum {
38
- ENC_UNKNOWN,
44
+ ENC_UNKNOWN=0,
39
45
  ENC_ISO_8859_1,
46
+ ENC_ISO_8859_2,
40
47
  ENC_KOI8_R,
41
48
  ENC_UTF_8
42
- } stemmer_encoding;
49
+ } stemmer_encoding_t;
43
50
 
44
51
  struct stemmer_encoding {
45
52
  const char * name;
46
- stemmer_encoding enc;
53
+ stemmer_encoding_t enc;
47
54
  };
48
55
  static struct stemmer_encoding encodings[] = {
49
56
  {"ISO_8859_1", ENC_ISO_8859_1},
57
+ {"ISO_8859_2", ENC_ISO_8859_2},
50
58
  {"KOI8_R", ENC_KOI8_R},
51
59
  {"UTF_8", ENC_UTF_8},
52
- {0,0}
60
+ {0,ENC_UNKNOWN}
53
61
  };
54
62
 
55
63
  struct stemmer_modules {
56
64
  const char * name;
57
- stemmer_encoding enc;
65
+ stemmer_encoding_t enc;
58
66
  struct SN_env * (*create)(void);
59
67
  void (*close)(struct SN_env *);
60
68
  int (*stem)(struct SN_env *);
@@ -102,6 +110,12 @@ static struct stemmer_modules modules[] = {
102
110
  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
103
111
  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
104
112
  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
113
+ {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
114
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
115
+ {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
116
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
117
+ {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
118
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
105
119
  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
106
120
  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
107
121
  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
@@ -126,8 +140,16 @@ static struct stemmer_modules modules[] = {
126
140
  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
127
141
  {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
128
142
  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
143
+ {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
144
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
145
+ {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
146
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
147
+ {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
148
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
129
149
  {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
130
150
  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
151
+ {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
152
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
131
153
  {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
132
154
  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
133
155
  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
@@ -142,7 +164,10 @@ static struct stemmer_modules modules[] = {
142
164
  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
143
165
  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
144
166
  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
145
- {0,0,0,0,0}
167
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
168
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
169
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
170
+ {0,ENC_UNKNOWN,0,0,0}
146
171
  };
147
172
  static const char * algorithm_names[] = {
148
173
  "danish",
@@ -151,12 +176,15 @@ static const char * algorithm_names[] = {
151
176
  "finnish",
152
177
  "french",
153
178
  "german",
179
+ "hungarian",
154
180
  "italian",
155
181
  "norwegian",
156
182
  "porter",
157
183
  "portuguese",
184
+ "romanian",
158
185
  "russian",
159
186
  "spanish",
160
187
  "swedish",
188
+ "turkish",
161
189
  0
162
190
  };
data/ext/multimapper.c CHANGED
@@ -2,6 +2,7 @@
2
2
  #include "array.h"
3
3
  #include "bitvector.h"
4
4
  #include <string.h>
5
+ #include "internal.h"
5
6
 
6
7
  #define St(state) ((State *)(state))
7
8
  #define UCtoI(val) ((int)(unsigned char)(val))
@@ -161,7 +162,8 @@ static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
161
162
 
162
163
  static DeterministicState *mulmap_process_state(MultiMapper *self, BitVector *bv)
163
164
  {
164
- DeterministicState *current_state = h_get(self->dstates_map, bv);
165
+ DeterministicState *current_state
166
+ = (DeterministicState *)h_get(self->dstates_map, bv);
165
167
  if (current_state == NULL) {
166
168
  int bit, i;
167
169
  int match_len = 0, max_match_len = 0;
@@ -263,7 +265,7 @@ int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa)
263
265
  DeterministicState *state = start;
264
266
  char *s = from, *d = to, *end = to + capa - 1;
265
267
  if (self->d_size == 0) {
266
- RAISE(STATE_ERROR, "You forgot to compile your MultiMapper");
268
+ mulmap_compile(self);
267
269
  }
268
270
  while (*s && d < end) {
269
271
  state = state->next[UCtoI(*s)];
@@ -292,6 +294,45 @@ char *mulmap_map(MultiMapper *self, char *to, char *from, int capa)
292
294
  return to;
293
295
  }
294
296
 
297
+ /* Maps a string to a dynamically allocated string */
298
+ char *mulmap_dynamic_map(MultiMapper *self, char *from)
299
+ {
300
+ DeterministicState *start = self->dstates[0];
301
+ DeterministicState *state = start;
302
+ int capa = strlen(from);
303
+ char *to = (char *)ecalloc(capa);
304
+ char *s = from, *d = to, *end = to + capa - 1;
305
+ if (self->d_size == 0) {
306
+ mulmap_compile(self);
307
+ }
308
+ do {
309
+ while (*s && d < end) {
310
+ state = state->next[UCtoI(*s)];
311
+ if (state->mapping) {
312
+ int len = state->mapping_len;
313
+ d -= (state->longest_match - 1);
314
+ if ((d + len) > end) {
315
+ len = end - d;
316
+ }
317
+ memcpy(d, state->mapping, len);
318
+ d += len;
319
+ state = start;
320
+ }
321
+ else {
322
+ *(d++) = *s;
323
+ }
324
+ s++;
325
+ }
326
+ if (*s) {
327
+ capa += 1024;
328
+ erealloc(to, capa);
329
+ end = to + capa - 1;
330
+ }
331
+ } while(*s);
332
+ *d = '\0';
333
+ return to;
334
+ }
335
+
295
336
  void mulmap_destroy(MultiMapper *self)
296
337
  {
297
338
  if (--(self->ref_cnt) <= 0) {
data/ext/multimapper.h CHANGED
@@ -1,51 +1,60 @@
1
1
  #ifndef FRT_MAPPER_H
2
2
  #define FRT_MAPPER_H
3
3
 
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
4
8
  #include "hash.h"
5
9
 
6
- typedef struct State
10
+ typedef struct FrtState
7
11
  {
8
- int (*next)(struct State *self, int c, int *states);
9
- void (*destroy_i)(struct State *self);
10
- int (*is_match)(struct State *self, char **mapping);
11
- } State;
12
+ int (*next)(struct FrtState *self, int c, int *states);
13
+ void (*destroy_i)(struct FrtState *self);
14
+ int (*is_match)(struct FrtState *self, char **mapping);
15
+ } FrtState;
12
16
 
13
- typedef struct DeterministicState
17
+ typedef struct FrtDeterministicState
14
18
  {
15
- struct DeterministicState *next[256];
19
+ struct FrtDeterministicState *next[256];
16
20
  int longest_match;
17
21
  char *mapping;
18
22
  int mapping_len;
19
- } DeterministicState;
23
+ } FrtDeterministicState;
20
24
 
21
- typedef struct Mapping
25
+ typedef struct FrtMapping
22
26
  {
23
27
  char *pattern;
24
28
  char *replacement;
25
- } Mapping;
29
+ } FrtMapping;
26
30
 
27
- typedef struct MultiMapper
31
+ typedef struct FrtMultiMapper
28
32
  {
29
- Mapping **mappings;
33
+ FrtMapping **mappings;
30
34
  int size;
31
35
  int capa;
32
- DeterministicState **dstates;
36
+ FrtDeterministicState **dstates;
33
37
  int d_size;
34
38
  int d_capa;
35
39
  unsigned char alphabet[256];
36
40
  int a_size;
37
- HashTable *dstates_map;
38
- State **nstates;
41
+ FrtHash *dstates_map;
42
+ FrtState **nstates;
39
43
  int nsize;
40
44
  int *next_states;
41
45
  int ref_cnt;
42
- } MultiMapper;
43
-
44
- extern MultiMapper *mulmap_new();
45
- extern void mulmap_add_mapping(MultiMapper *self, const char *p, const char *r);
46
- extern void mulmap_compile(MultiMapper *self);
47
- extern char *mulmap_map(MultiMapper *self, char *to, char *from, int capa);
48
- extern int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa);
49
- extern void mulmap_destroy(MultiMapper *self);
46
+ } FrtMultiMapper;
47
+
48
+ extern FrtMultiMapper *frt_mulmap_new();
49
+ extern void frt_mulmap_add_mapping(FrtMultiMapper *self, const char *p, const char *r);
50
+ extern void frt_mulmap_compile(FrtMultiMapper *self);
51
+ extern char *frt_mulmap_map(FrtMultiMapper *self, char *to, char *from, int capa);
52
+ extern char *frt_mulmap_dynamic_map(FrtMultiMapper *self, char *from);
53
+ extern int frt_mulmap_map_len(FrtMultiMapper *self, char *to, char *from, int capa);
54
+ extern void frt_mulmap_destroy(FrtMultiMapper *self);
55
+
56
+ #ifdef __cplusplus
57
+ } // extern "C"
58
+ #endif
50
59
 
51
60
  #endif