jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/lang.c ADDED
@@ -0,0 +1,10 @@
1
+ #include "lang.h"
2
+ #include "internal.h"
3
+
4
+
5
+ struct timeval rb_time_interval _((VALUE));
6
+ extern void micro_sleep(const int micro_seconds)
7
+ {
8
+ rb_thread_wait_for(rb_time_interval(rb_float_new((double)micro_seconds/1000000.0)));
9
+ }
10
+
data/ext/lang.h ADDED
@@ -0,0 +1,68 @@
1
+ #ifndef FRT_LANG_H
2
+ #define FRT_LANG_H
3
+
4
+ #define RUBY_BINDINGS 1
5
+
6
+ #include <stdarg.h>
7
+ #include <ruby.h>
8
+
9
+ #undef close
10
+ #undef rename
11
+ #undef read
12
+
13
+ #define frt_emalloc xmalloc
14
+ #define frt_ecalloc(n) xcalloc(n, 1)
15
+ #define frt_erealloc xrealloc
16
+ /* FIXME: should eventually delete this */
17
+ #define FRT_REALLOC_N REALLOC_N
18
+
19
+
20
+ #ifdef FRT_HAS_ISO_VARARGS
21
+ /* C99-compliant compiler */
22
+
23
+ # define FRT_XEXIT(...) frb_rb_raise(__FILE__, __LINE__, __func__, __VA_ARGS__)
24
+ extern void frb_rb_raise(const char *file, int line_num, const char *func,
25
+ const char *err_type, const char *fmt, ...);
26
+
27
+ # define FRT_VEXIT(err_type, fmt, args) \
28
+ frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
29
+ extern void frb_vrb_raise(const char *file, int line_num, const char *func,
30
+ const char *err_type, const char *fmt, va_list args);
31
+
32
+ #elif defined(FRT_HAS_GNUC_VARARGS)
33
+ /* gcc has an extension */
34
+
35
+ # define FRT_XEXIT(args...) frb_rb_raise(__FILE__, __LINE__, __func__, ##args)
36
+ extern void frb_rb_raise(const char *file, int line_num, const char *func,
37
+ const char *err_type, const char *fmt, ...);
38
+
39
+ # define FRT_VEXIT(err_type, fmt, args) \
40
+ frb_vrb_raise(__FILE__, __LINE__, __func__, err_type, fmt, args)
41
+ extern void frb_vrb_raise(const char *file, int line_num, const char *func,
42
+ const char *err_type, const char *fmt, va_list args);
43
+ #else
44
+ /* Can't do VARARGS */
45
+
46
+ extern void FRT_XEXIT(const char *err_type, const char *fmt, ...);
47
+ extern void FRT_VEXIT(const char *err_type, const char *fmt, va_list args);
48
+ #endif
49
+
50
+ #ifdef RUBY_RUBY_H
51
+ # define FRT_RUBY_VERSION_1_9
52
+ #endif
53
+
54
+ // Ruby 1.8 compatibility: define the Ruby 1.9 accessor macros when they are missing, so callers need no #ifdefs
55
+ #if !defined RSTRING_LEN
56
+ #define RSTRING_LEN(a) RSTRING(a)->len
57
+ #endif
58
+ #if !defined RSTRING_PTR
59
+ #define RSTRING_PTR(a) RSTRING(a)->ptr
60
+ #endif
61
+ #if !defined RARRAY_LEN
62
+ #define RARRAY_LEN(a) RARRAY(a)->len
63
+ #endif
64
+ #if !defined RARRAY_PTR
65
+ #define RARRAY_PTR(a) RARRAY(a)->ptr
66
+ #endif
67
+
68
+ #endif
data/ext/libstemmer.h ADDED
@@ -0,0 +1,79 @@
1
+
2
+ /* Make header file work when included from C++ */
3
+ #ifdef __cplusplus
4
+ extern "C" {
5
+ #endif
6
+
7
+ struct sb_stemmer;
8
+ typedef unsigned char sb_symbol;
9
+
10
+ /* FIXME - should be able to get a version number for each stemming
11
+ * algorithm (which will be incremented each time the output changes). */
12
+
13
+ /** Returns an array of the names of the available stemming algorithms.
14
+ * Note that these are the canonical names - aliases (ie, other names for
15
+ * the same algorithm) will not be included in the list.
16
+ * The list is terminated with a null pointer.
17
+ *
18
+ * The list must not be modified in any way.
19
+ */
20
+ const char ** sb_stemmer_list(void);
21
+
22
+ /** Create a new stemmer object, using the specified algorithm, for the
23
+ * specified character encoding.
24
+ *
25
+ * All algorithms will usually be available in UTF-8, but may also be
26
+ * available in other character encodings.
27
+ *
28
+ * @param algorithm The algorithm name. This is either the english
29
+ * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
30
+ * language. Note that case is significant in this parameter - the
31
+ * value should be supplied in lower case.
32
+ *
33
+ * @param charenc The character encoding. NULL may be passed as
34
+ * this value, in which case UTF-8 encoding will be assumed. Otherwise,
35
+ * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
36
+ * "ISO_8859_2" (ie, Latin 2) or "KOI8_R" (Russian). Note that
37
+ * case is significant in this parameter.
38
+ *
39
+ * @return NULL if the specified algorithm is not recognised, or the
40
+ * algorithm is not available for the requested encoding. Otherwise,
41
+ * returns a pointer to a newly created stemmer for the requested algorithm.
42
+ * The returned pointer must be deleted by calling sb_stemmer_delete().
43
+ *
44
+ * @note NULL will also be returned if an out of memory error occurs.
45
+ */
46
+ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
47
+
48
+ /** Delete a stemmer object.
49
+ *
50
+ * This frees all resources allocated for the stemmer. After calling
51
+ * this function, the supplied stemmer may no longer be used in any way.
52
+ *
53
+ * It is safe to pass a null pointer to this function - this will have
54
+ * no effect.
55
+ */
56
+ void sb_stemmer_delete(struct sb_stemmer * stemmer);
57
+
58
+ /** Stem a word.
59
+ *
60
+ * The return value is owned by the stemmer - it must not be freed or
61
+ * modified, and it will become invalid when the stemmer is called again,
62
+ * or if the stemmer is freed.
63
+ *
64
+ * The length of the return value can be obtained using sb_stemmer_length().
65
+ *
66
+ * If an out-of-memory error occurs, this will return NULL.
67
+ */
68
+ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
69
+ const sb_symbol * word, int size);
70
+
71
+ /** Get the length of the result of the last stemmed word.
72
+ * This should not be called before sb_stemmer_stem() has been called.
73
+ */
74
+ int sb_stemmer_length(struct sb_stemmer * stemmer);
75
+
76
+ #ifdef __cplusplus
77
+ }
78
+ #endif
79
+
data/ext/mempool.c ADDED
@@ -0,0 +1,88 @@
1
+ #include "global.h"
2
+ #include "mempool.h"
3
+ #include <string.h>
4
+ #include "internal.h"
5
+
6
+ MemoryPool *mp_new_capa(int chuck_size, int init_buf_capa)
7
+ {
8
+ MemoryPool *mp = ALLOC(MemoryPool);
9
+ mp->chunk_size = chuck_size;
10
+ mp->buf_capa = init_buf_capa;
11
+ mp->buffers = ALLOC_N(char *, init_buf_capa);
12
+
13
+ mp->buffers[0] = mp->curr_buffer = (char *)emalloc(mp->chunk_size);
14
+ mp->buf_alloc = 1;
15
+ mp->buf_pointer = 0;
16
+ mp->pointer = 0;
17
+ return mp;
18
+ }
19
+
20
+ MemoryPool *mp_new()
21
+ {
22
+ return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
23
+ }
24
+
25
+ INLINE void *mp_alloc(MemoryPool *mp, int size)
26
+ {
27
+ char *p;
28
+ p = mp->curr_buffer + mp->pointer;
29
+ #if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
30
+ size = (((size - 1) >> 3) + 1) << 3;
31
+ #endif
32
+ mp->pointer += size;
33
+
34
+ if (mp->pointer > mp->chunk_size) {
35
+ mp->buf_pointer++;
36
+ if (mp->buf_pointer >= mp->buf_alloc) {
37
+ mp->buf_alloc++;
38
+ if (mp->buf_alloc >= mp->buf_capa) {
39
+ mp->buf_capa <<= 1;
40
+ REALLOC_N(mp->buffers, char *, mp->buf_capa);
41
+ }
42
+ mp->buffers[mp->buf_pointer] = (char *)emalloc(mp->chunk_size);
43
+ }
44
+ p = mp->curr_buffer = mp->buffers[mp->buf_pointer];
45
+ mp->pointer = size;
46
+ }
47
+ return p;
48
+ }
49
+
50
+ char *mp_strdup(MemoryPool *mp, const char *str)
51
+ {
52
+ int len = strlen(str) + 1;
53
+ return (char *)memcpy(mp_alloc(mp, len), str, len);
54
+ }
55
+
56
+ char *mp_strndup(MemoryPool *mp, const char *str, int len)
57
+ {
58
+ char *s = (char *)memcpy(mp_alloc(mp, len + 1), str, len);
59
+ s[len] = '\0';
60
+ return s;
61
+ }
62
+
63
+ void *mp_memdup(MemoryPool *mp, const void *p, int len)
64
+ {
65
+ return memcpy(mp_alloc(mp, len), p, len);
66
+ }
67
+
68
+ int mp_used(MemoryPool *mp)
69
+ {
70
+ return mp->buf_pointer * mp->chunk_size + mp->pointer;
71
+ }
72
+
73
+ void mp_reset(MemoryPool *mp)
74
+ {
75
+ mp->buf_pointer = 0;
76
+ mp->pointer = 0;
77
+ mp->curr_buffer = mp->buffers[0];
78
+ }
79
+
80
+ void mp_destroy(MemoryPool *mp)
81
+ {
82
+ int i;
83
+ for (i = 0; i < mp->buf_alloc; i++) {
84
+ free(mp->buffers[i]);
85
+ }
86
+ free(mp->buffers);
87
+ free(mp);
88
+ }
data/ext/mempool.h ADDED
@@ -0,0 +1,43 @@
1
+ #ifndef FRT_MEM_POOL_H
2
+ #define FRT_MEM_POOL_H
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #define FRT_MP_BUF_SIZE 65536
9
+ #define FRT_MP_INIT_CAPA 4
10
+
11
+ typedef struct FrtMemoryPool { /* pool state; field notes follow mempool.c, which uses the name MemoryPool (presumably an alias from internal.h - confirm) */
12
+ int buf_alloc; /* number of buffers allocated so far; mp_destroy frees this many */
13
+ int buf_capa; /* number of slots in the buffers table; doubled when it fills */
14
+ int buf_pointer; /* index in buffers of the buffer currently in use */
15
+ int pointer; /* byte offset of the next allocation within curr_buffer */
16
+ int chunk_size; /* size in bytes requested for each buffer */
17
+ char *curr_buffer; /* buffer allocations are currently taken from */
18
+ char **buffers; /* table of all allocated buffers */
19
+ } FrtMemoryPool;
20
+
21
+ extern FrtMemoryPool *frt_mp_new();
22
+ extern FrtMemoryPool *frt_mp_new_capa(int chunk_size, int init_capa);
23
+ extern FRT_INLINE void *frt_mp_alloc(FrtMemoryPool *mp, int size);
24
+ extern void frt_mp_reset(FrtMemoryPool *mp);
25
+ extern void frt_mp_destroy(FrtMemoryPool *mp);
26
+ extern char *frt_mp_strdup(FrtMemoryPool *mp, const char *str);
27
+ extern char *frt_mp_strndup(FrtMemoryPool *mp, const char *str, int len);
28
+ extern void *frt_mp_memdup(FrtMemoryPool *mp, const void *p, int len);
29
+ extern int frt_mp_used(FrtMemoryPool *mp);
30
+
31
+ #define FRT_MP_ALLOC_N(mp,type,n) (type *)frt_mp_alloc(mp, sizeof(type)*(n))
32
+ #define FRT_MP_ALLOC(mp,type) (type *)frt_mp_alloc(mp, sizeof(type))
33
+
34
+ #define FRT_MP_ALLOC_AND_ZERO(mp,type)\
35
+ (type*)memset(frt_mp_alloc(mp, sizeof(type)), 0, sizeof(type))
36
+ #define FRT_MP_ALLOC_AND_ZERO_N(mp,type,n)\
37
+ (type*)FRT_ZEROSET_N(frt_mp_alloc(mp, sizeof(type)*(n)), type, n)
38
+
39
+ #ifdef __cplusplus
40
+ } // extern "C"
41
+ #endif
42
+
43
+ #endif
data/ext/modules.h ADDED
@@ -0,0 +1,190 @@
1
+ /* libstemmer/modules.h: List of stemming modules.
2
+ *
3
+ * This file is generated by mkmodules.pl from a list of module names.
4
+ * Do not edit manually.
5
+ *
6
+ * Modules included by this file are: danish, dutch, english, finnish, french,
7
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ * russian, spanish, swedish, turkish
9
+ */
10
+
11
+ #include "stem_ISO_8859_1_danish.h"
12
+ #include "stem_UTF_8_danish.h"
13
+ #include "stem_ISO_8859_1_dutch.h"
14
+ #include "stem_UTF_8_dutch.h"
15
+ #include "stem_ISO_8859_1_english.h"
16
+ #include "stem_UTF_8_english.h"
17
+ #include "stem_ISO_8859_1_finnish.h"
18
+ #include "stem_UTF_8_finnish.h"
19
+ #include "stem_ISO_8859_1_french.h"
20
+ #include "stem_UTF_8_french.h"
21
+ #include "stem_ISO_8859_1_german.h"
22
+ #include "stem_UTF_8_german.h"
23
+ #include "stem_ISO_8859_1_hungarian.h"
24
+ #include "stem_UTF_8_hungarian.h"
25
+ #include "stem_ISO_8859_1_italian.h"
26
+ #include "stem_UTF_8_italian.h"
27
+ #include "stem_ISO_8859_1_norwegian.h"
28
+ #include "stem_UTF_8_norwegian.h"
29
+ #include "stem_ISO_8859_1_porter.h"
30
+ #include "stem_UTF_8_porter.h"
31
+ #include "stem_ISO_8859_1_portuguese.h"
32
+ #include "stem_UTF_8_portuguese.h"
33
+ #include "stem_ISO_8859_2_romanian.h"
34
+ #include "stem_UTF_8_romanian.h"
35
+ #include "stem_KOI8_R_russian.h"
36
+ #include "stem_UTF_8_russian.h"
37
+ #include "stem_ISO_8859_1_spanish.h"
38
+ #include "stem_UTF_8_spanish.h"
39
+ #include "stem_ISO_8859_1_swedish.h"
40
+ #include "stem_UTF_8_swedish.h"
41
+ #include "stem_UTF_8_turkish.h"
42
+
43
+ typedef enum {
44
+ ENC_UNKNOWN=0,
45
+ ENC_ISO_8859_1,
46
+ ENC_ISO_8859_2,
47
+ ENC_KOI8_R,
48
+ ENC_UTF_8
49
+ } stemmer_encoding_t;
50
+
51
+ struct stemmer_encoding {
52
+ const char * name;
53
+ stemmer_encoding_t enc;
54
+ };
55
+ static struct stemmer_encoding encodings[] = {
56
+ {"ISO_8859_1", ENC_ISO_8859_1},
57
+ {"ISO_8859_2", ENC_ISO_8859_2},
58
+ {"KOI8_R", ENC_KOI8_R},
59
+ {"UTF_8", ENC_UTF_8},
60
+ {0,ENC_UNKNOWN}
61
+ };
62
+
63
+ struct stemmer_modules {
64
+ const char * name;
65
+ stemmer_encoding_t enc;
66
+ struct SN_env * (*create)(void);
67
+ void (*close)(struct SN_env *);
68
+ int (*stem)(struct SN_env *);
69
+ };
70
+ static struct stemmer_modules modules[] = {
71
+ {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
72
+ {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
73
+ {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
74
+ {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
75
+ {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
76
+ {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
77
+ {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
78
+ {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
79
+ {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
80
+ {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
81
+ {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
82
+ {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
83
+ {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
84
+ {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
85
+ {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
86
+ {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
87
+ {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
88
+ {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
89
+ {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
90
+ {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
91
+ {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
92
+ {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
93
+ {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
94
+ {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
95
+ {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
96
+ {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
97
+ {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
98
+ {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
99
+ {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
100
+ {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
101
+ {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
102
+ {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
103
+ {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
104
+ {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
105
+ {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
106
+ {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
107
+ {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
108
+ {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
109
+ {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
110
+ {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
111
+ {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
112
+ {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
113
+ {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
114
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
115
+ {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
116
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
117
+ {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
118
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
119
+ {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
120
+ {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
121
+ {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
122
+ {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
123
+ {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
124
+ {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
125
+ {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
126
+ {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
127
+ {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
128
+ {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
129
+ {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
130
+ {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
131
+ {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
132
+ {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
133
+ {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
134
+ {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
135
+ {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
136
+ {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
137
+ {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
138
+ {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
139
+ {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
140
+ {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
141
+ {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
142
+ {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
143
+ {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
144
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
145
+ {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
146
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
147
+ {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
148
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
149
+ {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
150
+ {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
151
+ {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
152
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
153
+ {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
154
+ {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
155
+ {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
156
+ {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
157
+ {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
158
+ {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
159
+ {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
160
+ {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
161
+ {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
162
+ {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
163
+ {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
164
+ {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
165
+ {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
166
+ {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
167
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
168
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
169
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
170
+ {0,ENC_UNKNOWN,0,0,0}
171
+ };
172
+ static const char * algorithm_names[] = {
173
+ "danish",
174
+ "dutch",
175
+ "english",
176
+ "finnish",
177
+ "french",
178
+ "german",
179
+ "hungarian",
180
+ "italian",
181
+ "norwegian",
182
+ "porter",
183
+ "portuguese",
184
+ "romanian",
185
+ "russian",
186
+ "spanish",
187
+ "swedish",
188
+ "turkish",
189
+ 0
190
+ };