ferret 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (187) hide show
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
data/ext/lang.h CHANGED
@@ -2,6 +2,7 @@
2
2
  #define FRT_LANG_H
3
3
 
4
4
  #include <ruby.h>
5
+ #include "hash.h"
5
6
 
6
7
  #define FERRET_EXT
7
8
 
@@ -14,28 +15,37 @@ extern void setprogname(const char *str);
14
15
 
15
16
  extern VALUE cQueryParseException;
16
17
 
17
- #define ERROR rb_eException
18
- #define IO_ERROR rb_eIOError
19
- #define ARG_ERROR rb_eArgError
20
- #define EOF_ERROR rb_eEOFError
21
- #define UNSUPPORTED_ERROR rb_eNotImpError
22
- #define STATE_ERROR rb_eException
23
- #define PARSE_ERROR cQueryParseException
24
- #define MEM_ERROR rb_eNoMemError
18
+ #define EXCEPTION_CODE rb_eException
19
+ //#define IO_ERROR rb_eIOError
20
+ //#define ARG_ERROR rb_eArgError
21
+ //#define EOF_ERROR rb_eEOFError
22
+ //#define UNSUPPORTED_ERROR rb_eNotImpError
23
+ //#define STATE_ERROR rb_eException
24
+ //#define PARSE_ERROR cQueryParseException
25
+ //#define MEM_ERROR rb_eNoMemError
25
26
 
26
27
  typedef void * mutex_t;
27
- typedef void * thread_key_t;
28
+ typedef struct HshTable * thread_key_t;
29
+ typedef int thread_once_t;
28
30
  #define MUTEX_INITIALIZER NULL
29
31
  #define MUTEX_RECURSIVE_INITIALIZER NULL
32
/* Initial value for a thread_once_t control variable.  The expansion carries
 * no trailing ';' so the macro can be used inside expressions and
 * initializer lists as well as in plain declarations. */
#define THREAD_ONCE_INIT 1
30
33
  #define mutex_init(a, b)
31
34
  #define mutex_lock(a)
32
35
  #define mutex_trylock(a)
33
36
  #define mutex_unlock(a)
34
37
  #define mutex_destroy(a)
35
- #define thread_key_create(a, b)
36
- #define thread_key_delete(a)
37
- #define thread_setspecific(a, b)
38
- #define thread_getspecific(a) NULL
38
+ #define thread_key_create(a, b) frt_thread_key_create(a, b)
39
+ #define thread_key_delete(a) frt_thread_key_delete(a)
40
+ #define thread_setspecific(a, b) frt_thread_setspecific(a, b)
41
+ #define thread_getspecific(a) frt_thread_getspecific(a)
39
42
  #define thread_exit(a)
43
+ #define thread_once(a, b) frt_thread_once(a, b)
44
+
45
+ void frt_thread_once(int *once_control, void (*init_routine) (void));
46
+ void frt_thread_key_create(thread_key_t *key, void (*destr_function) (void *));
47
+ void frt_thread_key_delete(thread_key_t key);
48
+ void frt_thread_setspecific(thread_key_t key, const void *pointer);
49
+ void *frt_thread_getspecific(thread_key_t key);
40
50
 
41
51
  #endif
@@ -0,0 +1,92 @@
1
+
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include "libstemmer.h"
5
+ #include "api.h"
6
+ #include "modules.h"
7
+
8
/*
 * One stemmer instance: the three entry points copied from the selected
 * algorithm module, plus the environment those entry points operate on.
 */
struct sb_stemmer {
    struct SN_env *(*create)(void);   /* called once to obtain `env` */
    void (*close)(struct SN_env *);   /* called with `env` when the stemmer is deleted */
    int (*stem)(struct SN_env *);     /* runs the algorithm on `env` */

    struct SN_env *env;               /* result of create() */
};
15
+
16
+ extern const char **
17
+ sb_stemmer_list(void)
18
+ {
19
+ return algorithm_names;
20
+ }
21
+
22
+ static stemmer_encoding sb_getenc(const char * charenc)
23
+ {
24
+ struct stemmer_encoding * encoding;
25
+ if (charenc == NULL) return ENC_UTF_8;
26
+ for (encoding = encodings; encoding->name != 0; encoding++) {
27
+ if (strcmp(encoding->name, charenc) == 0) break;
28
+ }
29
+ if (encoding->name == NULL) return ENC_UNKNOWN;
30
+ return encoding->enc;
31
+ }
32
+
33
+ extern struct sb_stemmer *
34
+ sb_stemmer_new(const char * algorithm, const char * charenc)
35
+ {
36
+ stemmer_encoding enc;
37
+ struct stemmer_modules * module;
38
+ struct sb_stemmer * stemmer =
39
+ (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
40
+ if (stemmer == NULL) return NULL;
41
+ enc = sb_getenc(charenc);
42
+ if (enc == ENC_UNKNOWN) return NULL;
43
+
44
+ for (module = modules; module->name != 0; module++) {
45
+ if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
46
+ }
47
+ if (module->name == NULL) return NULL;
48
+
49
+ stemmer->create = module->create;
50
+ stemmer->close = module->close;
51
+ stemmer->stem = module->stem;
52
+
53
+ stemmer->env = stemmer->create();
54
+ if (stemmer->env == NULL)
55
+ {
56
+ sb_stemmer_delete(stemmer);
57
+ return NULL;
58
+ }
59
+
60
+ return stemmer;
61
+ }
62
+
63
+ void
64
+ sb_stemmer_delete(struct sb_stemmer * stemmer)
65
+ {
66
+ if (stemmer == 0) return;
67
+ if (stemmer->close == 0) return;
68
+ stemmer->close(stemmer->env);
69
+ stemmer->close = 0;
70
+ free(stemmer);
71
+ }
72
+
73
+ const sb_symbol *
74
+ sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
75
+ {
76
+ int ret;
77
+ if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
78
+ {
79
+ stemmer->env->l = 0;
80
+ return NULL;
81
+ }
82
+ ret = stemmer->stem(stemmer->env);
83
+ if (ret < 0) return NULL;
84
+ stemmer->env->p[stemmer->env->l] = 0;
85
+ return (const sb_symbol *)(stemmer->env->p);
86
+ }
87
+
88
+ int
89
+ sb_stemmer_length(struct sb_stemmer * stemmer)
90
+ {
91
+ return stemmer->env->l;
92
+ }
@@ -0,0 +1,79 @@
1
+
2
+ /* Make header file work when included from C++ */
3
+ #ifdef __cplusplus
4
+ extern "C" {
5
+ #endif
6
+
7
+ struct sb_stemmer;
8
+ typedef unsigned char sb_symbol;
9
+
10
+ /* FIXME - should be able to get a version number for each stemming
11
+ * algorithm (which will be incremented each time the output changes). */
12
+
13
+ /** Returns an array of the names of the available stemming algorithms.
14
+ * Note that these are the canonical names - aliases (ie, other names for
15
+ * the same algorithm) will not be included in the list.
16
+ * The list is terminated with a null pointer.
17
+ *
18
+ * The list must not be modified in any way.
19
+ */
20
+ const char ** sb_stemmer_list(void);
21
+
22
+ /** Create a new stemmer object, using the specified algorithm, for the
23
+ * specified character encoding.
24
+ *
25
+ * All algorithms will usually be available in UTF-8, but may also be
26
+ * available in other character encodings.
27
+ *
28
+ * @param algorithm The algorithm name. This is either the english
29
+ * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
30
+ * language. Note that case is significant in this parameter - the
31
+ * value should be supplied in lower case.
32
+ *
33
+ * @param charenc The character encoding. NULL may be passed as
34
+ * this value, in which case UTF-8 encoding will be assumed. Otherwise,
35
+ * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
36
+ * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that
37
+ * case is significant in this parameter.
38
+ *
39
+ * @return NULL if the specified algorithm is not recognised, or the
40
+ * algorithm is not available for the requested encoding. Otherwise,
41
+ * returns a pointer to a newly created stemmer for the requested algorithm.
42
+ * The returned pointer must be deleted by calling sb_stemmer_delete().
43
+ *
44
+ * @note NULL will also be returned if an out of memory error occurs.
45
+ */
46
+ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
47
+
48
+ /** Delete a stemmer object.
49
+ *
50
+ * This frees all resources allocated for the stemmer. After calling
51
+ * this function, the supplied stemmer may no longer be used in any way.
52
+ *
53
+ * It is safe to pass a null pointer to this function - this will have
54
+ * no effect.
55
+ */
56
+ void sb_stemmer_delete(struct sb_stemmer * stemmer);
57
+
58
+ /** Stem a word.
59
+ *
60
+ * The return value is owned by the stemmer - it must not be freed or
61
+ * modified, and it will become invalid when the stemmer is called again,
62
+ * or if the stemmer is freed.
63
+ *
64
+ * The length of the return value can be obtained using sb_stemmer_length().
65
+ *
66
+ * If an out-of-memory error occurs, this will return NULL.
67
+ */
68
+ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
69
+ const sb_symbol * word, int size);
70
+
71
+ /** Get the length of the result of the last stemmed word.
72
+ * This should not be called before sb_stemmer_stem() has been called.
73
+ */
74
+ int sb_stemmer_length(struct sb_stemmer * stemmer);
75
+
76
+ #ifdef __cplusplus
77
+ }
78
+ #endif
79
+
@@ -0,0 +1,162 @@
1
+ /* libstemmer/modules.h: List of stemming modules.
2
+ *
3
+ * This file is generated by mkmodules.pl from a list of module names.
4
+ * Do not edit manually.
5
+ *
6
+ * Modules included by this file are: danish, dutch, english, finnish, french,
7
+ * german, italian, norwegian, porter, portuguese, russian, spanish, swedish
8
+ */
9
+
10
+ #include "stem_ISO_8859_1_danish.h"
11
+ #include "stem_UTF_8_danish.h"
12
+ #include "stem_ISO_8859_1_dutch.h"
13
+ #include "stem_UTF_8_dutch.h"
14
+ #include "stem_ISO_8859_1_english.h"
15
+ #include "stem_UTF_8_english.h"
16
+ #include "stem_ISO_8859_1_finnish.h"
17
+ #include "stem_UTF_8_finnish.h"
18
+ #include "stem_ISO_8859_1_french.h"
19
+ #include "stem_UTF_8_french.h"
20
+ #include "stem_ISO_8859_1_german.h"
21
+ #include "stem_UTF_8_german.h"
22
+ #include "stem_ISO_8859_1_italian.h"
23
+ #include "stem_UTF_8_italian.h"
24
+ #include "stem_ISO_8859_1_norwegian.h"
25
+ #include "stem_UTF_8_norwegian.h"
26
+ #include "stem_ISO_8859_1_porter.h"
27
+ #include "stem_UTF_8_porter.h"
28
+ #include "stem_ISO_8859_1_portuguese.h"
29
+ #include "stem_UTF_8_portuguese.h"
30
+ #include "stem_KOI8_R_russian.h"
31
+ #include "stem_UTF_8_russian.h"
32
+ #include "stem_ISO_8859_1_spanish.h"
33
+ #include "stem_UTF_8_spanish.h"
34
+ #include "stem_ISO_8859_1_swedish.h"
35
+ #include "stem_UTF_8_swedish.h"
36
+
37
+ typedef enum {
38
+ ENC_UNKNOWN,
39
+ ENC_ISO_8859_1,
40
+ ENC_KOI8_R,
41
+ ENC_UTF_8
42
+ } stemmer_encoding;
43
+
44
+ struct stemmer_encoding {
45
+ const char * name;
46
+ stemmer_encoding enc;
47
+ };
48
+ static struct stemmer_encoding encodings[] = {
49
+ {"ISO_8859_1", ENC_ISO_8859_1},
50
+ {"KOI8_R", ENC_KOI8_R},
51
+ {"UTF_8", ENC_UTF_8},
52
+ {0,0}
53
+ };
54
+
55
+ struct stemmer_modules {
56
+ const char * name;
57
+ stemmer_encoding enc;
58
+ struct SN_env * (*create)(void);
59
+ void (*close)(struct SN_env *);
60
+ int (*stem)(struct SN_env *);
61
+ };
62
+ static struct stemmer_modules modules[] = {
63
+ {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
64
+ {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
65
+ {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
66
+ {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
67
+ {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
68
+ {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
69
+ {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
70
+ {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
71
+ {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
72
+ {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
73
+ {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
74
+ {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
75
+ {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
76
+ {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
77
+ {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
78
+ {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
79
+ {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
80
+ {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
81
+ {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
82
+ {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
83
+ {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
84
+ {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
85
+ {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
86
+ {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
87
+ {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
88
+ {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
89
+ {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
90
+ {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
91
+ {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
92
+ {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
93
+ {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
94
+ {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
95
+ {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
96
+ {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
97
+ {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
98
+ {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
99
+ {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
100
+ {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
101
+ {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
102
+ {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
103
+ {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
104
+ {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
105
+ {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
106
+ {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
107
+ {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
108
+ {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
109
+ {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
110
+ {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
111
+ {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
112
+ {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
113
+ {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
114
+ {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
115
+ {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
116
+ {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
117
+ {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
118
+ {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
119
+ {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
120
+ {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
121
+ {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
122
+ {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
123
+ {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
124
+ {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
125
+ {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
126
+ {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
127
+ {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
128
+ {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
129
+ {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
130
+ {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
131
+ {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
132
+ {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
133
+ {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
134
+ {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
135
+ {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
136
+ {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
137
+ {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
138
+ {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
139
+ {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
140
+ {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
141
+ {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
142
+ {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
143
+ {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
144
+ {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
145
+ {0,0,0,0,0}
146
+ };
147
+ static const char * algorithm_names[] = {
148
+ "danish",
149
+ "dutch",
150
+ "english",
151
+ "finnish",
152
+ "french",
153
+ "german",
154
+ "italian",
155
+ "norwegian",
156
+ "porter",
157
+ "portuguese",
158
+ "russian",
159
+ "spanish",
160
+ "swedish",
161
+ 0
162
+ };
@@ -1,6 +1,12 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
3
 
4
+ static char * const INVALID_BC_ERROR_MSG = "Invalid value for BooleanClause Type";
5
+ static char * const TOO_MANY_CLAUSES_ERROR_MSG = "Too many clauses";
6
+ static char * const MIN_NUM_MATCHES_ERROR_MSG = "Minimum nr of matches must be positive";
7
+ static char * const TWO_SUB_ERROR_MSG = "There must be at least 2 sub_scorers";
8
+ static char * const UNKNOWN_OCCUR_VAL_ERROR_MSG = "Unknown value for occur";
9
+
4
10
  /***************************************************************************
5
11
  *
6
12
  * BooleanWeight
@@ -70,9 +76,7 @@ Scorer *bw_scorer(Weight *self, IndexReader *ir)
70
76
 
71
77
  char *bw_to_s(Weight *self)
72
78
  {
73
- char dbuf[32];
74
- dbl_to_s(dbuf, self->value);
75
- return epstrdup("BooleanWeight(%s)", strlen(dbuf), dbuf);
79
+ return strfmt("BooleanWeight(%f)", self->value);
76
80
  }
77
81
 
78
82
  void bw_destroy(void *p)
@@ -138,7 +142,7 @@ Explanation *bw_explain(Weight *self, IndexReader *ir, int doc_num)
138
142
  explanation = expl_create(sum * coord_factor, estrdup("product of:"));
139
143
  expl_add_detail(explanation, sum_expl);
140
144
  expl_add_detail(explanation, expl_create(coord_factor,
141
- epstrdup("coord(%d/%d)", 40, coord, max_coord)));
145
+ strfmt("coord(%d/%d)", coord, max_coord)));
142
146
  return explanation;
143
147
  }
144
148
  }
@@ -196,10 +200,16 @@ void bc_set_occur(BooleanClause *self, unsigned int occur)
196
200
  self->is_required = false;
197
201
  break;
198
202
  default:
199
- eprintf(ARG_ERROR, "Invalid value %d for BooleanClause Type", occur);
203
+ RAISE(ARG_ERROR, INVALID_BC_ERROR_MSG);
200
204
  }
201
205
  }
202
206
 
207
+ void bc_destroy(BooleanClause *self)
208
+ {
209
+ self->query->destroy(self->query);
210
+ free(self);
211
+ }
212
+
203
213
  BooleanClause *bc_create(Query *query, unsigned int occur)
204
214
  {
205
215
  BooleanClause *self = ALLOC(BooleanClause);
@@ -312,9 +322,7 @@ char *bq_to_s(Query *self, char *field)
312
322
  }
313
323
 
314
324
  if (self->boost != 1.0) {
315
- char dbuf[32];
316
- dbl_to_s(dbuf, self->boost);
317
- char *boost_str = epstrdup(")^%s", strlen(dbuf), dbuf);
325
+ char *boost_str = strfmt(")^%f", self->boost);
318
326
  int boost_len = strlen(boost_str);
319
327
  REALLOC_N(buffer, char, bp + boost_len + 1);
320
328
  memcpy(buffer + bp, boost_str, sizeof(char) * boost_len);
@@ -329,12 +337,11 @@ void bq_destroy(void *p)
329
337
  {
330
338
  Query *self = (Query *)p;
331
339
  BooleanQuery *bq = (BooleanQuery *)self->data;
332
- BooleanClause *clause;
333
340
  int i;
334
- for (i = 0; i < bq->clause_cnt; i++) {
335
- clause = bq->clauses[i];
336
- if (self->destroy_all) clause->query->destroy(clause->query);
337
- free(clause);
341
+ if (self->destroy_all) {
342
+ for (i = 0; i < bq->clause_cnt; i++) {
343
+ bc_destroy(bq->clauses[i]);
344
+ }
338
345
  }
339
346
  free(bq->clauses);
340
347
  if (bq->similarity) {
@@ -387,19 +394,25 @@ Query *bq_create(bool coord_disabled)
387
394
  return self;
388
395
  }
389
396
 
390
- void bq_add_query(Query *self, Query *sub_query, unsigned int occur)
397
+ BooleanClause *bq_add_clause(Query *self, BooleanClause *bc)
391
398
  {
392
399
  BooleanQuery *bq = (BooleanQuery *)self->data;
393
- BooleanClause *bc = bc_create(sub_query, occur);
394
400
  if (bq->clause_cnt >= bq->clause_capa) {
395
401
  bq->clause_capa *= 2;
396
402
  REALLOC_N(bq->clauses, BooleanClause *, bq->clause_capa);
397
403
  }
398
404
  if (bq->clause_cnt > bq->max_clause_cnt) {
399
- eprintf(STATE_ERROR, "Too many clauses.");
405
+ RAISE(STATE_ERROR, TOO_MANY_CLAUSES_ERROR_MSG);
400
406
  }
401
407
  bq->clauses[bq->clause_cnt] = bc;
402
408
  bq->clause_cnt++;
409
+ return bc;
410
+ }
411
+
412
+ BooleanClause *bq_add_query(Query *self, Query *sub_query, unsigned int occur)
413
+ {
414
+ BooleanClause *bc = bc_create(sub_query, occur);
415
+ return bq_add_clause(self, bc);
403
416
  }
404
417
 
405
418
  /***************************************************************************
@@ -551,7 +564,7 @@ Explanation *dssc_explain(Scorer *self, int doc_num)
551
564
  DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;
552
565
  Scorer *sub_scorer;
553
566
  Explanation *e = expl_create(0.0,
554
- epstrdup("At least %d of:", 20, dssc->min_num_matches));
567
+ strfmt("At least %d of:", dssc->min_num_matches));
555
568
  for (i = 0; i < dssc->ss_cnt; i++) {
556
569
  sub_scorer = dssc->sub_scorers[i];
557
570
  expl_add_detail(e, sub_scorer->explain(sub_scorer, doc_num));
@@ -567,7 +580,7 @@ void dssc_destroy(void *p)
567
580
  for (i = 0; i < dssc->ss_cnt; i++) {
568
581
  dssc->sub_scorers[i]->destroy(dssc->sub_scorers[i]);
569
582
  }
570
- pq_destroy(dssc->scorer_queue);
583
+ if (dssc->scorer_queue) pq_destroy(dssc->scorer_queue);
571
584
  scorer_destroy(self);
572
585
  }
573
586
 
@@ -588,10 +601,10 @@ Scorer *disjunction_sum_scorer_create(Scorer **sub_scorers, int ss_cnt,
588
601
  dssc->coordinator = NULL;
589
602
 
590
603
  if (min_num_matches <= 0) {
591
- eprintf(ARG_ERROR, "Minimum nr of matches must be positive");
604
+ RAISE(ARG_ERROR, MIN_NUM_MATCHES_ERROR_MSG);
592
605
  }
593
606
  if (ss_cnt <= 1) {
594
- eprintf(ARG_ERROR, "There must be at least 2 sub_scorers");
607
+ RAISE(ARG_ERROR, TWO_SUB_ERROR_MSG);
595
608
  }
596
609
 
597
610
  dssc->min_num_matches = min_num_matches;
@@ -1231,7 +1244,7 @@ void bsc_add_scorer(Scorer *self, Scorer *scorer, unsigned int occur)
1231
1244
  bsc->prohibited_scorers[bsc->ps_cnt++] = scorer;
1232
1245
  break;
1233
1246
  default:
1234
- eprintf(ARG_ERROR, "Unknown value for occur <%d>\n", occur);
1247
+ RAISE(ARG_ERROR, UNKNOWN_OCCUR_VAL_ERROR_MSG);
1235
1248
  }
1236
1249
  }
1237
1250