ferret 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (187)
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
data/ext/ram_store.c CHANGED
@@ -1,6 +1,9 @@
1
1
  #include <string.h>
2
2
  #include <store.h>
3
3
 
4
+ static char * const RENAME_ERROR_MSG = "tried to rename a file that doesn't exist";
5
+ static char * const MISSING_RAMFILE_ERROR_MSG ="Couldn't open the ram file to read";
6
+
4
7
  typedef struct RamFile {
5
8
  char *name;
6
9
  uchar **buffers;
@@ -74,7 +77,7 @@ int ram_rename(Store *store, char *from, char *to)
74
77
  {
75
78
  RamFile *rf = (RamFile *)h_rem(store->dir.ht, from, false);
76
79
  if (rf == NULL)
77
- eprintf(IO_ERROR, "tried to rename a file that doesn't exist");
80
+ RAISE(IO_ERROR, RENAME_ERROR_MSG);
78
81
 
79
82
  free(rf->name);
80
83
 
@@ -335,7 +338,7 @@ InStream *ram_open_input(Store *store, const char *filename)
335
338
  {
336
339
  RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
337
340
  if (rf == NULL) {
338
- eprintf(IO_ERROR, "Couldn't open the ram file %s to read", filename);
341
+ RAISE(IO_ERROR, MISSING_RAMFILE_ERROR_MSG);
339
342
  }
340
343
  rf->refcnt++;
341
344
  InStream *is = is_create();
data/ext/search.c CHANGED
@@ -1,6 +1,9 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
3
 
4
+ static char * const NUM_DOCS_ARG_ERROR_MSG = "num_docs must be > 0 to run a search";
5
+ static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a search";
6
+
4
7
  /***************************************************************************
5
8
  *
6
9
  * Explanation
@@ -44,15 +47,11 @@ Explanation *expl_add_detail(Explanation *self, Explanation *detail)
44
47
  char *expl_to_s(Explanation *self, int depth)
45
48
  {
46
49
  int i;
47
- char dbuf[32];
48
50
  char *buffer = ALLOC_N(char, depth * 2 + 1);
49
51
  memset(buffer, ' ', sizeof(char) * depth * 2);
50
52
  buffer[depth*2] = 0;
51
53
 
52
- dbl_to_s(dbuf, self->value);
53
- buffer = estrcat(buffer, epstrdup("%s = %s\n",
54
- strlen(dbuf) + strlen(self->description),
55
- dbuf, self->description));
54
+ buffer = estrcat(buffer, strfmt("%f = %s\n", self->value, self->description));
56
55
  for (i = 0; i < self->dcnt; i++) {
57
56
  buffer = estrcat(buffer, expl_to_s(self->details[i], depth + 1));
58
57
  }
@@ -63,12 +62,8 @@ char *expl_to_s(Explanation *self, int depth)
63
62
  char *expl_to_html(Explanation *self)
64
63
  {
65
64
  int i;
66
- char dbuf[32];
67
65
  char *buffer;
68
- dbl_to_s(dbuf, self->value);
69
- buffer = epstrdup("<ul>\n<li>%s = %s</li>\n",
70
- strlen(dbuf) + strlen(self->description),
71
- dbuf, self->description);
66
+ buffer = strfmt("<ul>\n<li>%f = %s</li>\n", self->value, self->description);
72
67
 
73
68
  for (i = 0; i < self->dcnt; i++) {
74
69
  estrcat(buffer, expl_to_html(self->details[i]));
@@ -193,13 +188,11 @@ void td_destroy(void *p)
193
188
  char *td_to_s(TopDocs *td)
194
189
  {
195
190
  int i;
196
- char dbuf[32];
197
191
  Hit *hit;
198
- char *buffer = epstrdup("%d hits sorted by <score, doc_num>\n", 20, td->total_hits);
192
+ char *buffer = strfmt("%d hits sorted by <score, doc_num>\n", td->total_hits);
199
193
  for (i = 0; i < td->size; i++) {
200
194
  hit = td->hits[i];
201
- dbl_to_s(dbuf, hit->score);
202
- estrcat(buffer, epstrdup("\t%d:%s\n", 52, hit->doc, dbuf));
195
+ estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
203
196
  }
204
197
  return buffer;
205
198
  }
@@ -389,10 +382,10 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
389
382
 
390
383
 
391
384
  if (num_docs <= 0)
392
- eprintf(ARG_ERROR, "num_docs must be > 0 to run a search");
385
+ RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
393
386
 
394
387
  if (first_doc < 0)
395
- eprintf(ARG_ERROR, "first_doc must be >= 0 to run a search");
388
+ RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
396
389
 
397
390
  weight = q_weight(query, self);
398
391
  scorer = weight->scorer(weight, self->ir);
@@ -498,7 +491,7 @@ Similarity *sea_get_similarity(Searcher *self)
498
491
 
499
492
  void sea_close(Searcher *self)
500
493
  {
501
- if (self->ir)
494
+ if (self->ir && self->close_ir)
502
495
  ir_close(self->ir);
503
496
  free(self);
504
497
  }
@@ -507,6 +500,7 @@ Searcher *sea_create(IndexReader *ir)
507
500
  {
508
501
  Searcher *self = ALLOC(Searcher);
509
502
  self->ir = ir;
503
+ self->close_ir = true;
510
504
  self->similarity = sim_create_default();
511
505
  self->doc_freq = &sea_doc_freq;
512
506
  self->doc_freqs = &sea_doc_freqs;
data/ext/search.h CHANGED
@@ -1,53 +1,13 @@
1
1
  #ifndef FRT_SEARCH_H
2
2
  #define FRT_SEARCH_H
3
3
 
4
- typedef struct Similarity Similarity;
5
4
  typedef struct Query Query;
6
5
  typedef struct Weight Weight;
7
6
  typedef struct Scorer Scorer;
8
- typedef struct Searcher Searcher;
9
7
 
10
8
  #include "index.h"
11
9
  #include "bitvector.h"
12
-
13
- /***************************************************************************
14
- *
15
- * Similarity
16
- *
17
- ***************************************************************************/
18
-
19
- struct Similarity {
20
- void *data;
21
- float norm_table[256];
22
- float (*length_norm)(Similarity *self, char *field, int num_terms);
23
- float (*query_norm)(Similarity *self, float sum_of_squared_weights);
24
- float (*tf)(Similarity *self, float freq);
25
- float (*sloppy_freq)(Similarity *self, int distance);
26
- float (*idf_term)(Similarity *self, Term *term, Searcher *searcher);
27
- float (*idf_phrase)(Similarity *self, Term **terms, int tcnt, Searcher *searcher);
28
- float (*idf)(Similarity *self, int doc_freq, int num_docs);
29
- float (*coord)(Similarity *self, int overlap, int max_overlap);
30
- float (*decode_norm)(Similarity *self, uchar b);
31
- float (*encode_norm)(Similarity *self, float f);
32
- void (*destroy)(void *p);
33
- };
34
-
35
- #define sim_length_norm(msim, field, num_terms) msim->length_norm(msim, field, num_terms)
36
- #define sim_query_norm(msim, sosw) msim->query_norm(msim, sosw)
37
- #define sim_tf(msim, freq) msim->tf(msim, freq)
38
- #define sim_sloppy_freq(msim, distance) msim->sloppy_freq(msim, distance)
39
- #define sim_idf_term(msim, term, searcher) msim->idf_term(msim, term, searcher)
40
- #define sim_idf_phrase(msim, terms, tcnt, searcher) msim->idf_phrase(msim, terms, tcnt, searcher)
41
- #define sim_idf(msim, doc_freq, num_docs) msim->idf(msim, doc_freq, num_docs)
42
- #define sim_coord(msim, overlap, max_overlap) msim->coord(msim, overlap, max_overlap)
43
- #define sim_decode_norm(msim, b) msim->decode_norm(msim, b)
44
- #define sim_encode_norm(msim, f) msim->encode_norm(msim, f)
45
- #define sim_destroy(msim) msim->destroy(msim)
46
-
47
- float byte_to_float(uchar b);
48
- uchar float_to_byte(float f);
49
-
50
- Similarity *sim_create_default();
10
+ #include "similarity.h"
51
11
 
52
12
  /***************************************************************************
53
13
  *
@@ -235,6 +195,7 @@ enum QUERY_TYPE {
235
195
  PHRASE_QUERY,
236
196
  MULTI_PHRASE_QUERY,
237
197
  CONSTANT_QUERY,
198
+ FILTERED_QUERY,
238
199
  MATCH_ALL_QUERY,
239
200
  RANGE_QUERY,
240
201
  WILD_CARD_QUERY,
@@ -306,6 +267,7 @@ typedef struct BooleanClause {
306
267
  } BooleanClause;
307
268
 
308
269
  BooleanClause *bc_create(Query *query, unsigned int occur);
270
+ void bc_destroy(BooleanClause *self);
309
271
  void bc_set_occur(BooleanClause *self, unsigned int occur);
310
272
 
311
273
  /***************************************************************************
@@ -326,7 +288,8 @@ typedef struct BooleanQuery {
326
288
  } BooleanQuery;
327
289
 
328
290
  Query *bq_create(bool coord_disabled);
329
- void bq_add_query(Query *self, Query *sub_query, unsigned int occur);
291
+ BooleanClause *bq_add_query(Query *self, Query *sub_query, unsigned int occur);
292
+ BooleanClause *bq_add_clause(Query *self, BooleanClause *bc);
330
293
 
331
294
  /***************************************************************************
332
295
  *
@@ -420,15 +383,15 @@ Query *csq_create(Filter *filter);
420
383
 
421
384
  /***************************************************************************
422
385
  *
423
- * MatchAllQuery
386
+ * FilteredQueryQuery
424
387
  *
425
388
  ***************************************************************************/
426
389
 
427
- Query *maq_create();
390
+ Query *fq_create(Query *query, Filter *filter);
428
391
 
429
392
  /***************************************************************************
430
393
  *
431
- * ConstantScoreQuery
394
+ * MatchAllQuery
432
395
  *
433
396
  ***************************************************************************/
434
397
 
@@ -453,6 +416,17 @@ Query *rq_create(const char *field, char *lower_term, char *upper_term,
453
416
  Query *rq_create_less(const char *field, char *upper_term, bool include_upper);
454
417
  Query *rq_create_more(const char *field, char *lower_term, bool include_lower);
455
418
 
419
+ /***************************************************************************
420
+ *
421
+ * FilteredQuery
422
+ *
423
+ ***************************************************************************/
424
+
425
+ typedef struct FilteredQuery {
426
+ Query *query;
427
+ Filter *filter;
428
+ } FilteredQuery;
429
+
456
430
  /***************************************************************************
457
431
  *
458
432
  * SpanQuery
@@ -949,6 +923,7 @@ PriorityQueue *fshq_pq_create(int size, Sort *sort, IndexReader *ir);
949
923
  struct Searcher {
950
924
  IndexReader *ir;
951
925
  Similarity *similarity;
926
+ bool close_ir : 1;
952
927
  int (*doc_freq)(Searcher *self, Term *term);
953
928
  int *(*doc_freqs)(Searcher *self, Term **terms, int tcnt);
954
929
  Document *(*get_doc)(Searcher *self, int doc_num);
@@ -1032,6 +1007,7 @@ typedef struct Index {
1032
1007
  bool use_compound_file : 1;
1033
1008
  bool auto_flush : 1;
1034
1009
  bool has_writes : 1;
1010
+ bool check_latest : 1;
1035
1011
  } Index;
1036
1012
 
1037
1013
  Index *index_create(Store *store, Analyzer *analyzer, HashSet *def_fields,
data/ext/similarity.h ADDED
@@ -0,0 +1,67 @@
1
+ #ifndef FRT_SIMILARITY_H
2
+ #define FRT_SIMILARITY_H
3
+
4
+ typedef struct Searcher Searcher;
5
+
6
+ /****************************************************************************
7
+ *
8
+ * Term
9
+ *
10
+ ****************************************************************************/
11
+
12
+ typedef struct Term {
13
+ char *field;
14
+ char *text;
15
+ } Term;
16
+
17
+ Term *term_clone(Term *term);
18
+ Term *term_create(const char *field, char *text);
19
+ void term_destroy(void *p);
20
+ int term_cmp(void *t1, void *t2);
21
+ int term_eq(const void *t1, const void *t2);
22
+ unsigned int term_hash(const void *t);
23
+ char *term_to_s(Term *term);
24
+
25
+ /***************************************************************************
26
+ *
27
+ * Similarity
28
+ *
29
+ ***************************************************************************/
30
+
31
+ typedef struct Similarity Similarity;
32
+
33
+ struct Similarity {
34
+ void *data;
35
+ float norm_table[256];
36
+ float (*length_norm)(Similarity *self, char *field, int num_terms);
37
+ float (*query_norm)(Similarity *self, float sum_of_squared_weights);
38
+ float (*tf)(Similarity *self, float freq);
39
+ float (*sloppy_freq)(Similarity *self, int distance);
40
+ float (*idf_term)(Similarity *self, Term *term, Searcher *searcher);
41
+ float (*idf_phrase)(Similarity *self, Term **terms,
42
+ int tcnt, Searcher *searcher);
43
+ float (*idf)(Similarity *self, int doc_freq, int num_docs);
44
+ float (*coord)(Similarity *self, int overlap, int max_overlap);
45
+ float (*decode_norm)(Similarity *self, uchar b);
46
+ float (*encode_norm)(Similarity *self, float f);
47
+ void (*destroy)(void *p);
48
+ };
49
+
50
+ #define sim_length_norm(msim, field, num_terms) msim->length_norm(msim, field, num_terms)
51
+ #define sim_query_norm(msim, sosw) msim->query_norm(msim, sosw)
52
+ #define sim_tf(msim, freq) msim->tf(msim, freq)
53
+ #define sim_sloppy_freq(msim, distance) msim->sloppy_freq(msim, distance)
54
+ #define sim_idf_term(msim, term, searcher) msim->idf_term(msim, term, searcher)
55
+ #define sim_idf_phrase(msim, terms, tcnt, searcher) msim->idf_phrase(msim, terms, tcnt, searcher)
56
+ #define sim_idf(msim, doc_freq, num_docs) msim->idf(msim, doc_freq, num_docs)
57
+ #define sim_coord(msim, overlap, max_overlap) msim->coord(msim, overlap, max_overlap)
58
+ #define sim_decode_norm(msim, b) msim->decode_norm(msim, b)
59
+ #define sim_encode_norm(msim, f) msim->encode_norm(msim, f)
60
+ #define sim_destroy(msim) msim->destroy(msim)
61
+
62
+ float byte_to_float(uchar b);
63
+ uchar float_to_byte(float f);
64
+
65
+ Similarity *sim_create_default();
66
+
67
+ #endif
data/ext/sort.c CHANGED
@@ -2,6 +2,8 @@
2
2
  #include "search.h"
3
3
  #include "index.h"
4
4
 
5
+ static char * const NO_TERM_ERROR_MSG = "no terms in field to sort by";
6
+
5
7
  /***************************************************************************
6
8
  *
7
9
  * SortField
@@ -378,8 +380,8 @@ void *field_cache_get_index(IndexReader *ir, SortField *sf)
378
380
  int length = 0;
379
381
  Term term;
380
382
  TermBuffer *tb;
381
- TermEnum *te;
382
- TermDocEnum *tde;
383
+ TermEnum *volatile te = NULL;
384
+ TermDocEnum *volatile tde = NULL;
383
385
  char *field = sf->field;
384
386
  SortField *sf_clone;
385
387
 
@@ -393,29 +395,32 @@ void *field_cache_get_index(IndexReader *ir, SortField *sf)
393
395
  if (index == NULL) {
394
396
  length = ir->max_doc(ir);
395
397
  if (length > 0) {
396
- tde = ir->term_docs(ir);
397
- term.field = field;
398
- term.text = "";
399
- te = ir->terms_from(ir, &term);
400
- if (te->tb_curr == NULL) {
401
- eprintf(ARG_ERROR, "no terms in field '%s' to sort by", field);
402
- }
403
-
404
- if (sf->type == SORT_TYPE_AUTO) {
405
- sort_field_auto_evaluate(sf, te->tb_curr->text);
406
- }
407
-
408
- index = sf->create_index(length);
409
-
410
- do {
411
- tb = te->tb_curr;
412
- if (strcmp(tb->field, field) != 0) break;
413
- term.text = tb->text;
414
- tde->seek(tde, &term);
415
- sf->handle_term(index, tde, tb->text);
416
- } while (te->next(te));
417
- tde->close(tde);
418
- te->close(te);
398
+ TRY
399
+ tde = ir->term_docs(ir);
400
+ term.field = field;
401
+ term.text = "";
402
+ te = ir->terms_from(ir, &term);
403
+ if (te->tb_curr == NULL) {
404
+ RAISE(ARG_ERROR, NO_TERM_ERROR_MSG);
405
+ }
406
+
407
+ if (sf->type == SORT_TYPE_AUTO) {
408
+ sort_field_auto_evaluate(sf, te->tb_curr->text);
409
+ }
410
+
411
+ index = sf->create_index(length);
412
+
413
+ do {
414
+ tb = te->tb_curr;
415
+ if (strcmp(tb->field, field) != 0) break;
416
+ term.text = tb->text;
417
+ tde->seek(tde, &term);
418
+ sf->handle_term(index, tde, tb->text);
419
+ } while (te->next(te));
420
+ XFINALLY
421
+ tde->close(tde);
422
+ te->close(te);
423
+ XENDTRY
419
424
  }
420
425
  sf_clone = sort_field_clone(sf);
421
426
  sf_clone->index = index;
data/ext/stem_ISO_8859_1_danish.c ADDED
@@ -0,0 +1,338 @@
1
+
2
+ /* This file was generated automatically by the Snowball to ANSI C compiler */
3
+
4
+ #include "header.h"
5
+
6
+ extern int danish_ISO_8859_1_stem(struct SN_env * z);
7
+ static int r_undouble(struct SN_env * z);
8
+ static int r_other_suffix(struct SN_env * z);
9
+ static int r_consonant_pair(struct SN_env * z);
10
+ static int r_main_suffix(struct SN_env * z);
11
+ static int r_mark_regions(struct SN_env * z);
12
+
13
+ extern struct SN_env * danish_ISO_8859_1_create_env(void);
14
+ extern void danish_ISO_8859_1_close_env(struct SN_env * z);
15
+
16
+ static symbol s_0_0[3] = { 'h', 'e', 'd' };
17
+ static symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' };
18
+ static symbol s_0_2[4] = { 'e', 'r', 'e', 'd' };
19
+ static symbol s_0_3[1] = { 'e' };
20
+ static symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' };
21
+ static symbol s_0_5[4] = { 'e', 'n', 'd', 'e' };
22
+ static symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' };
23
+ static symbol s_0_7[3] = { 'e', 'n', 'e' };
24
+ static symbol s_0_8[4] = { 'e', 'r', 'n', 'e' };
25
+ static symbol s_0_9[3] = { 'e', 'r', 'e' };
26
+ static symbol s_0_10[2] = { 'e', 'n' };
27
+ static symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' };
28
+ static symbol s_0_12[4] = { 'e', 'r', 'e', 'n' };
29
+ static symbol s_0_13[2] = { 'e', 'r' };
30
+ static symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' };
31
+ static symbol s_0_15[4] = { 'e', 'r', 'e', 'r' };
32
+ static symbol s_0_16[1] = { 's' };
33
+ static symbol s_0_17[4] = { 'h', 'e', 'd', 's' };
34
+ static symbol s_0_18[2] = { 'e', 's' };
35
+ static symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' };
36
+ static symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' };
37
+ static symbol s_0_21[4] = { 'e', 'n', 'e', 's' };
38
+ static symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' };
39
+ static symbol s_0_23[4] = { 'e', 'r', 'e', 's' };
40
+ static symbol s_0_24[3] = { 'e', 'n', 's' };
41
+ static symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' };
42
+ static symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' };
43
+ static symbol s_0_27[3] = { 'e', 'r', 's' };
44
+ static symbol s_0_28[3] = { 'e', 't', 's' };
45
+ static symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' };
46
+ static symbol s_0_30[2] = { 'e', 't' };
47
+ static symbol s_0_31[4] = { 'e', 'r', 'e', 't' };
48
+
49
+ static struct among a_0[32] =
50
+ {
51
+ /* 0 */ { 3, s_0_0, -1, 1, 0},
52
+ /* 1 */ { 5, s_0_1, 0, 1, 0},
53
+ /* 2 */ { 4, s_0_2, -1, 1, 0},
54
+ /* 3 */ { 1, s_0_3, -1, 1, 0},
55
+ /* 4 */ { 5, s_0_4, 3, 1, 0},
56
+ /* 5 */ { 4, s_0_5, 3, 1, 0},
57
+ /* 6 */ { 6, s_0_6, 5, 1, 0},
58
+ /* 7 */ { 3, s_0_7, 3, 1, 0},
59
+ /* 8 */ { 4, s_0_8, 3, 1, 0},
60
+ /* 9 */ { 3, s_0_9, 3, 1, 0},
61
+ /* 10 */ { 2, s_0_10, -1, 1, 0},
62
+ /* 11 */ { 5, s_0_11, 10, 1, 0},
63
+ /* 12 */ { 4, s_0_12, 10, 1, 0},
64
+ /* 13 */ { 2, s_0_13, -1, 1, 0},
65
+ /* 14 */ { 5, s_0_14, 13, 1, 0},
66
+ /* 15 */ { 4, s_0_15, 13, 1, 0},
67
+ /* 16 */ { 1, s_0_16, -1, 2, 0},
68
+ /* 17 */ { 4, s_0_17, 16, 1, 0},
69
+ /* 18 */ { 2, s_0_18, 16, 1, 0},
70
+ /* 19 */ { 5, s_0_19, 18, 1, 0},
71
+ /* 20 */ { 7, s_0_20, 19, 1, 0},
72
+ /* 21 */ { 4, s_0_21, 18, 1, 0},
73
+ /* 22 */ { 5, s_0_22, 18, 1, 0},
74
+ /* 23 */ { 4, s_0_23, 18, 1, 0},
75
+ /* 24 */ { 3, s_0_24, 16, 1, 0},
76
+ /* 25 */ { 6, s_0_25, 24, 1, 0},
77
+ /* 26 */ { 5, s_0_26, 24, 1, 0},
78
+ /* 27 */ { 3, s_0_27, 16, 1, 0},
79
+ /* 28 */ { 3, s_0_28, 16, 1, 0},
80
+ /* 29 */ { 5, s_0_29, 28, 1, 0},
81
+ /* 30 */ { 2, s_0_30, -1, 1, 0},
82
+ /* 31 */ { 4, s_0_31, 30, 1, 0}
83
+ };
84
+
85
+ static symbol s_1_0[2] = { 'g', 'd' };
86
+ static symbol s_1_1[2] = { 'd', 't' };
87
+ static symbol s_1_2[2] = { 'g', 't' };
88
+ static symbol s_1_3[2] = { 'k', 't' };
89
+
90
+ static struct among a_1[4] =
91
+ {
92
+ /* 0 */ { 2, s_1_0, -1, -1, 0},
93
+ /* 1 */ { 2, s_1_1, -1, -1, 0},
94
+ /* 2 */ { 2, s_1_2, -1, -1, 0},
95
+ /* 3 */ { 2, s_1_3, -1, -1, 0}
96
+ };
97
+
98
+ static symbol s_2_0[2] = { 'i', 'g' };
99
+ static symbol s_2_1[3] = { 'l', 'i', 'g' };
100
+ static symbol s_2_2[4] = { 'e', 'l', 'i', 'g' };
101
+ static symbol s_2_3[3] = { 'e', 'l', 's' };
102
+ static symbol s_2_4[4] = { 'l', 0xF8, 's', 't' };
103
+
104
+ static struct among a_2[5] =
105
+ {
106
+ /* 0 */ { 2, s_2_0, -1, 1, 0},
107
+ /* 1 */ { 3, s_2_1, 0, 1, 0},
108
+ /* 2 */ { 4, s_2_2, 1, 1, 0},
109
+ /* 3 */ { 3, s_2_3, -1, 1, 0},
110
+ /* 4 */ { 4, s_2_4, -1, 2, 0}
111
+ };
112
+
113
+ static unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
114
+
115
+ static unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 };
116
+
117
+ static symbol s_0[] = { 's', 't' };
118
+ static symbol s_1[] = { 'i', 'g' };
119
+ static symbol s_2[] = { 'l', 0xF8, 's' };
120
+
121
+ static int r_mark_regions(struct SN_env * z) {
122
+ z->I[0] = z->l;
123
+ { int c_test = z->c; /* test, line 33 */
124
+ { int c = z->c + 3;
125
+ if (0 > c || c > z->l) return 0;
126
+ z->c = c; /* hop, line 33 */
127
+ }
128
+ z->I[1] = z->c; /* setmark x, line 33 */
129
+ z->c = c_test;
130
+ }
131
+ while(1) { /* goto, line 34 */
132
+ int c = z->c;
133
+ if (!(in_grouping(z, g_v, 97, 248))) goto lab0;
134
+ z->c = c;
135
+ break;
136
+ lab0:
137
+ z->c = c;
138
+ if (z->c >= z->l) return 0;
139
+ z->c++; /* goto, line 34 */
140
+ }
141
+ while(1) { /* gopast, line 34 */
142
+ if (!(out_grouping(z, g_v, 97, 248))) goto lab1;
143
+ break;
144
+ lab1:
145
+ if (z->c >= z->l) return 0;
146
+ z->c++; /* gopast, line 34 */
147
+ }
148
+ z->I[0] = z->c; /* setmark p1, line 34 */
149
+ /* try, line 35 */
150
+ if (!(z->I[0] < z->I[1])) goto lab2;
151
+ z->I[0] = z->I[1];
152
+ lab2:
153
+ return 1;
154
+ }
155
+
156
+ static int r_main_suffix(struct SN_env * z) {
157
+ int among_var;
158
+ { int m3; /* setlimit, line 41 */
159
+ int m = z->l - z->c; (void) m;
160
+ if (z->c < z->I[0]) return 0;
161
+ z->c = z->I[0]; /* tomark, line 41 */
162
+ m3 = z->lb; z->lb = z->c;
163
+ z->c = z->l - m;
164
+ z->ket = z->c; /* [, line 41 */
165
+ among_var = find_among_b(z, a_0, 32); /* substring, line 41 */
166
+ if (!(among_var)) { z->lb = m3; return 0; }
167
+ z->bra = z->c; /* ], line 41 */
168
+ z->lb = m3;
169
+ }
170
+ switch(among_var) {
171
+ case 0: return 0;
172
+ case 1:
173
+ { int ret;
174
+ ret = slice_del(z); /* delete, line 48 */
175
+ if (ret < 0) return ret;
176
+ }
177
+ break;
178
+ case 2:
179
+ if (!(in_grouping_b(z, g_s_ending, 97, 229))) return 0;
180
+ { int ret;
181
+ ret = slice_del(z); /* delete, line 50 */
182
+ if (ret < 0) return ret;
183
+ }
184
+ break;
185
+ }
186
+ return 1;
187
+ }
188
+
189
+ static int r_consonant_pair(struct SN_env * z) {
190
+ { int m_test = z->l - z->c; /* test, line 55 */
191
+ { int m3; /* setlimit, line 56 */
192
+ int m = z->l - z->c; (void) m;
193
+ if (z->c < z->I[0]) return 0;
194
+ z->c = z->I[0]; /* tomark, line 56 */
195
+ m3 = z->lb; z->lb = z->c;
196
+ z->c = z->l - m;
197
+ z->ket = z->c; /* [, line 56 */
198
+ if (!(find_among_b(z, a_1, 4))) { z->lb = m3; return 0; } /* substring, line 56 */
199
+ z->bra = z->c; /* ], line 56 */
200
+ z->lb = m3;
201
+ }
202
+ z->c = z->l - m_test;
203
+ }
204
+ if (z->c <= z->lb) return 0;
205
+ z->c--; /* next, line 62 */
206
+ z->bra = z->c; /* ], line 62 */
207
+ { int ret;
208
+ ret = slice_del(z); /* delete, line 62 */
209
+ if (ret < 0) return ret;
210
+ }
211
+ return 1;
212
+ }
213
+
214
+ static int r_other_suffix(struct SN_env * z) {
215
+ int among_var;
216
+ { int m = z->l - z->c; (void) m; /* do, line 66 */
217
+ z->ket = z->c; /* [, line 66 */
218
+ if (!(eq_s_b(z, 2, s_0))) goto lab0;
219
+ z->bra = z->c; /* ], line 66 */
220
+ if (!(eq_s_b(z, 2, s_1))) goto lab0;
221
+ { int ret;
222
+ ret = slice_del(z); /* delete, line 66 */
223
+ if (ret < 0) return ret;
224
+ }
225
+ lab0:
226
+ z->c = z->l - m;
227
+ }
228
+ { int m3; /* setlimit, line 67 */
229
+ int m = z->l - z->c; (void) m;
230
+ if (z->c < z->I[0]) return 0;
231
+ z->c = z->I[0]; /* tomark, line 67 */
232
+ m3 = z->lb; z->lb = z->c;
233
+ z->c = z->l - m;
234
+ z->ket = z->c; /* [, line 67 */
235
+ among_var = find_among_b(z, a_2, 5); /* substring, line 67 */
236
+ if (!(among_var)) { z->lb = m3; return 0; }
237
+ z->bra = z->c; /* ], line 67 */
238
+ z->lb = m3;
239
+ }
240
+ switch(among_var) {
241
+ case 0: return 0;
242
+ case 1:
243
+ { int ret;
244
+ ret = slice_del(z); /* delete, line 70 */
245
+ if (ret < 0) return ret;
246
+ }
247
+ { int m = z->l - z->c; (void) m; /* do, line 70 */
248
+ { int ret = r_consonant_pair(z);
249
+ if (ret == 0) goto lab1; /* call consonant_pair, line 70 */
250
+ if (ret < 0) return ret;
251
+ }
252
+ lab1:
253
+ z->c = z->l - m;
254
+ }
255
+ break;
256
+ case 2:
257
+ { int ret;
258
+ ret = slice_from_s(z, 3, s_2); /* <-, line 72 */
259
+ if (ret < 0) return ret;
260
+ }
261
+ break;
262
+ }
263
+ return 1;
264
+ }
265
+
266
+ static int r_undouble(struct SN_env * z) {
267
+ { int m3; /* setlimit, line 76 */
268
+ int m = z->l - z->c; (void) m;
269
+ if (z->c < z->I[0]) return 0;
270
+ z->c = z->I[0]; /* tomark, line 76 */
271
+ m3 = z->lb; z->lb = z->c;
272
+ z->c = z->l - m;
273
+ z->ket = z->c; /* [, line 76 */
274
+ if (!(out_grouping_b(z, g_v, 97, 248))) { z->lb = m3; return 0; }
275
+ z->bra = z->c; /* ], line 76 */
276
+ z->S[0] = slice_to(z, z->S[0]); /* -> ch, line 76 */
277
+ if (z->S[0] == 0) return -1; /* -> ch, line 76 */
278
+ z->lb = m3;
279
+ }
280
+ if (!(eq_v_b(z, z->S[0]))) return 0; /* name ch, line 77 */
281
+ { int ret;
282
+ ret = slice_del(z); /* delete, line 78 */
283
+ if (ret < 0) return ret;
284
+ }
285
+ return 1;
286
+ }
287
+
288
+ extern int danish_ISO_8859_1_stem(struct SN_env * z) {
289
+ { int c = z->c; /* do, line 84 */
290
+ { int ret = r_mark_regions(z);
291
+ if (ret == 0) goto lab0; /* call mark_regions, line 84 */
292
+ if (ret < 0) return ret;
293
+ }
294
+ lab0:
295
+ z->c = c;
296
+ }
297
+ z->lb = z->c; z->c = z->l; /* backwards, line 85 */
298
+
299
+ { int m = z->l - z->c; (void) m; /* do, line 86 */
300
+ { int ret = r_main_suffix(z);
301
+ if (ret == 0) goto lab1; /* call main_suffix, line 86 */
302
+ if (ret < 0) return ret;
303
+ }
304
+ lab1:
305
+ z->c = z->l - m;
306
+ }
307
+ { int m = z->l - z->c; (void) m; /* do, line 87 */
308
+ { int ret = r_consonant_pair(z);
309
+ if (ret == 0) goto lab2; /* call consonant_pair, line 87 */
310
+ if (ret < 0) return ret;
311
+ }
312
+ lab2:
313
+ z->c = z->l - m;
314
+ }
315
+ { int m = z->l - z->c; (void) m; /* do, line 88 */
316
+ { int ret = r_other_suffix(z);
317
+ if (ret == 0) goto lab3; /* call other_suffix, line 88 */
318
+ if (ret < 0) return ret;
319
+ }
320
+ lab3:
321
+ z->c = z->l - m;
322
+ }
323
+ { int m = z->l - z->c; (void) m; /* do, line 89 */
324
+ { int ret = r_undouble(z);
325
+ if (ret == 0) goto lab4; /* call undouble, line 89 */
326
+ if (ret < 0) return ret;
327
+ }
328
+ lab4:
329
+ z->c = z->l - m;
330
+ }
331
+ z->c = z->lb;
332
+ return 1;
333
+ }
334
+
335
+ extern struct SN_env * danish_ISO_8859_1_create_env(void) { return SN_create_env(1, 2, 0); }
336
+
337
+ extern void danish_ISO_8859_1_close_env(struct SN_env * z) { SN_close_env(z); }
338
+