ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -2,9 +2,18 @@
2
2
  #include <limits.h>
3
3
  #include "search.h"
4
4
  #include "array.h"
5
+ #include "symbol.h"
6
+ #include "internal.h"
5
7
 
6
8
  #define PhQ(query) ((PhraseQuery *)(query))
7
9
 
10
+ /**
11
+ * Used to sort the phrase positions into positional order. For phrase
12
+ * positions matching at the same position (a very unusual case) we order by
13
+ * first terms. The only real reason for the sorting by first terms is to get
14
+ * consistent order of positions when testing. Functionally it makes no
15
+ * difference.
16
+ */
8
17
  static int phrase_pos_cmp(const void *p1, const void *p2)
9
18
  {
10
19
  int pos1 = ((PhrasePosition *)p1)->pos;
@@ -43,6 +52,8 @@ typedef struct PhPos
43
52
  static bool pp_next(PhPos *self)
44
53
  {
45
54
  TermDocEnum *tpe = self->tpe;
55
+ assert(tpe);
56
+
46
57
  if (!tpe->next(tpe)) {
47
58
  tpe->close(tpe); /* close stream */
48
59
  self->tpe = NULL;
@@ -57,6 +68,8 @@ static bool pp_next(PhPos *self)
57
68
  static bool pp_skip_to(PhPos *self, int doc_num)
58
69
  {
59
70
  TermDocEnum *tpe = self->tpe;
71
+ assert(tpe);
72
+
60
73
  if (!tpe->skip_to(tpe, doc_num)) {
61
74
  tpe->close(tpe); /* close stream */
62
75
  self->tpe = NULL;
@@ -114,19 +127,15 @@ static int pp_pos_cmp(const void *const p1, const void *const p2)
114
127
 
115
128
  static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
116
129
  {
117
- /* docs will all be equal when this method is used */
118
- return pp1->position < pp2->position;
119
- /*
120
- if (PP(p)->doc == PP(p)->doc) {
121
- return PP(p)->position < PP(p)->position;
130
+ if (pp1->position == pp2->position) {
131
+ return pp1->offset < pp2->offset;
122
132
  }
123
133
  else {
124
- return PP(p)->doc < PP(p)->doc;
134
+ return pp1->position < pp2->position;
125
135
  }
126
- */
127
136
  }
128
137
 
129
- void pp_destroy(PhPos *pp)
138
+ static void pp_destroy(PhPos *pp)
130
139
  {
131
140
  if (pp->tpe) {
132
141
  pp->tpe->close(pp->tpe);
@@ -134,7 +143,7 @@ void pp_destroy(PhPos *pp)
134
143
  free(pp);
135
144
  }
136
145
 
137
- PhPos *pp_new(TermDocEnum *tpe, int offset)
146
+ static PhPos *pp_new(TermDocEnum *tpe, int offset)
138
147
  {
139
148
  PhPos *self = ALLOC(PhPos);
140
149
 
@@ -165,6 +174,7 @@ typedef struct PhraseScorer
165
174
  int slop;
166
175
  bool first_time : 1;
167
176
  bool more : 1;
177
+ bool check_repeats : 1;
168
178
  } PhraseScorer;
169
179
 
170
180
  static void phsc_init(PhraseScorer *phsc)
@@ -232,7 +242,7 @@ static float phsc_score(Scorer *self)
232
242
  /* normalize */
233
243
  return raw_score * sim_decode_norm(
234
244
  self->similarity,
235
- phsc->norms[phsc->phrase_pos[phsc->pp_first_idx]->doc]);
245
+ phsc->norms[self->doc]);
236
246
  }
237
247
 
238
248
  static bool phsc_next(Scorer *self)
@@ -276,8 +286,8 @@ static Explanation *phsc_explain(Scorer *self, int doc_num)
276
286
 
277
287
  phsc_skip_to(self, doc_num);
278
288
 
279
- phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;
280
- return expl_new(sim_tf(self->similarity, phrase_freq),
289
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0f;
290
+ return expl_new(sim_tf(self->similarity, phrase_freq),
281
291
  "tf(phrase_freq=%f)", phrase_freq);
282
292
  }
283
293
 
@@ -292,12 +302,17 @@ static void phsc_destroy(Scorer *self)
292
302
  scorer_destroy_i(self);
293
303
  }
294
304
 
295
- static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum,
305
+ static Scorer *phsc_new(Weight *weight,
306
+ TermDocEnum **term_pos_enum,
296
307
  PhrasePosition *positions, int pos_cnt,
297
- Similarity *similarity, uchar *norms)
308
+ Similarity *similarity,
309
+ uchar *norms,
310
+ int slop)
298
311
  {
299
312
  int i;
300
313
  Scorer *self = scorer_new(PhraseScorer, similarity);
314
+ HashSet *term_set = NULL;
315
+
301
316
 
302
317
  PhSc(self)->weight = weight;
303
318
  PhSc(self)->norms = norms;
@@ -305,14 +320,34 @@ static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum,
305
320
  PhSc(self)->phrase_pos = ALLOC_N(PhPos *, pos_cnt);
306
321
  PhSc(self)->pp_first_idx = 0;
307
322
  PhSc(self)->pp_cnt = pos_cnt;
308
- PhSc(self)->slop = 0;
323
+ PhSc(self)->slop = slop;
309
324
  PhSc(self)->first_time = true;
310
325
  PhSc(self)->more = true;
311
-
326
+ PhSc(self)->check_repeats = false;
327
+
328
+ if (slop) {
329
+ term_set = hs_new_str((free_ft)NULL);
330
+ }
312
331
  for (i = 0; i < pos_cnt; i++) {
332
+ /* check for repeats */
333
+ if (slop && !PhSc(self)->check_repeats) {
334
+ char **terms = positions[i].terms;
335
+ const int t_cnt = ary_size(terms);
336
+ int j;
337
+ for (j = 0; j < t_cnt; j++) {
338
+ if (hs_add(term_set, terms[j])) {
339
+ PhSc(self)->check_repeats = true;
340
+ break;
341
+ }
342
+ }
343
+ }
313
344
  PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
314
345
  }
315
346
 
347
+ if (slop) {
348
+ hs_destroy(term_set);
349
+ }
350
+
316
351
  self->score = &phsc_score;
317
352
  self->next = &phsc_next;
318
353
  self->skip_to = &phsc_skip_to;
@@ -363,7 +398,7 @@ static float ephsc_phrase_freq(Scorer *self)
363
398
  freq += 1.0; /* all equal: a match */
364
399
  } while (pp_next_position(last));
365
400
 
366
- /* maintain first position */
401
+ /* maintain first position */
367
402
  phsc->pp_first_idx = pp_first_idx;
368
403
  return freq;
369
404
  }
@@ -373,8 +408,13 @@ static Scorer *exact_phrase_scorer_new(Weight *weight,
373
408
  PhrasePosition *positions, int pp_cnt,
374
409
  Similarity *similarity, uchar *norms)
375
410
  {
376
- Scorer *self =
377
- phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
411
+ Scorer *self = phsc_new(weight,
412
+ term_pos_enum,
413
+ positions,
414
+ pp_cnt,
415
+ similarity,
416
+ norms,
417
+ 0);
378
418
 
379
419
  PhSc(self)->phrase_freq = &ephsc_phrase_freq;
380
420
  return self;
@@ -384,6 +424,33 @@ static Scorer *exact_phrase_scorer_new(Weight *weight,
384
424
  * SloppyPhraseScorer
385
425
  ***************************************************************************/
386
426
 
427
+ static bool sphsc_check_repeats(PhPos *pp,
428
+ PhPos **positions,
429
+ const int p_cnt)
430
+ {
431
+ int j;
432
+ for (j = 0; j < p_cnt; j++) {
433
+ PhPos *ppj = positions[j];
434
+ /* If offsets are equal, either we are at the current PhPos +pp+ or
435
+ * +pp+ and +ppj+ are supposed to match in the same position in which
436
+ * case we don't need to check. */
437
+ if (ppj->offset == pp->offset) {
438
+ continue;
439
+ }
440
+ /* the two phrase positions are matching on the same term
441
+ * which we want to avoid */
442
+ if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) {
443
+ if (!pp_next_position(pp)) {
444
+ /* We have no matches for this document */
445
+ return false;
446
+ }
447
+ /* we changed the position so we need to start checking again */
448
+ j = -1;
449
+ }
450
+ }
451
+ return true;
452
+ }
453
+
387
454
  static float sphsc_phrase_freq(Scorer *self)
388
455
  {
389
456
  PhraseScorer *phsc = PhSc(self);
@@ -393,11 +460,21 @@ static float sphsc_phrase_freq(Scorer *self)
393
460
 
394
461
  int last_pos = 0, pos, next_pos, start, match_length, i;
395
462
  bool done = false;
463
+ bool check_repeats = phsc->check_repeats;
396
464
  float freq = 0.0;
397
465
 
398
466
  for (i = 0; i < pp_cnt; i++) {
467
+ bool res;
399
468
  pp = phsc->phrase_pos[i];
400
- pp_first_position(pp);
469
+ /* we should always have at least one position or this function
470
+ * shouldn't have been called. */
471
+ res = pp_first_position(pp);
472
+ assert(res);(void)res;
473
+ if (check_repeats && i > 0) {
474
+ if (!sphsc_check_repeats(pp, phsc->phrase_pos, i - 1)) {
475
+ goto return_freq;
476
+ }
477
+ }
401
478
  if (pp->position > last_pos) {
402
479
  last_pos = pp->position;
403
480
  }
@@ -405,13 +482,15 @@ static float sphsc_phrase_freq(Scorer *self)
405
482
  }
406
483
 
407
484
  do {
408
- pp = pq_pop(pq);
485
+ pp = (PhPos *)pq_pop(pq);
409
486
  pos = start = pp->position;
410
487
  next_pos = PP(pq_top(pq))->position;
411
488
  while (pos <= next_pos) {
412
489
  start = pos; /* advance pp to min window */
413
- if (!pp_next_position(pp)) {
414
- done = true; /* ran out of a positions for a term - done */
490
+ if (!pp_next_position(pp)
491
+ || (check_repeats
492
+ && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) {
493
+ done = true;
415
494
  break;
416
495
  }
417
496
  pos = pp->position;
@@ -429,6 +508,8 @@ static float sphsc_phrase_freq(Scorer *self)
429
508
  pq_push(pq, pp); /* restore pq */
430
509
  } while (!done);
431
510
 
511
+ return_freq:
512
+
432
513
  pq_destroy(pq);
433
514
  return freq;
434
515
  }
@@ -439,10 +520,14 @@ static Scorer *sloppy_phrase_scorer_new(Weight *weight,
439
520
  int pp_cnt, Similarity *similarity,
440
521
  int slop, uchar *norms)
441
522
  {
442
- Scorer *self =
443
- phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
523
+ Scorer *self = phsc_new(weight,
524
+ term_pos_enum,
525
+ positions,
526
+ pp_cnt,
527
+ similarity,
528
+ norms,
529
+ slop);
444
530
 
445
- PhSc(self)->slop = slop;
446
531
  PhSc(self)->phrase_freq = &sphsc_phrase_freq;
447
532
  return self;
448
533
  }
@@ -467,7 +552,7 @@ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
467
552
  PhrasePosition *positions = phq->positions;
468
553
  const int pos_cnt = phq->pos_cnt;
469
554
  const int field_num = fis_get_field_num(ir->fis, phq->field);
470
-
555
+
471
556
  if (pos_cnt == 0 || field_num < 0) {
472
557
  return NULL;
473
558
  }
@@ -484,15 +569,8 @@ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
484
569
  else {
485
570
  tps[i] = mtdpe_new(ir, field_num, terms, t_cnt);
486
571
  }
487
- if (tps[i] == NULL) {
488
- /* free everything we just created and return NULL */
489
- int j;
490
- for (j = 0; j < i; j++) {
491
- tps[i]->close(tps[i]);
492
- }
493
- free(tps);
494
- return NULL;
495
- }
572
+ /* neither mtdpe_new nor ir->term_positions should return NULL */
573
+ assert(NULL != tps[i]);
496
574
  }
497
575
 
498
576
  if (phq->slop == 0) { /* optimize exact (common) case */
@@ -509,7 +587,7 @@ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
509
587
  return phsc;
510
588
  }
511
589
 
512
- Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
590
+ static Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
513
591
  {
514
592
  Explanation *expl;
515
593
  Explanation *idf_expl1;
@@ -530,12 +608,13 @@ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
530
608
  char *doc_freqs = NULL;
531
609
  size_t len = 0, pos = 0;
532
610
  const int field_num = fis_get_field_num(ir->fis, phq->field);
611
+ const char *field = S(phq->field);
533
612
 
534
613
  if (field_num < 0) {
535
- return expl_new(0.0, "field \"%s\" does not exist in the index", phq->field);
614
+ return expl_new(0.0, "field \"%s\" does not exist in the index", field);
536
615
  }
537
-
538
- query_str = self->query->to_s(self->query, "");
616
+
617
+ query_str = self->query->to_s(self->query, NULL);
539
618
 
540
619
  expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
541
620
 
@@ -554,16 +633,15 @@ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
554
633
  const int t_cnt = ary_size(terms);
555
634
  for (j = 0; j < t_cnt; j++) {
556
635
  char *term = terms[j];
557
- sprintf(doc_freqs + pos, "%s=%d, ",
558
- term, ir->doc_freq(ir, field_num, term));
559
- pos += strlen(doc_freqs + pos);
636
+ pos += sprintf(doc_freqs + pos, "%s=%d, ",
637
+ term, ir->doc_freq(ir, field_num, term));
560
638
  }
561
639
  }
562
640
  pos -= 2; /* remove ", " from the end */
563
641
  doc_freqs[pos] = 0;
564
642
 
565
- idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
566
- idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
643
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
644
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
567
645
  free(doc_freqs);
568
646
 
569
647
  /* explain query weight */
@@ -597,7 +675,7 @@ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
597
675
  ? sim_decode_norm(self->similarity, field_norms[doc_num])
598
676
  : (float)0.0;
599
677
  field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
600
- phq->field, doc_num);
678
+ field, doc_num);
601
679
 
602
680
  expl_add_detail(field_expl, field_norm_expl);
603
681
 
@@ -644,7 +722,7 @@ typedef struct TVPosEnum
644
722
  int size;
645
723
  int offset;
646
724
  int pos;
647
- int positions[];
725
+ int positions[1];
648
726
  } TVPosEnum;
649
727
 
650
728
  static bool tvpe_next(TVPosEnum *self)
@@ -684,8 +762,7 @@ static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
684
762
 
685
763
  static TVPosEnum *tvpe_new(int *positions, int size, int offset)
686
764
  {
687
- TVPosEnum *self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
688
- + size * sizeof(int));
765
+ TVPosEnum *self = (TVPosEnum*)emalloc(sizeof(TVPosEnum) + size*sizeof(int));
689
766
  memcpy(self->positions, positions, size * sizeof(int));
690
767
  self->size = size;
691
768
  self->offset = offset;
@@ -705,13 +782,11 @@ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, TermVector *tv,
705
782
  TVTerm *tv_term = tv_get_tv_term(tv, terms[i]);
706
783
  if (tv_term) {
707
784
  TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
708
- if (tvpe_next(tvpe)) {
709
- pq_push(tvpe_pq, tvpe);
710
- total_positions += tv_term->freq;
711
- }
712
- else {
713
- free(tvpe);
714
- }
785
+ /* got tv_term so tvpe_next should always return true once here */
786
+ bool res = tvpe_next(tvpe);
787
+ assert(res);(void)res;
788
+ pq_push(tvpe_pq, tvpe);
789
+ total_positions += tv_term->freq;
715
790
  }
716
791
  }
717
792
  if (tvpe_pq->size == 0) {
@@ -759,7 +834,7 @@ static TVPosEnum *get_tvpe(TermVector *tv, char **terms, int t_cnt, int offset)
759
834
  static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
760
835
  TermVector *tv)
761
836
  {
762
- if (strcmp(tv->field, PhQ(self)->field) == 0) {
837
+ if (tv->field == PhQ(self)->field) {
763
838
  const int pos_cnt = PhQ(self)->pos_cnt;
764
839
  int i;
765
840
  int slop = PhQ(self)->slop;
@@ -785,7 +860,7 @@ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
785
860
  }
786
861
  }
787
862
  while (! done) {
788
- TVPosEnum *tvpe = pq_pop(tvpe_pq);
863
+ TVPosEnum *tvpe = (TVPosEnum *)pq_pop(tvpe_pq);
789
864
  int pos;
790
865
  int start = pos = tvpe->pos;
791
866
  int next_pos = ((TVPosEnum *)pq_top(tvpe_pq))->pos;
@@ -840,7 +915,7 @@ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
840
915
 
841
916
  first = tvpe_a[0];
842
917
  last = tvpe_a[pos_cnt - 1];
843
-
918
+
844
919
  while (!done) {
845
920
  while (first->pos < last->pos) {
846
921
  if (tvpe_skip_to(first, last->pos)) {
@@ -855,7 +930,7 @@ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
855
930
  }
856
931
  if (!done) {
857
932
  matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
858
- tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
933
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
859
934
  }
860
935
  if (!tvpe_next(last)) {
861
936
  done = true;
@@ -887,19 +962,21 @@ static void phq_extract_terms(Query *self, HashSet *term_set)
887
962
  }
888
963
  }
889
964
 
890
- static char *phq_to_s(Query *self, const char *field)
965
+ static char *phq_to_s(Query *self, Symbol default_field)
891
966
  {
892
967
  PhraseQuery *phq = PhQ(self);
893
968
  const int pos_cnt = phq->pos_cnt;
894
969
  PhrasePosition *positions = phq->positions;
970
+ const char *field = S(phq->field);
971
+ int flen = strlen(field);
895
972
 
896
973
  int i, j, buf_index = 0, pos, last_pos;
897
974
  size_t len = 0;
898
975
  char *buffer;
899
976
 
900
977
  if (phq->pos_cnt == 0) {
901
- if (strcmp(field, phq->field) != 0) {
902
- return strfmt("%s:\"\"", phq->field);
978
+ if (default_field != phq->field) {
979
+ return strfmt("%s:\"\"", field);
903
980
  }
904
981
  else {
905
982
  return estrdup("\"\"");
@@ -909,7 +986,7 @@ static char *phq_to_s(Query *self, const char *field)
909
986
  /* sort the phrase positions by position */
910
987
  qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
911
988
 
912
- len = strlen(phq->field) + 1;
989
+ len = flen + 1;
913
990
 
914
991
  for (i = 0; i < pos_cnt; i++) {
915
992
  char **terms = phq->positions[i].terms;
@@ -924,11 +1001,10 @@ static char *phq_to_s(Query *self, const char *field)
924
1001
 
925
1002
  buffer = ALLOC_N(char, len);
926
1003
 
927
- if (strcmp(field, phq->field) != 0) {
928
- len = strlen(phq->field);
929
- memcpy(buffer, phq->field, len);
930
- buffer[len] = ':';
931
- buf_index += len + 1;
1004
+ if (default_field != phq->field) {
1005
+ memcpy(buffer, field, flen);
1006
+ buffer[flen] = ':';
1007
+ buf_index += flen + 1;
932
1008
  }
933
1009
 
934
1010
  buffer[buf_index++] = '"';
@@ -968,8 +1044,7 @@ static char *phq_to_s(Query *self, const char *field)
968
1044
  buffer[buf_index] = 0;
969
1045
 
970
1046
  if (phq->slop != 0) {
971
- sprintf(buffer + buf_index, "~%d", phq->slop);
972
- buf_index += strlen(buffer + buf_index);
1047
+ buf_index += sprintf(buffer + buf_index, "~%d", phq->slop);
973
1048
  }
974
1049
 
975
1050
  if (self->boost != 1.0) {
@@ -984,7 +1059,6 @@ static void phq_destroy(Query *self)
984
1059
  {
985
1060
  PhraseQuery *phq = PhQ(self);
986
1061
  int i;
987
- free(phq->field);
988
1062
  for (i = 0; i < phq->pos_cnt; i++) {
989
1063
  ary_destroy(phq->positions[i].terms, &free);
990
1064
  }
@@ -1024,12 +1098,12 @@ static unsigned long phq_hash(Query *self)
1024
1098
  {
1025
1099
  int i, j;
1026
1100
  PhraseQuery *phq = PhQ(self);
1027
- unsigned long hash = str_hash(phq->field);
1101
+ unsigned long hash = sym_hash(phq->field);
1028
1102
  for (i = 0; i < phq->pos_cnt; i++) {
1029
1103
  char **terms = phq->positions[i].terms;
1030
1104
  for (j = ary_size(terms) - 1; j >= 0; j--) {
1031
1105
  hash = (hash << 1) ^ (str_hash(terms[j])
1032
- ^ phq->positions[i].pos);
1106
+ ^ phq->positions[i].pos);
1033
1107
  }
1034
1108
  }
1035
1109
  return (hash ^ phq->slop);
@@ -1041,7 +1115,7 @@ static int phq_eq(Query *self, Query *o)
1041
1115
  PhraseQuery *phq1 = PhQ(self);
1042
1116
  PhraseQuery *phq2 = PhQ(o);
1043
1117
  if (phq1->slop != phq2->slop
1044
- || strcmp(phq1->field, phq2->field) != 0
1118
+ || phq1->field != phq2->field
1045
1119
  || phq1->pos_cnt != phq2->pos_cnt) {
1046
1120
  return false;
1047
1121
  }
@@ -1049,7 +1123,7 @@ static int phq_eq(Query *self, Query *o)
1049
1123
  char **terms1 = phq1->positions[i].terms;
1050
1124
  char **terms2 = phq2->positions[i].terms;
1051
1125
  const int t_cnt = ary_size(terms1);
1052
- if (t_cnt != ary_size(terms2)
1126
+ if (t_cnt != ary_size(terms2)
1053
1127
  || phq1->positions[i].pos != phq2->positions[i].pos) {
1054
1128
  return false;
1055
1129
  }
@@ -1062,11 +1136,11 @@ static int phq_eq(Query *self, Query *o)
1062
1136
  return true;
1063
1137
  }
1064
1138
 
1065
- Query *phq_new(const char *field)
1139
+ Query *phq_new(Symbol field)
1066
1140
  {
1067
1141
  Query *self = q_new(PhraseQuery);
1068
1142
 
1069
- PhQ(self)->field = estrdup(field);
1143
+ PhQ(self)->field = field;
1070
1144
  PhQ(self)->pos_cnt = 0;
1071
1145
  PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1072
1146
  PhQ(self)->positions = ALLOC_N(PhrasePosition, PhQ_INIT_CAPA);
@@ -1105,7 +1179,7 @@ void phq_add_term(Query *self, const char *term, int pos_inc)
1105
1179
  int position;
1106
1180
  if (phq->pos_cnt == 0) {
1107
1181
  position = 0;
1108
- }
1182
+ }
1109
1183
  else {
1110
1184
  position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1111
1185
  }
@@ -1124,3 +1198,8 @@ void phq_append_multi_term(Query *self, const char *term)
1124
1198
  ary_push(phq->positions[index].terms, estrdup(term));
1125
1199
  }
1126
1200
  }
1201
+
1202
+ void frt_phq_set_slop(FrtQuery *self, int slop)
1203
+ {
1204
+ PhQ(self)->slop = slop;
1205
+ }