ferret 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/similarity.c CHANGED
@@ -29,58 +29,61 @@ setup_endian()
29
29
  float byte_to_float(uchar b)
30
30
  {
31
31
  char flt[4];
32
- if (b == 0)
32
+ if (b == 0) {
33
33
  return 0.0;
34
- int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
35
- int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
36
-
37
- if (!low_mid_bit) setup_endian();
38
- flt[low_bit] = flt[low_mid_bit] = 0;
39
- flt[high_mid_bit] = mantissa << 5;
40
- flt[high_bit] = exponent + 48;
41
- return *((float *)flt);
34
+ } else {
35
+ int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
36
+ int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
37
+
38
+ if (!low_mid_bit) setup_endian();
39
+ flt[low_bit] = flt[low_mid_bit] = 0;
40
+ flt[high_mid_bit] = mantissa << 5;
41
+ flt[high_bit] = exponent + 48;
42
+ return *((float *)flt);
43
+ }
42
44
  }
43
45
 
44
46
  uchar float_to_byte(float f)
45
47
  {
46
- if (f <= 0.0)
48
+ if (f <= 0.0) {
47
49
  return 0;
50
+ } else {
51
+ char *bits = (char *)&f;
52
+ int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
53
+ int exponent = (bits[high_bit] - 48);
54
+
55
+ if (exponent > 0x1f) {
56
+ exponent = 0x1f; // 0x1f = 31 = 0b00011111
57
+ mantissa = 0x07; // 0x07 = 7 = 0b00000111
58
+ }
48
59
 
49
- char *bits = (char *)&f;
50
- int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
51
- int exponent = (bits[high_bit] - 48);
52
-
53
- if (exponent > 0x1f) {
54
- exponent = 0x1f; // 0x1f = 31 = 0b00011111
55
- mantissa = 0x07; // 0x07 = 7 = 0b00000111
56
- }
60
+ if (exponent < 0) {
61
+ exponent = 0;
62
+ mantissa = 1;
63
+ }
57
64
 
58
- if (exponent < 0) {
59
- exponent = 0;
60
- mantissa = 1;
65
+ return ((exponent<<3) | mantissa);
61
66
  }
62
-
63
- return ((exponent<<3) | mantissa);
64
67
  }
65
68
 
66
69
  float simdef_length_norm(Similarity *s, char *field, int num_terms)
67
70
  {
68
- return 1.0 / sqrt(num_terms);
71
+ return (float)(1.0 / sqrt(num_terms));
69
72
  }
70
73
 
71
74
  float simdef_query_norm(struct Similarity *s, float sum_of_squared_weights)
72
75
  {
73
- return 1.0 / sqrt(sum_of_squared_weights);
76
+ return (float)(1.0 / sqrt(sum_of_squared_weights));
74
77
  }
75
78
 
76
79
  float simdef_tf(struct Similarity *s, float freq)
77
80
  {
78
- return sqrt(freq);
81
+ return (float)sqrt(freq);
79
82
  }
80
83
 
81
84
  float simdef_sloppy_freq(struct Similarity *s, int distance)
82
85
  {
83
- return 1.0 / (float)(distance + 1);
86
+ return (float)(1.0 / (double)(distance + 1));
84
87
  }
85
88
 
86
89
  float simdef_idf_term(struct Similarity *s, Term *term, Searcher *searcher)
@@ -100,12 +103,12 @@ float simdef_idf_phrase(struct Similarity *s, Term **terms, int tcnt, Searcher *
100
103
 
101
104
  float simdef_idf(struct Similarity *s, int doc_freq, int num_docs)
102
105
  {
103
- return log((float)num_docs/(float)(doc_freq+1)) + 1.0;
106
+ return (float)(log((float)num_docs/(float)(doc_freq+1)) + 1.0);
104
107
  }
105
108
 
106
109
  float simdef_coord(struct Similarity *s, int overlap, int max_overlap)
107
110
  {
108
- return (float)overlap / (float)max_overlap;
111
+ return (float)((double)overlap / (double)max_overlap);
109
112
  }
110
113
 
111
114
  float simdef_decode_norm(struct Similarity *s, uchar b)
@@ -113,16 +116,33 @@ float simdef_decode_norm(struct Similarity *s, uchar b)
113
116
  return s->norm_table[b];
114
117
  }
115
118
 
116
- float simdef_encode_norm(struct Similarity *s, float f)
119
+ uchar simdef_encode_norm(struct Similarity *s, float f)
117
120
  {
118
121
  return float_to_byte(f);
119
122
  }
120
123
 
121
- void simdef_destroy(void *p)
124
+ void simdef_destroy(Similarity *s)
122
125
  {
123
- // nothing to do here;
126
+ /* nothing to do here */
124
127
  }
125
128
 
129
+ #ifdef WIN32
130
+ static Similarity default_similarity = {
131
+ NULL,
132
+ {0},
133
+ &simdef_length_norm,
134
+ &simdef_query_norm,
135
+ &simdef_tf,
136
+ &simdef_sloppy_freq,
137
+ &simdef_idf_term,
138
+ &simdef_idf_phrase,
139
+ &simdef_idf,
140
+ &simdef_coord,
141
+ &simdef_decode_norm,
142
+ &simdef_encode_norm,
143
+ &simdef_destroy
144
+ };
145
+ #else
126
146
  static Similarity default_similarity = {
127
147
  data:NULL,
128
148
  length_norm:&simdef_length_norm,
@@ -137,6 +157,8 @@ static Similarity default_similarity = {
137
157
  encode_norm:&simdef_encode_norm,
138
158
  destroy:&simdef_destroy
139
159
  };
160
+ #endif
161
+
140
162
  Similarity *sim_create_default()
141
163
  {
142
164
  int i;
@@ -147,17 +169,4 @@ Similarity *sim_create_default()
147
169
  default_similarity.data = &default_similarity;
148
170
  }
149
171
  return &default_similarity;
150
-
151
- // s->length_norm = &simdef_length_norm;
152
- // s->query_norm = &simdef_query_norm;
153
- // s->tf = &simdef_tf;
154
- // s->sloppy_freq = &simdef_sloppy_freq;
155
- // s->idf_term = &simdef_idf_term;
156
- // s->idf_phrase = &simdef_idf_phrase;
157
- // s->idf = &simdef_idf;
158
- // s->coord = &simdef_coord;
159
- // s->decode_norm = &simdef_decode_norm;
160
- // s->encode_norm = &simdef_encode_norm;
161
- // s->destroy = &simdef_destroy;
162
- // return s;
163
172
  }
data/ext/similarity.h CHANGED
@@ -16,7 +16,7 @@ typedef struct Term {
16
16
 
17
17
  Term *term_clone(Term *term);
18
18
  Term *term_create(const char *field, char *text);
19
- void term_destroy(void *p);
19
+ void term_destroy(Term *self);
20
20
  int term_cmp(void *t1, void *t2);
21
21
  int term_eq(const void *t1, const void *t2);
22
22
  unsigned int term_hash(const void *t);
@@ -43,8 +43,8 @@ struct Similarity {
43
43
  float (*idf)(Similarity *self, int doc_freq, int num_docs);
44
44
  float (*coord)(Similarity *self, int overlap, int max_overlap);
45
45
  float (*decode_norm)(Similarity *self, uchar b);
46
- float (*encode_norm)(Similarity *self, float f);
47
- void (*destroy)(void *p);
46
+ uchar (*encode_norm)(Similarity *self, float f);
47
+ void (*destroy)(Similarity *self);
48
48
  };
49
49
 
50
50
  #define sim_length_norm(msim, field, num_terms) msim->length_norm(msim, field, num_terms)
data/ext/sort.c CHANGED
@@ -92,6 +92,43 @@ void sort_field_destroy(void *p)
92
92
  free(p);
93
93
  }
94
94
 
95
+ /*
96
+ * field:<type>!
97
+ */
98
+ char *sort_field_to_s(SortField *self)
99
+ {
100
+ char *str;
101
+ char *type = NULL;
102
+ switch (self->type) {
103
+ case SORT_TYPE_SCORE:
104
+ type = "<SCORE>";
105
+ break;
106
+ case SORT_TYPE_DOC:
107
+ type = "<DOC>";
108
+ break;
109
+ case SORT_TYPE_INTEGER:
110
+ type = "<integer>";
111
+ break;
112
+ case SORT_TYPE_FLOAT:
113
+ type = "<float>";
114
+ break;
115
+ case SORT_TYPE_STRING:
116
+ type = "<string>";
117
+ break;
118
+ case SORT_TYPE_AUTO:
119
+ type = "<auto>";
120
+ break;
121
+ }
122
+ if (self->field) {
123
+ str = ALLOC_N(char, 10 + strlen(self->field) + strlen(type));
124
+ sprintf(str, "%s:%s%s", self->field, type, (self->reverse ? "!" : ""));
125
+ } else {
126
+ str = ALLOC_N(char, 10 + strlen(type));
127
+ sprintf(str, "%s%s", type, (self->reverse ? "!" : ""));
128
+ }
129
+ return str;
130
+ }
131
+
95
132
  /***************************************************************************
96
133
  * ScoreSortField
97
134
  ***************************************************************************/
@@ -113,25 +150,27 @@ SortField *sort_field_score_create(bool reverse)
113
150
  }
114
151
 
115
152
  SortField SORT_FIELD_SCORE = {
116
- field:NULL,
117
- type:SORT_TYPE_SCORE,
118
- reverse:false,
119
- index:NULL,
120
- compare:&sf_score_compare,
121
- create_index:NULL,
122
- destroy_index:NULL,
123
- handle_term:NULL
153
+ MUTEX_INITIALIZER,
154
+ /* field */NULL,
155
+ /* type */SORT_TYPE_SCORE,
156
+ /* reverse */false,
157
+ /* index */NULL,
158
+ /* compare */&sf_score_compare,
159
+ /* create_index */NULL,
160
+ /* destroy_index */NULL,
161
+ /* handle_term */NULL
124
162
  };
125
163
 
126
164
  SortField SORT_FIELD_SCORE_REV = {
127
- field:NULL,
128
- type:SORT_TYPE_SCORE,
129
- reverse:true,
130
- index:NULL,
131
- compare:&sf_score_compare,
132
- create_index:NULL,
133
- destroy_index:NULL,
134
- handle_term:NULL
165
+ MUTEX_INITIALIZER,
166
+ /* field */NULL,
167
+ /* type */SORT_TYPE_SCORE,
168
+ /* reverse */true,
169
+ /* index */NULL,
170
+ /* compare */&sf_score_compare,
171
+ /* create_index */NULL,
172
+ /* destroy_index */NULL,
173
+ /* handle_term */NULL
135
174
  };
136
175
 
137
176
  /**************************************************************************
@@ -155,25 +194,27 @@ SortField *sort_field_doc_create(bool reverse)
155
194
  }
156
195
 
157
196
  SortField SORT_FIELD_DOC = {
158
- field:NULL,
159
- type:SORT_TYPE_DOC,
160
- reverse:false,
161
- index:NULL,
162
- compare:&sf_doc_compare,
163
- create_index:NULL,
164
- destroy_index:NULL,
165
- handle_term:NULL
197
+ MUTEX_INITIALIZER,
198
+ /* field */NULL,
199
+ /* type */SORT_TYPE_DOC,
200
+ /* reverse */false,
201
+ /* index */NULL,
202
+ /* compare */&sf_doc_compare,
203
+ /* create_index */NULL,
204
+ /* destroy_index */NULL,
205
+ /* handle_term */NULL
166
206
  };
167
207
 
168
208
  SortField SORT_FIELD_DOC_REV = {
169
- field:NULL,
170
- type:SORT_TYPE_DOC,
171
- reverse:true,
172
- index:NULL,
173
- compare:&sf_doc_compare,
174
- create_index:NULL,
175
- destroy_index:NULL,
176
- handle_term:NULL
209
+ MUTEX_INITIALIZER,
210
+ /* field */NULL,
211
+ /* type */SORT_TYPE_DOC,
212
+ /* reverse */true,
213
+ /* index */NULL,
214
+ /* compare */&sf_doc_compare,
215
+ /* create_index */NULL,
216
+ /* destroy_index */NULL,
217
+ /* handle_term */NULL
177
218
  };
178
219
 
179
220
  /***************************************************************************
@@ -278,11 +319,15 @@ typedef struct StringIndex {
278
319
  int sf_string_compare(void *index_ptr, Hit *hit1, Hit *hit2)
279
320
  {
280
321
  StringIndex *index = (StringIndex *)index_ptr;
322
+ return strcoll(index->values[index->index[hit1->doc]],
323
+ index->values[index->index[hit2->doc]]);
324
+ /*
281
325
  int val1 = index->index[hit1->doc];
282
326
  int val2 = index->index[hit2->doc];
283
327
  if (val1 > val2) return 1;
284
328
  else if (val1 < val2) return -1;
285
329
  else return 0;
330
+ */
286
331
  }
287
332
 
288
333
  void *sf_string_create_index(int size)
@@ -346,7 +391,7 @@ void sort_field_auto_evaluate(SortField *sf, char *text)
346
391
  {
347
392
  int int_val;
348
393
  float float_val;
349
- int text_len = 0, scan_len = 0;
394
+ size_t text_len = 0, scan_len = 0;
350
395
 
351
396
  text_len = strlen(text);
352
397
  sscanf(text, "%d%n", &int_val, &scan_len);
@@ -362,7 +407,6 @@ void sort_field_auto_evaluate(SortField *sf, char *text)
362
407
  }
363
408
  }
364
409
 
365
-
366
410
  SortField *sort_field_auto_create(char *field, bool reverse)
367
411
  {
368
412
  return sort_field_alloc(field, SORT_TYPE_AUTO, reverse);
@@ -502,8 +546,8 @@ Sorter *sorter_create(int size)
502
546
 
503
547
  bool fshq_less_than(void *hit1, void *hit2)
504
548
  {
505
- printf("Whoops, shouldn't call this.\n");
506
549
  int cmp = 0;
550
+ printf("Whoops, shouldn't call this.\n");
507
551
  if (cmp != 0) {
508
552
  return cmp;
509
553
  } else {
@@ -570,11 +614,11 @@ Hit *fshq_pq_pop(PriorityQueue *pq)
570
614
 
571
615
  inline void fshq_pq_up(PriorityQueue *pq)
572
616
  {
573
- int i,j;
574
- i = pq->count;
575
- j = i >> 1;
576
617
  Hit **heap = (Hit **)pq->heap;
577
- Hit *node = heap[i];
618
+ Hit *node;
619
+ int i = pq->count;
620
+ int j = i >> 1;
621
+ node = heap[i];
578
622
 
579
623
  while ((j > 0) && fshq_lt(heap[0], node, heap[j])) {
580
624
  heap[i] = heap[j];
@@ -584,16 +628,23 @@ inline void fshq_pq_up(PriorityQueue *pq)
584
628
  heap[i] = node;
585
629
  }
586
630
 
587
- void fshq_pq_push(PriorityQueue *pq, void *elem)
631
+ void fshq_pq_insert(PriorityQueue *pq, Hit *hit)
588
632
  {
589
- pq->count++;
590
- pq->heap[pq->count] = elem;
591
- fshq_pq_up(pq);
633
+ if (pq->count < pq->size) {
634
+ Hit *new_hit = ALLOC(Hit);
635
+ memcpy(new_hit, hit, sizeof(Hit));
636
+ pq->count++;
637
+ pq->heap[pq->count] = new_hit;
638
+ fshq_pq_up(pq);
639
+ } else if (pq->count > 0 &&
640
+ fshq_lt((Hit *)pq->heap[0], (Hit *)pq->heap[1], hit)) {
641
+ memcpy(pq->heap[1], hit, sizeof(Hit));
642
+ fshq_pq_down(pq);
643
+ }
592
644
  }
593
645
 
594
- void fshq_pq_destroy(void *p)
646
+ void fshq_pq_destroy(PriorityQueue *self)
595
647
  {
596
- PriorityQueue *self = (PriorityQueue *)p;
597
648
  sorter_destroy(self->heap[0]);
598
649
  pq_destroy(self);
599
650
  }
@@ -634,21 +685,18 @@ Sort *sort_create()
634
685
  void sort_clear(Sort *self)
635
686
  {
636
687
  int i;
637
- for (i = 0; i < self->sf_cnt; i++) {
638
- sort_field_destroy(self->sort_fields[i]);
688
+ if (self->destroy_all) {
689
+ for (i = 0; i < self->sf_cnt; i++) {
690
+ sort_field_destroy(self->sort_fields[i]);
691
+ }
639
692
  }
640
693
  self->sf_cnt = 0;
641
694
  }
642
695
 
643
696
  void sort_destroy(void *p)
644
697
  {
645
- int i;
646
698
  Sort *self = (Sort *)p;
647
- if (self->destroy_all) {
648
- for (i = 0; i < self->sf_cnt; i++) {
649
- sort_field_destroy(self->sort_fields[i]);
650
- }
651
- }
699
+ sort_clear(self);
652
700
  free(self->sort_fields);
653
701
  free(self);
654
702
  }
@@ -664,3 +712,34 @@ void sort_add_sort_field(Sort *self, SortField *sf)
664
712
  self->sf_cnt++;
665
713
  }
666
714
 
715
+ char *sort_to_s(Sort *self)
716
+ {
717
+ int i, len = 20;
718
+ char *s;
719
+ char *str;
720
+ char **sf_strs = ALLOC_N(char *, self->sf_cnt);
721
+
722
+ for (i = 0; i < self->sf_cnt; i++) {
723
+ sf_strs[i] = s = sort_field_to_s(self->sort_fields[i]);
724
+ len += (int)strlen(s) + 2;
725
+ }
726
+
727
+ str = ALLOC_N(char, len);
728
+ s = "Sort[";
729
+ len = (int)strlen(s);
730
+ memcpy(str, s, len);
731
+
732
+ s = str + len;
733
+ for (i = 0; i < self->sf_cnt; i++) {
734
+ sprintf(s, "%s, ", sf_strs[i]);
735
+ s += (int)strlen(s);
736
+ free(sf_strs[i]);
737
+ }
738
+ free(sf_strs);
739
+
740
+ if (self->sf_cnt > 0) {
741
+ s -= 2;
742
+ }
743
+ sprintf(s, "]");
744
+ return str;
745
+ }