ferret 0.9.1 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/similarity.c CHANGED
@@ -29,58 +29,61 @@ setup_endian()
29
29
  float byte_to_float(uchar b)
30
30
  {
31
31
  char flt[4];
32
- if (b == 0)
32
+ if (b == 0) {
33
33
  return 0.0;
34
- int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
35
- int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
36
-
37
- if (!low_mid_bit) setup_endian();
38
- flt[low_bit] = flt[low_mid_bit] = 0;
39
- flt[high_mid_bit] = mantissa << 5;
40
- flt[high_bit] = exponent + 48;
41
- return *((float *)flt);
34
+ } else {
35
+ int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
36
+ int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
37
+
38
+ if (!low_mid_bit) setup_endian();
39
+ flt[low_bit] = flt[low_mid_bit] = 0;
40
+ flt[high_mid_bit] = mantissa << 5;
41
+ flt[high_bit] = exponent + 48;
42
+ return *((float *)flt);
43
+ }
42
44
  }
43
45
 
44
46
  uchar float_to_byte(float f)
45
47
  {
46
- if (f <= 0.0)
48
+ if (f <= 0.0) {
47
49
  return 0;
50
+ } else {
51
+ char *bits = (char *)&f;
52
+ int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
53
+ int exponent = (bits[high_bit] - 48);
54
+
55
+ if (exponent > 0x1f) {
56
+ exponent = 0x1f; // 0x1f = 31 = 0b00011111
57
+ mantissa = 0x07; // 0x07 = 7 = 0b00000111
58
+ }
48
59
 
49
- char *bits = (char *)&f;
50
- int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
51
- int exponent = (bits[high_bit] - 48);
52
-
53
- if (exponent > 0x1f) {
54
- exponent = 0x1f; // 0x1f = 31 = 0b00011111
55
- mantissa = 0x07; // 0x07 = 7 = 0b00000111
56
- }
60
+ if (exponent < 0) {
61
+ exponent = 0;
62
+ mantissa = 1;
63
+ }
57
64
 
58
- if (exponent < 0) {
59
- exponent = 0;
60
- mantissa = 1;
65
+ return ((exponent<<3) | mantissa);
61
66
  }
62
-
63
- return ((exponent<<3) | mantissa);
64
67
  }
65
68
 
66
69
  float simdef_length_norm(Similarity *s, char *field, int num_terms)
67
70
  {
68
- return 1.0 / sqrt(num_terms);
71
+ return (float)(1.0 / sqrt(num_terms));
69
72
  }
70
73
 
71
74
  float simdef_query_norm(struct Similarity *s, float sum_of_squared_weights)
72
75
  {
73
- return 1.0 / sqrt(sum_of_squared_weights);
76
+ return (float)(1.0 / sqrt(sum_of_squared_weights));
74
77
  }
75
78
 
76
79
  float simdef_tf(struct Similarity *s, float freq)
77
80
  {
78
- return sqrt(freq);
81
+ return (float)sqrt(freq);
79
82
  }
80
83
 
81
84
  float simdef_sloppy_freq(struct Similarity *s, int distance)
82
85
  {
83
- return 1.0 / (float)(distance + 1);
86
+ return (float)(1.0 / (double)(distance + 1));
84
87
  }
85
88
 
86
89
  float simdef_idf_term(struct Similarity *s, Term *term, Searcher *searcher)
@@ -100,12 +103,12 @@ float simdef_idf_phrase(struct Similarity *s, Term **terms, int tcnt, Searcher *
100
103
 
101
104
  float simdef_idf(struct Similarity *s, int doc_freq, int num_docs)
102
105
  {
103
- return log((float)num_docs/(float)(doc_freq+1)) + 1.0;
106
+ return (float)(log((float)num_docs/(float)(doc_freq+1)) + 1.0);
104
107
  }
105
108
 
106
109
  float simdef_coord(struct Similarity *s, int overlap, int max_overlap)
107
110
  {
108
- return (float)overlap / (float)max_overlap;
111
+ return (float)((double)overlap / (double)max_overlap);
109
112
  }
110
113
 
111
114
  float simdef_decode_norm(struct Similarity *s, uchar b)
@@ -113,16 +116,33 @@ float simdef_decode_norm(struct Similarity *s, uchar b)
113
116
  return s->norm_table[b];
114
117
  }
115
118
 
116
- float simdef_encode_norm(struct Similarity *s, float f)
119
+ uchar simdef_encode_norm(struct Similarity *s, float f)
117
120
  {
118
121
  return float_to_byte(f);
119
122
  }
120
123
 
121
- void simdef_destroy(void *p)
124
+ void simdef_destroy(Similarity *s)
122
125
  {
123
- // nothing to do here;
126
+ /* nothing to do here */
124
127
  }
125
128
 
129
+ #ifdef WIN32
130
+ static Similarity default_similarity = {
131
+ NULL,
132
+ {0},
133
+ &simdef_length_norm,
134
+ &simdef_query_norm,
135
+ &simdef_tf,
136
+ &simdef_sloppy_freq,
137
+ &simdef_idf_term,
138
+ &simdef_idf_phrase,
139
+ &simdef_idf,
140
+ &simdef_coord,
141
+ &simdef_decode_norm,
142
+ &simdef_encode_norm,
143
+ &simdef_destroy
144
+ };
145
+ #else
126
146
  static Similarity default_similarity = {
127
147
  data:NULL,
128
148
  length_norm:&simdef_length_norm,
@@ -137,6 +157,8 @@ static Similarity default_similarity = {
137
157
  encode_norm:&simdef_encode_norm,
138
158
  destroy:&simdef_destroy
139
159
  };
160
+ #endif
161
+
140
162
  Similarity *sim_create_default()
141
163
  {
142
164
  int i;
@@ -147,17 +169,4 @@ Similarity *sim_create_default()
147
169
  default_similarity.data = &default_similarity;
148
170
  }
149
171
  return &default_similarity;
150
-
151
- // s->length_norm = &simdef_length_norm;
152
- // s->query_norm = &simdef_query_norm;
153
- // s->tf = &simdef_tf;
154
- // s->sloppy_freq = &simdef_sloppy_freq;
155
- // s->idf_term = &simdef_idf_term;
156
- // s->idf_phrase = &simdef_idf_phrase;
157
- // s->idf = &simdef_idf;
158
- // s->coord = &simdef_coord;
159
- // s->decode_norm = &simdef_decode_norm;
160
- // s->encode_norm = &simdef_encode_norm;
161
- // s->destroy = &simdef_destroy;
162
- // return s;
163
172
  }
data/ext/similarity.h CHANGED
@@ -16,7 +16,7 @@ typedef struct Term {
16
16
 
17
17
  Term *term_clone(Term *term);
18
18
  Term *term_create(const char *field, char *text);
19
- void term_destroy(void *p);
19
+ void term_destroy(Term *self);
20
20
  int term_cmp(void *t1, void *t2);
21
21
  int term_eq(const void *t1, const void *t2);
22
22
  unsigned int term_hash(const void *t);
@@ -43,8 +43,8 @@ struct Similarity {
43
43
  float (*idf)(Similarity *self, int doc_freq, int num_docs);
44
44
  float (*coord)(Similarity *self, int overlap, int max_overlap);
45
45
  float (*decode_norm)(Similarity *self, uchar b);
46
- float (*encode_norm)(Similarity *self, float f);
47
- void (*destroy)(void *p);
46
+ uchar (*encode_norm)(Similarity *self, float f);
47
+ void (*destroy)(Similarity *self);
48
48
  };
49
49
 
50
50
  #define sim_length_norm(msim, field, num_terms) msim->length_norm(msim, field, num_terms)
data/ext/sort.c CHANGED
@@ -92,6 +92,43 @@ void sort_field_destroy(void *p)
92
92
  free(p);
93
93
  }
94
94
 
95
+ /*
96
+ * field:<type>!
97
+ */
98
+ char *sort_field_to_s(SortField *self)
99
+ {
100
+ char *str;
101
+ char *type = NULL;
102
+ switch (self->type) {
103
+ case SORT_TYPE_SCORE:
104
+ type = "<SCORE>";
105
+ break;
106
+ case SORT_TYPE_DOC:
107
+ type = "<DOC>";
108
+ break;
109
+ case SORT_TYPE_INTEGER:
110
+ type = "<integer>";
111
+ break;
112
+ case SORT_TYPE_FLOAT:
113
+ type = "<float>";
114
+ break;
115
+ case SORT_TYPE_STRING:
116
+ type = "<string>";
117
+ break;
118
+ case SORT_TYPE_AUTO:
119
+ type = "<auto>";
120
+ break;
121
+ }
122
+ if (self->field) {
123
+ str = ALLOC_N(char, 10 + strlen(self->field) + strlen(type));
124
+ sprintf(str, "%s:%s%s", self->field, type, (self->reverse ? "!" : ""));
125
+ } else {
126
+ str = ALLOC_N(char, 10 + strlen(type));
127
+ sprintf(str, "%s%s", type, (self->reverse ? "!" : ""));
128
+ }
129
+ return str;
130
+ }
131
+
95
132
  /***************************************************************************
96
133
  * ScoreSortField
97
134
  ***************************************************************************/
@@ -113,25 +150,27 @@ SortField *sort_field_score_create(bool reverse)
113
150
  }
114
151
 
115
152
  SortField SORT_FIELD_SCORE = {
116
- field:NULL,
117
- type:SORT_TYPE_SCORE,
118
- reverse:false,
119
- index:NULL,
120
- compare:&sf_score_compare,
121
- create_index:NULL,
122
- destroy_index:NULL,
123
- handle_term:NULL
153
+ MUTEX_INITIALIZER,
154
+ /* field */NULL,
155
+ /* type */SORT_TYPE_SCORE,
156
+ /* reverse */false,
157
+ /* index */NULL,
158
+ /* compare */&sf_score_compare,
159
+ /* create_index */NULL,
160
+ /* destroy_index */NULL,
161
+ /* handle_term */NULL
124
162
  };
125
163
 
126
164
  SortField SORT_FIELD_SCORE_REV = {
127
- field:NULL,
128
- type:SORT_TYPE_SCORE,
129
- reverse:true,
130
- index:NULL,
131
- compare:&sf_score_compare,
132
- create_index:NULL,
133
- destroy_index:NULL,
134
- handle_term:NULL
165
+ MUTEX_INITIALIZER,
166
+ /* field */NULL,
167
+ /* type */SORT_TYPE_SCORE,
168
+ /* reverse */true,
169
+ /* index */NULL,
170
+ /* compare */&sf_score_compare,
171
+ /* create_index */NULL,
172
+ /* destroy_index */NULL,
173
+ /* handle_term */NULL
135
174
  };
136
175
 
137
176
  /**************************************************************************
@@ -155,25 +194,27 @@ SortField *sort_field_doc_create(bool reverse)
155
194
  }
156
195
 
157
196
  SortField SORT_FIELD_DOC = {
158
- field:NULL,
159
- type:SORT_TYPE_DOC,
160
- reverse:false,
161
- index:NULL,
162
- compare:&sf_doc_compare,
163
- create_index:NULL,
164
- destroy_index:NULL,
165
- handle_term:NULL
197
+ MUTEX_INITIALIZER,
198
+ /* field */NULL,
199
+ /* type */SORT_TYPE_DOC,
200
+ /* reverse */false,
201
+ /* index */NULL,
202
+ /* compare */&sf_doc_compare,
203
+ /* create_index */NULL,
204
+ /* destroy_index */NULL,
205
+ /* handle_term */NULL
166
206
  };
167
207
 
168
208
  SortField SORT_FIELD_DOC_REV = {
169
- field:NULL,
170
- type:SORT_TYPE_DOC,
171
- reverse:true,
172
- index:NULL,
173
- compare:&sf_doc_compare,
174
- create_index:NULL,
175
- destroy_index:NULL,
176
- handle_term:NULL
209
+ MUTEX_INITIALIZER,
210
+ /* field */NULL,
211
+ /* type */SORT_TYPE_DOC,
212
+ /* reverse */true,
213
+ /* index */NULL,
214
+ /* compare */&sf_doc_compare,
215
+ /* create_index */NULL,
216
+ /* destroy_index */NULL,
217
+ /* handle_term */NULL
177
218
  };
178
219
 
179
220
  /***************************************************************************
@@ -278,11 +319,15 @@ typedef struct StringIndex {
278
319
  int sf_string_compare(void *index_ptr, Hit *hit1, Hit *hit2)
279
320
  {
280
321
  StringIndex *index = (StringIndex *)index_ptr;
322
+ return strcoll(index->values[index->index[hit1->doc]],
323
+ index->values[index->index[hit2->doc]]);
324
+ /*
281
325
  int val1 = index->index[hit1->doc];
282
326
  int val2 = index->index[hit2->doc];
283
327
  if (val1 > val2) return 1;
284
328
  else if (val1 < val2) return -1;
285
329
  else return 0;
330
+ */
286
331
  }
287
332
 
288
333
  void *sf_string_create_index(int size)
@@ -346,7 +391,7 @@ void sort_field_auto_evaluate(SortField *sf, char *text)
346
391
  {
347
392
  int int_val;
348
393
  float float_val;
349
- int text_len = 0, scan_len = 0;
394
+ size_t text_len = 0, scan_len = 0;
350
395
 
351
396
  text_len = strlen(text);
352
397
  sscanf(text, "%d%n", &int_val, &scan_len);
@@ -362,7 +407,6 @@ void sort_field_auto_evaluate(SortField *sf, char *text)
362
407
  }
363
408
  }
364
409
 
365
-
366
410
  SortField *sort_field_auto_create(char *field, bool reverse)
367
411
  {
368
412
  return sort_field_alloc(field, SORT_TYPE_AUTO, reverse);
@@ -502,8 +546,8 @@ Sorter *sorter_create(int size)
502
546
 
503
547
  bool fshq_less_than(void *hit1, void *hit2)
504
548
  {
505
- printf("Whoops, shouldn't call this.\n");
506
549
  int cmp = 0;
550
+ printf("Whoops, shouldn't call this.\n");
507
551
  if (cmp != 0) {
508
552
  return cmp;
509
553
  } else {
@@ -570,11 +614,11 @@ Hit *fshq_pq_pop(PriorityQueue *pq)
570
614
 
571
615
  inline void fshq_pq_up(PriorityQueue *pq)
572
616
  {
573
- int i,j;
574
- i = pq->count;
575
- j = i >> 1;
576
617
  Hit **heap = (Hit **)pq->heap;
577
- Hit *node = heap[i];
618
+ Hit *node;
619
+ int i = pq->count;
620
+ int j = i >> 1;
621
+ node = heap[i];
578
622
 
579
623
  while ((j > 0) && fshq_lt(heap[0], node, heap[j])) {
580
624
  heap[i] = heap[j];
@@ -584,16 +628,23 @@ inline void fshq_pq_up(PriorityQueue *pq)
584
628
  heap[i] = node;
585
629
  }
586
630
 
587
- void fshq_pq_push(PriorityQueue *pq, void *elem)
631
+ void fshq_pq_insert(PriorityQueue *pq, Hit *hit)
588
632
  {
589
- pq->count++;
590
- pq->heap[pq->count] = elem;
591
- fshq_pq_up(pq);
633
+ if (pq->count < pq->size) {
634
+ Hit *new_hit = ALLOC(Hit);
635
+ memcpy(new_hit, hit, sizeof(Hit));
636
+ pq->count++;
637
+ pq->heap[pq->count] = new_hit;
638
+ fshq_pq_up(pq);
639
+ } else if (pq->count > 0 &&
640
+ fshq_lt((Hit *)pq->heap[0], (Hit *)pq->heap[1], hit)) {
641
+ memcpy(pq->heap[1], hit, sizeof(Hit));
642
+ fshq_pq_down(pq);
643
+ }
592
644
  }
593
645
 
594
- void fshq_pq_destroy(void *p)
646
+ void fshq_pq_destroy(PriorityQueue *self)
595
647
  {
596
- PriorityQueue *self = (PriorityQueue *)p;
597
648
  sorter_destroy(self->heap[0]);
598
649
  pq_destroy(self);
599
650
  }
@@ -634,21 +685,18 @@ Sort *sort_create()
634
685
  void sort_clear(Sort *self)
635
686
  {
636
687
  int i;
637
- for (i = 0; i < self->sf_cnt; i++) {
638
- sort_field_destroy(self->sort_fields[i]);
688
+ if (self->destroy_all) {
689
+ for (i = 0; i < self->sf_cnt; i++) {
690
+ sort_field_destroy(self->sort_fields[i]);
691
+ }
639
692
  }
640
693
  self->sf_cnt = 0;
641
694
  }
642
695
 
643
696
  void sort_destroy(void *p)
644
697
  {
645
- int i;
646
698
  Sort *self = (Sort *)p;
647
- if (self->destroy_all) {
648
- for (i = 0; i < self->sf_cnt; i++) {
649
- sort_field_destroy(self->sort_fields[i]);
650
- }
651
- }
699
+ sort_clear(self);
652
700
  free(self->sort_fields);
653
701
  free(self);
654
702
  }
@@ -664,3 +712,34 @@ void sort_add_sort_field(Sort *self, SortField *sf)
664
712
  self->sf_cnt++;
665
713
  }
666
714
 
715
+ char *sort_to_s(Sort *self)
716
+ {
717
+ int i, len = 20;
718
+ char *s;
719
+ char *str;
720
+ char **sf_strs = ALLOC_N(char *, self->sf_cnt);
721
+
722
+ for (i = 0; i < self->sf_cnt; i++) {
723
+ sf_strs[i] = s = sort_field_to_s(self->sort_fields[i]);
724
+ len += (int)strlen(s) + 2;
725
+ }
726
+
727
+ str = ALLOC_N(char, len);
728
+ s = "Sort[";
729
+ len = (int)strlen(s);
730
+ memcpy(str, s, len);
731
+
732
+ s = str + len;
733
+ for (i = 0; i < self->sf_cnt; i++) {
734
+ sprintf(s, "%s, ", sf_strs[i]);
735
+ s += (int)strlen(s);
736
+ free(sf_strs[i]);
737
+ }
738
+ free(sf_strs);
739
+
740
+ if (self->sf_cnt > 0) {
741
+ s -= 2;
742
+ }
743
+ sprintf(s, "]");
744
+ return str;
745
+ }