ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_term.c DELETED
@@ -1,219 +0,0 @@
1
- #include "ferret.h"
2
- #include "index.h"
3
-
4
- VALUE cTerm;
5
- /****************************************************************************
6
- *
7
- * Term Methods
8
- *
9
- ****************************************************************************/
10
-
11
- typedef struct RTerm {
12
- VALUE field;
13
- VALUE text;
14
- } RTerm;
15
-
16
- void
17
- frt_term_mark(void *p)
18
- {
19
- RTerm *term = (RTerm *)p;
20
- rb_gc_mark(term->field);
21
- rb_gc_mark(term->text);
22
- }
23
-
24
- static VALUE
25
- frt_term_alloc(VALUE klass)
26
- {
27
- RTerm *term = ALLOC(RTerm);
28
- term->field = Qnil;
29
- term->text = Qnil;
30
- return Data_Wrap_Struct(klass, &frt_term_mark, &free, term);
31
- }
32
-
33
- #define GET_TERM RTerm *term; Data_Get_Struct(self, RTerm, term)
34
- static VALUE
35
- frt_term_init(VALUE self, VALUE rfield, VALUE rtext)
36
- {
37
- GET_TERM;
38
- term->field = rb_obj_as_string(rfield);
39
- term->text = rb_obj_as_string(rtext);
40
- return self;
41
- }
42
-
43
- VALUE
44
- frt_get_rterm(char *field, char *text)
45
- {
46
- RTerm *rterm = ALLOC(RTerm);
47
- rterm->field = rb_str_new2(field);
48
- rterm->text = rb_str_new2(text);
49
- return Data_Wrap_Struct(cTerm, &frt_term_mark, &free, rterm);
50
- }
51
-
52
- Term *
53
- frt_set_term(VALUE self, Term *t)
54
- {
55
- GET_TERM;
56
- t->field = RSTRING(term->field)->ptr;
57
- t->text = RSTRING(term->text)->ptr;
58
- return t;
59
- }
60
-
61
- Term *
62
- frt_get_term(VALUE self)
63
- {
64
- Term *t = NULL;
65
- if (self != Qnil) {
66
- GET_TERM;
67
- t = ALLOC(Term);
68
- /* store text and field in text so that field will be freed with text */
69
- t->text = ALLOC_N(char, RSTRING(term->text)->len +
70
- RSTRING(term->field)->len + 2);
71
- sprintf(t->text, "%s %s", RSTRING(term->text)->ptr,
72
- RSTRING(term->field)->ptr);
73
- t->text[RSTRING(term->text)->len] = '\0';
74
- t->field = t->text + RSTRING(term->text)->len + 1;
75
- }
76
- return t;
77
- }
78
-
79
- static VALUE
80
- frt_term_get_text(VALUE self)
81
- {
82
- GET_TERM;
83
- return term->text;
84
- }
85
-
86
- static VALUE
87
- frt_term_set_text(VALUE self, VALUE rtext)
88
- {
89
- GET_TERM;
90
- term->text = rb_obj_as_string(rtext);
91
- return Qnil;
92
- }
93
-
94
- static VALUE
95
- frt_term_get_field(VALUE self)
96
- {
97
- GET_TERM;
98
- return term->field;
99
- }
100
-
101
- static VALUE
102
- frt_term_set_field(VALUE self, VALUE rfield)
103
- {
104
- GET_TERM;
105
- term->field = rb_obj_as_string(rfield);
106
- return Qnil;
107
- }
108
-
109
- VALUE
110
- frt_term_to_s(VALUE self)
111
- {
112
- int tlen, flen;
113
- char *res;
114
- GET_TERM;
115
- tlen = RSTRING(term->text)->len;
116
- flen = RSTRING(term->field)->len;
117
- res = alloca(flen + tlen + 1);
118
-
119
- MEMCPY(res, StringValuePtr(term->field), char, flen);
120
- res[flen] = ':';
121
- MEMCPY(res + flen + 1, StringValuePtr(term->text), char, tlen);
122
- return rb_str_new(res, tlen + flen + 1 );
123
- }
124
-
125
- inline int
126
- frt_term_cmp(RTerm *t1, RTerm *t2)
127
- {
128
- int comp = rb_str_cmp(t1->field, t2->field);
129
- if (comp == 0) {
130
- comp = rb_str_cmp(t1->text, t2->text);
131
- }
132
- return comp;
133
- }
134
-
135
- int
136
- frt_term_compare_to_int(VALUE self, VALUE rother)
137
- {
138
- RTerm *other;
139
- GET_TERM;
140
- Data_Get_Struct(rother, RTerm, other);
141
- return frt_term_cmp(term, other);
142
- }
143
-
144
- VALUE
145
- frt_term_lt(VALUE self, VALUE rother)
146
- {
147
- return frt_term_compare_to_int(self, rother) < 0 ? Qtrue : Qfalse;
148
- }
149
-
150
- VALUE
151
- frt_term_gt(VALUE self, VALUE rother)
152
- {
153
- return frt_term_compare_to_int(self, rother) > 0 ? Qtrue : Qfalse;
154
- }
155
-
156
- VALUE
157
- frt_term_le(VALUE self, VALUE rother)
158
- {
159
- return frt_term_compare_to_int(self, rother) <= 0 ? Qtrue : Qfalse;
160
- }
161
-
162
- VALUE
163
- frt_term_ge(VALUE self, VALUE rother)
164
- {
165
- return frt_term_compare_to_int(self, rother) >= 0 ? Qtrue : Qfalse;
166
- }
167
-
168
- VALUE
169
- frt_term_eq(VALUE self, VALUE rother)
170
- {
171
- if (rother == Qnil)
172
- return Qfalse;
173
- return frt_term_compare_to_int(self, rother) == 0 ? Qtrue : Qfalse;
174
- }
175
-
176
-
177
- static VALUE
178
- frt_term_compare_to(VALUE self, VALUE other)
179
- {
180
- return INT2FIX(frt_term_compare_to_int(self, other));
181
- }
182
-
183
- static VALUE
184
- frt_term_hash(VALUE self)
185
- {
186
- GET_TERM;
187
- return INT2FIX(rb_str_hash(term->field) + rb_str_hash(term->text));
188
- }
189
-
190
- /****************************************************************************
191
- *
192
- * Init Function
193
- *
194
- ****************************************************************************/
195
-
196
- void
197
- Init_term(void)
198
- {
199
- /* Term */
200
- cTerm = rb_define_class_under(mIndex, "Term", rb_cObject);
201
- rb_define_alloc_func(cTerm, frt_term_alloc);
202
- rb_include_module(cTerm, rb_mComparable);
203
-
204
- rb_define_method(cTerm, "initialize", frt_term_init, 2);
205
- rb_define_method(cTerm, "set!", frt_term_init, 2);
206
- rb_define_method(cTerm, "to_s", frt_term_to_s, 0);
207
- rb_define_method(cTerm, "<=>", frt_term_compare_to, 1);
208
- rb_define_method(cTerm, "<", frt_term_lt, 1);
209
- rb_define_method(cTerm, ">", frt_term_gt, 1);
210
- rb_define_method(cTerm, "<=", frt_term_le, 1);
211
- rb_define_method(cTerm, ">=", frt_term_ge, 1);
212
- rb_define_method(cTerm, "eql?", frt_term_eq, 1);
213
- rb_define_method(cTerm, "==", frt_term_eq, 1);
214
- rb_define_method(cTerm, "text", frt_term_get_text, 0);
215
- rb_define_method(cTerm, "text=", frt_term_set_text, 1);
216
- rb_define_method(cTerm, "field", frt_term_get_field, 0);
217
- rb_define_method(cTerm, "field=", frt_term_set_field, 1);
218
- rb_define_method(cTerm, "hash", frt_term_hash, 0);
219
- }
data/ext/term.c DELETED
@@ -1,820 +0,0 @@
1
- #include "index.h"
2
- #include "helper.h"
3
- #include "hash.h"
4
- #include <string.h>
5
-
6
- static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
7
- static char * const TERM_ORDER_ERROR_MSG = "term out of order";
8
- static char * const FP_ORDER_ERROR_MSG = "freq pointer out of order";
9
- static char * const PP_ORDER_ERROR_MSG = "prox pointer out of order";
10
-
11
- /****************************************************************************
12
- *
13
- * Term
14
- *
15
- ****************************************************************************/
16
-
17
- Term *term_clone(Term *term)
18
- {
19
- Term *t = ALLOC(Term);
20
-
21
- t->field = term->field;
22
- t->text = estrdup(term->text);
23
- return t;
24
- }
25
-
26
- Term *term_create(const char *field, char *text)
27
- {
28
- Term *t = ALLOC(Term);
29
-
30
- t->field = (char *)field;
31
- t->text = estrdup(text);
32
- return t;
33
- }
34
-
35
- void term_destroy(Term *self)
36
- {
37
- free(self->text);
38
- free(self);
39
- }
40
-
41
- int term_cmp(void *t1, void *t2)
42
- {
43
- int res = strcmp(((Term *)t1)->field, ((Term *)t2)->field);
44
- if (res != 0) {
45
- return res;
46
- } else {
47
- return strcmp(((Term *)t1)->text, ((Term *)t2)->text);
48
- }
49
- }
50
-
51
- int term_eq(const void *t1, const void *t2)
52
- {
53
- return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
54
- (strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
55
- }
56
-
57
- unsigned int term_hash(const void *t)
58
- {
59
- return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
60
- }
61
-
62
- char *term_to_s(Term *term)
63
- {
64
- char *string = ALLOC_N(char, strlen(term->field) + strlen(term->text) + 2);
65
- sprintf(string, "%s:%s", term->field, term->text);
66
- return string;
67
- }
68
-
69
- /****************************************************************************
70
- *
71
- * TermBuffer
72
- *
73
- ****************************************************************************/
74
-
75
- void tb_reset(TermBuffer *tb)
76
- {
77
- tb->field = (char *)EMPTY_STRING;
78
- tb->text[0] = '\0';
79
- }
80
-
81
- TermBuffer *tb_create()
82
- {
83
- TermBuffer *tb = ALLOC(TermBuffer);
84
- tb->field = (char *)EMPTY_STRING;
85
- tb->text[0] = '\0';
86
- return tb;
87
- }
88
-
89
- void tb_destroy(TermBuffer *tb)
90
- {
91
- free(tb);
92
- }
93
-
94
- TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
95
- {
96
- tb->field = t->field;
97
- strcpy(tb->text, t->text);
98
- return tb;
99
- }
100
-
101
- Term *tb_get_term(TermBuffer *tb)
102
- {
103
- return term_create(tb->field, tb->text);
104
- }
105
-
106
- int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
107
- {
108
- int res;
109
- if ((tb1->field != tb2->field) &&
110
- (0 != (res = strcmp(tb1->field, tb2->field)))) {
111
- return res;
112
- } else {
113
- return strcmp(tb1->text, tb2->text);
114
- }
115
- }
116
-
117
- int tb_term_cmp(TermBuffer *tb, Term *t)
118
- {
119
- int res = strcmp(tb->field, t->field);
120
- if (res != 0) {
121
- return res;
122
- } else {
123
- return strcmp(tb->text, t->text);
124
- }
125
- }
126
-
127
- TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
128
- {
129
- tb1->field = tb2->field;
130
- strcpy(tb1->text, tb2->text);
131
- return tb1;
132
- }
133
-
134
- TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
135
- {
136
- signed int fnum;
137
- int start = (int)is_read_vint(is);
138
- int length = (int)is_read_vint(is);
139
- int total_length = start + length;
140
- is_read_bytes(is, (uchar *)tb->text, start, length);
141
- tb->text[total_length] = '\0';
142
- fnum = (signed int)is_read_vint(is);
143
- if (fnum < 0)
144
- tb->field = (char *)EMPTY_STRING;
145
- else
146
- tb->field = fis->by_number[fnum]->name;
147
- return tb;
148
- }
149
-
150
- /****************************************************************************
151
- *
152
- * TermInfo
153
- *
154
- ****************************************************************************/
155
-
156
- TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
157
- {
158
- TermInfo *ti = ALLOC(TermInfo);
159
- ti->doc_freq = doc_freq;
160
- ti->freq_pointer = freq_pointer;
161
- ti->prox_pointer = prox_pointer;
162
- ti->skip_offset = skip_offset;
163
- return ti;
164
- }
165
-
166
- TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
167
- {
168
- ti->doc_freq = doc_freq;
169
- ti->freq_pointer = freq_pointer;
170
- ti->prox_pointer = prox_pointer;
171
- ti->skip_offset = skip_offset;
172
- return ti;
173
- }
174
-
175
- void ti_destroy(TermInfo *ti)
176
- {
177
- free(ti);
178
- }
179
-
180
- TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
181
- {
182
- memcpy(ti, other, sizeof(TermInfo));
183
- return ti;
184
- }
185
-
186
- TermInfo *ti_clone(TermInfo *other)
187
- {
188
- return ti_create(other->doc_freq,
189
- other->freq_pointer, other->prox_pointer, other->skip_offset);
190
- }
191
-
192
- int ti_eq(TermInfo *ti, TermInfo *other)
193
- {
194
- return (memcmp(ti, other, sizeof(TermInfo)) == 0);
195
- }
196
-
197
- /****************************************************************************
198
- *
199
- * TermEnum
200
- *
201
- ****************************************************************************/
202
-
203
- TermEnum *te_create()
204
- {
205
- TermEnum *te = ALLOC(TermEnum);
206
- te->tb_curr = tb_create();
207
- te->tb_prev = tb_create();
208
- te->ti_curr = ti_create(0, 0, 0, 0);
209
- return te;
210
- }
211
-
212
- void te_destroy(TermEnum *te)
213
- {
214
- tb_destroy(te->tb_curr);
215
- tb_destroy(te->tb_prev);
216
- ti_destroy(te->ti_curr);
217
- free(te);
218
- }
219
-
220
- Term *te_get_term(TermEnum *te)
221
- {
222
- return tb_get_term(te->tb_curr);
223
- }
224
-
225
- TermInfo *te_get_ti(TermEnum *te)
226
- {
227
- TermInfo *ti = te->ti_curr;
228
- return ti_create(ti->doc_freq, ti->freq_pointer, ti->prox_pointer, ti->skip_offset);
229
- }
230
-
231
- TermBuffer *te_skip_to(TermEnum *te, Term *t)
232
- {
233
- TermBuffer *tb_curr;
234
- if (tb_term_cmp(te->tb_curr, t) == 0)
235
- return te->tb_curr;
236
-
237
- while (((tb_curr = te->next(te)) != NULL) &&
238
- (tb_term_cmp(tb_curr, t) < 0)) {
239
- }
240
- return tb_curr;
241
- }
242
-
243
- /****************************************************************************
244
- *
245
- * SegmentTermEnum
246
- *
247
- ****************************************************************************/
248
-
249
- #define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data
250
-
251
- TermBuffer *ste_next(TermEnum *te)
252
- {
253
- GET_STE;
254
- TermInfo *ti;
255
- InStream *is = ste->is;
256
- ste->pos++;
257
- if (ste->pos > ste->size - 1) {
258
- tb_reset(te->tb_curr);
259
- return NULL;
260
- }
261
-
262
- tb_cpy(te->tb_prev, te->tb_curr);
263
- tb_read(te->tb_curr, is, ste->fis);
264
-
265
- ti = te->ti_curr;
266
- ti->doc_freq = (int)is_read_vint(is); /* read doc freq */
267
- ti->freq_pointer += (int)is_read_vint(is);/* read freq pointer */
268
- ti->prox_pointer += (int)is_read_vint(is);/* read prox pointer */
269
-
270
- if (ste->format == -1) {
271
- /* just read skip_offset in order to increment file pointer
272
- * value is never used since skip_to is switched off */
273
- if (!ste->is_index) {
274
- if (ti->doc_freq > ste->format_m1skip_interval) {
275
- ti->skip_offset = (int)is_read_vint(is);
276
- }
277
- }
278
- } else {
279
- if (ti->doc_freq >= ste->skip_interval) {
280
- ti->skip_offset = (int)is_read_vint(is);
281
- }
282
- }
283
-
284
- if (ste->is_index) {
285
- ste->index_pointer += (int)is_read_vint(is); /* read index pointer */
286
- }
287
-
288
- return te->tb_curr;
289
- }
290
-
291
- TermEnum *ste_clone(TermEnum *other_te);
292
-
293
- TermEnum *ste_allocate()
294
- {
295
- TermEnum *te = te_create();
296
- SegmentTermEnum *ste;
297
-
298
- te->next = &ste_next;
299
- te->close = &ste_close;
300
- te->clone = &ste_clone;
301
- ste = ALLOC(SegmentTermEnum);
302
- te->data = ste;
303
- return te;
304
- }
305
-
306
- TermEnum *ste_clone(TermEnum *other_te)
307
- {
308
- SegmentTermEnum *other_ste = (SegmentTermEnum *)other_te->data;
309
- TermEnum *te = ste_allocate();
310
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
311
-
312
- memcpy(ste, other_ste, sizeof(SegmentTermEnum));
313
- ste->is = is_clone(other_ste->is);
314
- tb_cpy(te->tb_curr, other_te->tb_curr);
315
- tb_cpy(te->tb_prev, other_te->tb_prev);
316
- ti_cpy(te->ti_curr, other_te->ti_curr);
317
- return te;
318
- }
319
-
320
- void ste_close(TermEnum *te)
321
- {
322
- GET_STE;
323
- is_close(ste->is);
324
- free(ste);
325
- te->data = NULL;
326
- te_destroy(te);
327
- }
328
-
329
- TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
330
- {
331
- TermEnum *te = ste_allocate();
332
- GET_STE;
333
- int first_int;
334
-
335
- ste->fis = fis;
336
- ste->is_index = is_index;
337
- ste->is = is;
338
- ste->pos = -1;
339
- ste->index_pointer = 0;
340
- ste->format_m1skip_interval = -1;
341
-
342
- first_int = (int)is_read_int(is);
343
-
344
- if (first_int >= 0) {
345
- /* original-format file, without explicit format version number */
346
- ste->format = 0;
347
- ste->size = first_int;
348
-
349
- /* back-compatible settings */
350
- ste->index_interval = 128;
351
- ste->skip_interval = INT_MAX; /* switch off skip_to optimization */
352
-
353
- } else {
354
- /* check that it is a format we can understand */
355
- if (first_int < TERM_INFO_FORMAT)
356
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
357
-
358
- /* we have a format version number */
359
- ste->format = first_int;
360
-
361
-
362
- ste->size = (int)is_read_long(is); /* read the size */
363
-
364
- if (ste->format == -1) {
365
- if (!ste->is_index) {
366
- ste->index_interval = is_read_int(is);
367
- ste->format_m1skip_interval = is_read_int(is);
368
- }
369
- /* switch off skip_to optimization for file format prior to
370
- * 1.4rc2 in order to avoid a bug in skip_to implementation
371
- * of these versions */
372
- ste->skip_interval = INT_MAX;
373
- } else {
374
- ste->index_interval = is_read_int(is);
375
- ste->skip_interval = is_read_int(is);
376
- }
377
- }
378
- return te;
379
- }
380
-
381
- void ste_seek(TermEnum *te, int pointer, int pos, Term *t, TermInfo *ti)
382
- {
383
- GET_STE;
384
- is_seek(ste->is, pointer);
385
- ste->pos = pos;
386
- tb_set_term(te->tb_curr, t);
387
- tb_reset(te->tb_prev);
388
- ti_cpy(te->ti_curr, ti);
389
- }
390
-
391
- TermInfo *ste_scan_for_term_info(TermEnum *te, Term *t)
392
- {
393
- te_skip_to(te, t);
394
-
395
- if (tb_term_cmp(te->tb_curr, t) == 0) {
396
- return te_get_ti(te);
397
- } else {
398
- return NULL;
399
- }
400
- }
401
-
402
- Term *ste_scan_for_term(TermEnum *te, int pos)
403
- {
404
- GET_STE;
405
- while (ste->pos < pos) {
406
- if (ste_next(te) == NULL)
407
- return NULL;
408
- }
409
-
410
- return te_get_term(te);
411
- }
412
-
413
- /****************************************************************************
414
- *
415
- * MultiTermEnum
416
- *
417
- ****************************************************************************/
418
-
419
- #define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data
420
-
421
- TermBuffer *mte_next(TermEnum *te)
422
- {
423
- GET_MTE;
424
- SegmentMergeInfo *top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
425
-
426
- if (top == NULL) {
427
- tb_reset(te->tb_curr);
428
- return false;
429
- }
430
-
431
- tb_cpy(te->tb_prev, te->tb_curr);
432
- tb_cpy(te->tb_curr, top->tb);
433
-
434
- te->ti_curr->doc_freq = 0;
435
-
436
- while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
437
- pq_pop(mte->smi_queue);
438
- te->ti_curr->doc_freq += top->te->ti_curr->doc_freq;/* increment freq */
439
- if (smi_next(top)) {
440
- pq_push(mte->smi_queue, top); /* restore queue */
441
- } else {
442
- smi_destroy(top); /* done with a segment */
443
- }
444
- top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
445
- }
446
- return te->tb_curr;
447
- }
448
-
449
- void mte_close(TermEnum *te)
450
- {
451
- GET_MTE;
452
- pq_clear(mte->smi_queue);
453
- pq_destroy(mte->smi_queue);
454
- free(mte);
455
- te_destroy(te);
456
- }
457
-
458
- TermEnum *mte_clone(TermEnum *te)
459
- {
460
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
461
- return NULL;
462
- }
463
-
464
- TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
465
- {
466
- int i;
467
- IndexReader *reader;
468
- TermEnum *sub_te;
469
- MultiTermEnum *mte = ALLOC(MultiTermEnum);
470
- TermEnum *te = te_create();
471
- te->next = &mte_next;
472
- te->clone = &mte_clone;
473
- te->close = &mte_close;
474
-
475
- te->data = mte;
476
-
477
- mte->smi_queue = pq_create(rcnt, (lt_ft)&smi_lt);
478
- mte->smi_queue->free_elem = (free_ft)&smi_destroy;
479
-
480
- for (i = 0; i < rcnt; i++) {
481
- SegmentMergeInfo *smi;
482
- reader = readers[i];
483
-
484
- if (t != NULL) {
485
- sub_te = reader->terms_from(reader, t);
486
- } else {
487
- sub_te = reader->terms(reader);
488
- }
489
-
490
- smi = smi_create(starts[i], sub_te, reader);
491
- if (((t == NULL) && smi_next(smi)) ||
492
- (sub_te->tb_curr->field != (char *)EMPTY_STRING)) {
493
- pq_push(mte->smi_queue, smi); /* initialize queue */
494
- } else {
495
- smi_destroy(smi);
496
- }
497
- }
498
-
499
- if ((t != NULL) && (mte->smi_queue->count > 0)) {
500
- mte_next(te);
501
- }
502
-
503
- return te;
504
- }
505
-
506
- /****************************************************************************
507
- *
508
- * TermInfosWriter
509
- *
510
- ****************************************************************************/
511
-
512
- const Term EmptyTerm = {"", ""};
513
-
514
- TermInfosWriter *tiw_open_internal(Store *store,
515
- char *segment,
516
- FieldInfos *fis,
517
- int interval,
518
- int is_index)
519
- {
520
- char fname[SEGMENT_NAME_MAX_LENGTH];
521
- TermInfosWriter *tiw = ALLOC(TermInfosWriter);
522
- OutStream *os;
523
-
524
- tiw->index_interval = interval;
525
- tiw->skip_interval = 16;
526
- tiw->last_index_pointer = 0;
527
- tiw->last_term = (Term *)&EmptyTerm;
528
- tiw->last_term_info = ti_create(0,0,0,0);
529
- tiw->size = 0;
530
- tiw->is_index = is_index;
531
- tiw->fis = fis;
532
- tiw->curr_field = NULL;
533
- tiw->curr_field_num = -1;
534
-
535
- strcpy(fname, segment);
536
- strcat(fname, (is_index ? ".tii" : ".tis"));
537
- os = tiw->os = store->create_output(store, fname);
538
- os_write_int(os, TERM_INFO_FORMAT); /* write format */
539
- os_write_long(os, 0); /* leave space for size */
540
- os_write_int(os, tiw->index_interval); /* write index_interval */
541
- os_write_int(os, tiw->skip_interval); /* write skip_interval */
542
- if (!is_index) {
543
- tiw->other = tiw_open_internal(store, segment, fis, interval, true);
544
- tiw->other->other = tiw;
545
- }
546
- return tiw;
547
- }
548
-
549
- TermInfosWriter *tiw_open(Store *store, char *segment, FieldInfos *fis, int interval)
550
- {
551
- return tiw_open_internal(store, segment, fis, interval, false);
552
- }
553
-
554
- void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
555
- {
556
- //printf("%s, %s\n", tiw->last_term->text, t->text);
557
- int start = hlp_string_diff(tiw->last_term->text, t->text);
558
- int length = (int)strlen(t->text) - start;
559
-
560
- os_write_vint(os, start); /* write shared prefix length */
561
- os_write_vint(os, length); /* write delta length */
562
- os_write_chars(os, t->text, start, length); /* write delta chars */
563
- if (tiw->curr_field != t->field) {
564
- tiw->curr_field = t->field;
565
- tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
566
- }
567
- os_write_vint(os, tiw->curr_field_num);
568
- tiw->last_term = t;
569
- }
570
-
571
- void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
572
- {
573
- #ifdef DEBUG
574
- if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
575
- RAISE(STATE_ERROR, TERM_ORDER_ERROR_MSG);
576
- }
577
- if (ti->freq_pointer < tiw->last_term_info->freq_pointer) {
578
- RAISE(STATE_ERROR, FP_ORDER_ERROR_MSG);
579
- }
580
- if (ti->prox_pointer < tiw->last_term_info->prox_pointer) {
581
- RAISE(STATE_ERROR, PP_ORDER_ERROR_MSG);
582
- }
583
- #endif
584
-
585
- if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0) {
586
- /* add an index term */
587
- tiw_add(tiw->other, tiw->last_term, tiw->last_term_info);
588
- }
589
-
590
- tiw_write_term(tiw, tiw->os, t); /* write term */
591
- os_write_vint(tiw->os, ti->doc_freq); /* write doc freq */
592
- os_write_vint(tiw->os, ti->freq_pointer - tiw->last_term_info->freq_pointer);
593
- os_write_vint(tiw->os, ti->prox_pointer - tiw->last_term_info->prox_pointer);
594
-
595
- if (ti->doc_freq >= tiw->skip_interval) {
596
- os_write_vint(tiw->os, ti->skip_offset);
597
- }
598
-
599
- if (tiw->is_index) {
600
- OutStream *other_os = tiw->other->os;
601
- int other_pos = os_pos(other_os);
602
- os_write_vint(tiw->os, other_pos - tiw->last_index_pointer);
603
- tiw->last_index_pointer = other_pos; /* write pointer */
604
- }
605
-
606
- ti_cpy(tiw->last_term_info, ti);
607
- tiw->size++;
608
- }
609
-
610
- void tiw_close(TermInfosWriter *tiw)
611
- {
612
- OutStream *os = tiw->os;
613
- os_seek(os, 4); /* write @size after format */
614
- os_write_long(os, tiw->size);
615
- os_close(os);
616
-
617
- if (!tiw->is_index)
618
- tiw_close(tiw->other);
619
-
620
- ti_destroy(tiw->last_term_info);
621
- free(tiw);
622
- }
623
-
624
- /****************************************************************************
625
- *
626
- * TermInfosReader
627
- *
628
- ****************************************************************************/
629
-
630
- void tir_close(TermInfosReader *tir)
631
- {
632
- int i;
633
- if (tir->index_terms != NULL) {
634
- for (i = 0; i < tir->index_size; i++) {
635
- term_destroy(tir->index_terms[i]);
636
- ti_destroy(tir->index_term_infos[i]);
637
- }
638
- free(tir->index_terms);
639
- free(tir->index_term_infos);
640
- free(tir->index_pointers);
641
- }
642
- if (tir->orig_te) tir->orig_te->close(tir->orig_te);
643
- thread_key_delete(tir->thread_te);
644
- ary_destroy(tir->te_bucket);
645
- if (tir->index_te) tir->index_te->close(tir->index_te);
646
- mutex_destroy(&tir->mutex);
647
- free(tir);
648
- }
649
-
650
- TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
651
- {
652
- SegmentTermEnum *ste;
653
- TermInfosReader *tir = ALLOC(TermInfosReader);
654
- char fname[SEGMENT_NAME_MAX_LENGTH];
655
- InStream *is;
656
-
657
- mutex_init(&tir->mutex, NULL);
658
- strcpy(fname, segment);
659
- strcpy(fname + strlen(segment), ".tis");
660
- is = store->open_input(store, fname);
661
- tir->orig_te = ste_create(is, fis, false);
662
- thread_key_create(&tir->thread_te, NULL);
663
- tir->te_bucket = ary_create(1, (free_ft)tir->orig_te->close);
664
-
665
- ste = tir->orig_te->data;
666
- tir->size = ste->size;
667
- tir->skip_interval = ste->skip_interval;
668
-
669
- strcpy(fname + strlen(segment), ".tii");
670
- is = store->open_input(store, fname);
671
- tir->index_te = ste_create(is, fis, true);
672
- tir->index_terms = NULL;
673
- tir->index_term_infos = NULL;
674
- tir->index_pointers = NULL;
675
- return tir;
676
- }
677
-
678
- void tir_ensure_index_is_read(TermInfosReader *tir)
679
- {
680
- mutex_lock(&tir->mutex);
681
- if (tir->index_terms == NULL) {
682
- TermEnum *index_te;
683
- SegmentTermEnum *ste;
684
- int i = 0;
685
- int index_size = ((SegmentTermEnum *)tir->index_te->data)->size;
686
- tir->index_size = index_size;
687
-
688
- tir->index_terms = ALLOC_N(Term *, index_size);
689
- tir->index_term_infos = ALLOC_N(TermInfo *, index_size);
690
- tir->index_pointers = ALLOC_N(int, index_size);
691
-
692
- index_te = tir->index_te;
693
- ste = index_te->data;
694
-
695
- TRY
696
- while (ste_next(index_te) != NULL) {
697
- tir->index_terms[i] = te_get_term(index_te);
698
- tir->index_term_infos[i] = te_get_ti(index_te);
699
- tir->index_pointers[i] = ste->index_pointer;
700
- i++;
701
- }
702
- XFINALLY
703
- index_te->close(index_te);
704
- tir->index_te = NULL;
705
- XENDTRY
706
- }
707
- mutex_unlock(&tir->mutex);
708
- }
709
-
710
- static inline TermEnum *tir_enum(TermInfosReader *tir)
711
- {
712
- TermEnum *te;
713
- if ((te = thread_getspecific(tir->thread_te)) == NULL) {
714
- te = tir->orig_te->clone(tir->orig_te);
715
- ary_append(tir->te_bucket, te);
716
- thread_setspecific(tir->thread_te, te);
717
- }
718
- return te;
719
- }
720
-
721
- void tir_seek_enum(TermInfosReader *tir, int ind_offset)
722
- {
723
- TermEnum *te = tir_enum(tir);
724
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
725
- ste_seek(te, tir->index_pointers[ind_offset],
726
- (ind_offset * ste->index_interval) - 1,
727
- tir->index_terms[ind_offset],
728
- tir->index_term_infos[ind_offset]);
729
- }
730
-
731
- int tir_get_index_offset(TermInfosReader *tir, Term *t)
732
- {
733
- int lo = 0; /* binary search tir->index_terms[] */
734
- int hi = tir->index_size - 1;
735
- int mid, delta;
736
- Term **index_terms = tir->index_terms;
737
-
738
- while (hi >= lo) {
739
- mid = (lo + hi) >> 1;
740
- delta = term_cmp(t, index_terms[mid]);
741
- if (delta < 0) {
742
- hi = mid - 1;
743
- } else if (delta > 0) {
744
- lo = mid + 1;
745
- } else {
746
- return mid;
747
- }
748
- }
749
- return hi;
750
- }
751
-
752
- TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
753
- {
754
- TermEnum *te;
755
- SegmentTermEnum *ste;
756
- if (tir->size == 0) {
757
- return NULL;
758
- }
759
-
760
- tir_ensure_index_is_read(tir);
761
-
762
- /* optimize sequential access: first try scanning cached enum w/o seeking */
763
- te = tir_enum(tir);
764
- ste = (SegmentTermEnum *)te->data;
765
- if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
766
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
767
- int enum_offset = (int)(ste->pos / ste->index_interval) + 1;
768
- if (tir->index_size == enum_offset ||
769
- term_cmp(t, tir->index_terms[enum_offset]) < 0) { /* but before end of block */
770
- return ste_scan_for_term_info(te, t); /* no need to seek */
771
- }
772
- }
773
-
774
- /* random-access: must seek */
775
- tir_seek_enum(tir, tir_get_index_offset(tir, t));
776
- return ste_scan_for_term_info(te, t);
777
- }
778
-
779
- Term *tir_get_term(TermInfosReader *tir, int pos)
780
- {
781
- if (tir->size == 0) {
782
- return NULL;
783
- } else {
784
- TermEnum *te = tir_enum(tir);
785
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
786
- if (pos >= ste->pos &&
787
- pos < (ste->pos + ste->index_interval)) {
788
- return ste_scan_for_term(te, pos); /* can avoid seek */
789
- }
790
-
791
- tir_seek_enum(tir, (int)(pos / ste->index_interval)); /* must seek */
792
- return ste_scan_for_term(te, pos);
793
- }
794
- }
795
-
796
- int tir_get_term_pos(TermInfosReader *tir, Term *t)
797
- {
798
- if (tir->size == 0) {
799
- return -1;
800
- } else {
801
- TermEnum *te;
802
- int ind_offset;
803
-
804
- tir_ensure_index_is_read(tir);
805
-
806
- ind_offset = tir_get_index_offset(tir, t);
807
- tir_seek_enum(tir, ind_offset);
808
-
809
- te = tir_enum(tir);
810
- while ((tb_term_cmp(te->tb_curr, t) < 0) && (ste_next(te) != NULL)) {
811
- }
812
-
813
- if (tb_term_cmp(te->tb_curr, t) == 0) {
814
- return ((SegmentTermEnum *)te->data)->pos;
815
- } else {
816
- return -1;
817
- }
818
- }
819
- }
820
-