ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_term.c DELETED
@@ -1,219 +0,0 @@
1
- #include "ferret.h"
2
- #include "index.h"
3
-
4
- VALUE cTerm;
5
- /****************************************************************************
6
- *
7
- * Term Methods
8
- *
9
- ****************************************************************************/
10
-
11
- typedef struct RTerm {
12
- VALUE field;
13
- VALUE text;
14
- } RTerm;
15
-
16
- void
17
- frt_term_mark(void *p)
18
- {
19
- RTerm *term = (RTerm *)p;
20
- rb_gc_mark(term->field);
21
- rb_gc_mark(term->text);
22
- }
23
-
24
- static VALUE
25
- frt_term_alloc(VALUE klass)
26
- {
27
- RTerm *term = ALLOC(RTerm);
28
- term->field = Qnil;
29
- term->text = Qnil;
30
- return Data_Wrap_Struct(klass, &frt_term_mark, &free, term);
31
- }
32
-
33
- #define GET_TERM RTerm *term; Data_Get_Struct(self, RTerm, term)
34
- static VALUE
35
- frt_term_init(VALUE self, VALUE rfield, VALUE rtext)
36
- {
37
- GET_TERM;
38
- term->field = rb_obj_as_string(rfield);
39
- term->text = rb_obj_as_string(rtext);
40
- return self;
41
- }
42
-
43
- VALUE
44
- frt_get_rterm(char *field, char *text)
45
- {
46
- RTerm *rterm = ALLOC(RTerm);
47
- rterm->field = rb_str_new2(field);
48
- rterm->text = rb_str_new2(text);
49
- return Data_Wrap_Struct(cTerm, &frt_term_mark, &free, rterm);
50
- }
51
-
52
- Term *
53
- frt_set_term(VALUE self, Term *t)
54
- {
55
- GET_TERM;
56
- t->field = RSTRING(term->field)->ptr;
57
- t->text = RSTRING(term->text)->ptr;
58
- return t;
59
- }
60
-
61
- Term *
62
- frt_get_term(VALUE self)
63
- {
64
- Term *t = NULL;
65
- if (self != Qnil) {
66
- GET_TERM;
67
- t = ALLOC(Term);
68
- /* store text and field in text so that field will be freed with text */
69
- t->text = ALLOC_N(char, RSTRING(term->text)->len +
70
- RSTRING(term->field)->len + 2);
71
- sprintf(t->text, "%s %s", RSTRING(term->text)->ptr,
72
- RSTRING(term->field)->ptr);
73
- t->text[RSTRING(term->text)->len] = '\0';
74
- t->field = t->text + RSTRING(term->text)->len + 1;
75
- }
76
- return t;
77
- }
78
-
79
- static VALUE
80
- frt_term_get_text(VALUE self)
81
- {
82
- GET_TERM;
83
- return term->text;
84
- }
85
-
86
- static VALUE
87
- frt_term_set_text(VALUE self, VALUE rtext)
88
- {
89
- GET_TERM;
90
- term->text = rb_obj_as_string(rtext);
91
- return Qnil;
92
- }
93
-
94
- static VALUE
95
- frt_term_get_field(VALUE self)
96
- {
97
- GET_TERM;
98
- return term->field;
99
- }
100
-
101
- static VALUE
102
- frt_term_set_field(VALUE self, VALUE rfield)
103
- {
104
- GET_TERM;
105
- term->field = rb_obj_as_string(rfield);
106
- return Qnil;
107
- }
108
-
109
- VALUE
110
- frt_term_to_s(VALUE self)
111
- {
112
- int tlen, flen;
113
- char *res;
114
- GET_TERM;
115
- tlen = RSTRING(term->text)->len;
116
- flen = RSTRING(term->field)->len;
117
- res = alloca(flen + tlen + 1);
118
-
119
- MEMCPY(res, StringValuePtr(term->field), char, flen);
120
- res[flen] = ':';
121
- MEMCPY(res + flen + 1, StringValuePtr(term->text), char, tlen);
122
- return rb_str_new(res, tlen + flen + 1 );
123
- }
124
-
125
- inline int
126
- frt_term_cmp(RTerm *t1, RTerm *t2)
127
- {
128
- int comp = rb_str_cmp(t1->field, t2->field);
129
- if (comp == 0) {
130
- comp = rb_str_cmp(t1->text, t2->text);
131
- }
132
- return comp;
133
- }
134
-
135
- int
136
- frt_term_compare_to_int(VALUE self, VALUE rother)
137
- {
138
- RTerm *other;
139
- GET_TERM;
140
- Data_Get_Struct(rother, RTerm, other);
141
- return frt_term_cmp(term, other);
142
- }
143
-
144
- VALUE
145
- frt_term_lt(VALUE self, VALUE rother)
146
- {
147
- return frt_term_compare_to_int(self, rother) < 0 ? Qtrue : Qfalse;
148
- }
149
-
150
- VALUE
151
- frt_term_gt(VALUE self, VALUE rother)
152
- {
153
- return frt_term_compare_to_int(self, rother) > 0 ? Qtrue : Qfalse;
154
- }
155
-
156
- VALUE
157
- frt_term_le(VALUE self, VALUE rother)
158
- {
159
- return frt_term_compare_to_int(self, rother) <= 0 ? Qtrue : Qfalse;
160
- }
161
-
162
- VALUE
163
- frt_term_ge(VALUE self, VALUE rother)
164
- {
165
- return frt_term_compare_to_int(self, rother) >= 0 ? Qtrue : Qfalse;
166
- }
167
-
168
- VALUE
169
- frt_term_eq(VALUE self, VALUE rother)
170
- {
171
- if (rother == Qnil)
172
- return Qfalse;
173
- return frt_term_compare_to_int(self, rother) == 0 ? Qtrue : Qfalse;
174
- }
175
-
176
-
177
- static VALUE
178
- frt_term_compare_to(VALUE self, VALUE other)
179
- {
180
- return INT2FIX(frt_term_compare_to_int(self, other));
181
- }
182
-
183
- static VALUE
184
- frt_term_hash(VALUE self)
185
- {
186
- GET_TERM;
187
- return INT2FIX(rb_str_hash(term->field) + rb_str_hash(term->text));
188
- }
189
-
190
- /****************************************************************************
191
- *
192
- * Init Function
193
- *
194
- ****************************************************************************/
195
-
196
- void
197
- Init_term(void)
198
- {
199
- /* Term */
200
- cTerm = rb_define_class_under(mIndex, "Term", rb_cObject);
201
- rb_define_alloc_func(cTerm, frt_term_alloc);
202
- rb_include_module(cTerm, rb_mComparable);
203
-
204
- rb_define_method(cTerm, "initialize", frt_term_init, 2);
205
- rb_define_method(cTerm, "set!", frt_term_init, 2);
206
- rb_define_method(cTerm, "to_s", frt_term_to_s, 0);
207
- rb_define_method(cTerm, "<=>", frt_term_compare_to, 1);
208
- rb_define_method(cTerm, "<", frt_term_lt, 1);
209
- rb_define_method(cTerm, ">", frt_term_gt, 1);
210
- rb_define_method(cTerm, "<=", frt_term_le, 1);
211
- rb_define_method(cTerm, ">=", frt_term_ge, 1);
212
- rb_define_method(cTerm, "eql?", frt_term_eq, 1);
213
- rb_define_method(cTerm, "==", frt_term_eq, 1);
214
- rb_define_method(cTerm, "text", frt_term_get_text, 0);
215
- rb_define_method(cTerm, "text=", frt_term_set_text, 1);
216
- rb_define_method(cTerm, "field", frt_term_get_field, 0);
217
- rb_define_method(cTerm, "field=", frt_term_set_field, 1);
218
- rb_define_method(cTerm, "hash", frt_term_hash, 0);
219
- }
data/ext/term.c DELETED
@@ -1,820 +0,0 @@
1
- #include "index.h"
2
- #include "helper.h"
3
- #include "hash.h"
4
- #include <string.h>
5
-
6
- static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
7
- static char * const TERM_ORDER_ERROR_MSG = "term out of order";
8
- static char * const FP_ORDER_ERROR_MSG = "freq pointer out of order";
9
- static char * const PP_ORDER_ERROR_MSG = "prox pointer out of order";
10
-
11
- /****************************************************************************
12
- *
13
- * Term
14
- *
15
- ****************************************************************************/
16
-
17
- Term *term_clone(Term *term)
18
- {
19
- Term *t = ALLOC(Term);
20
-
21
- t->field = term->field;
22
- t->text = estrdup(term->text);
23
- return t;
24
- }
25
-
26
- Term *term_create(const char *field, char *text)
27
- {
28
- Term *t = ALLOC(Term);
29
-
30
- t->field = (char *)field;
31
- t->text = estrdup(text);
32
- return t;
33
- }
34
-
35
- void term_destroy(Term *self)
36
- {
37
- free(self->text);
38
- free(self);
39
- }
40
-
41
- int term_cmp(void *t1, void *t2)
42
- {
43
- int res = strcmp(((Term *)t1)->field, ((Term *)t2)->field);
44
- if (res != 0) {
45
- return res;
46
- } else {
47
- return strcmp(((Term *)t1)->text, ((Term *)t2)->text);
48
- }
49
- }
50
-
51
- int term_eq(const void *t1, const void *t2)
52
- {
53
- return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
54
- (strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
55
- }
56
-
57
- unsigned int term_hash(const void *t)
58
- {
59
- return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
60
- }
61
-
62
- char *term_to_s(Term *term)
63
- {
64
- char *string = ALLOC_N(char, strlen(term->field) + strlen(term->text) + 2);
65
- sprintf(string, "%s:%s", term->field, term->text);
66
- return string;
67
- }
68
-
69
- /****************************************************************************
70
- *
71
- * TermBuffer
72
- *
73
- ****************************************************************************/
74
-
75
- void tb_reset(TermBuffer *tb)
76
- {
77
- tb->field = (char *)EMPTY_STRING;
78
- tb->text[0] = '\0';
79
- }
80
-
81
- TermBuffer *tb_create()
82
- {
83
- TermBuffer *tb = ALLOC(TermBuffer);
84
- tb->field = (char *)EMPTY_STRING;
85
- tb->text[0] = '\0';
86
- return tb;
87
- }
88
-
89
- void tb_destroy(TermBuffer *tb)
90
- {
91
- free(tb);
92
- }
93
-
94
- TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
95
- {
96
- tb->field = t->field;
97
- strcpy(tb->text, t->text);
98
- return tb;
99
- }
100
-
101
- Term *tb_get_term(TermBuffer *tb)
102
- {
103
- return term_create(tb->field, tb->text);
104
- }
105
-
106
- int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
107
- {
108
- int res;
109
- if ((tb1->field != tb2->field) &&
110
- (0 != (res = strcmp(tb1->field, tb2->field)))) {
111
- return res;
112
- } else {
113
- return strcmp(tb1->text, tb2->text);
114
- }
115
- }
116
-
117
- int tb_term_cmp(TermBuffer *tb, Term *t)
118
- {
119
- int res = strcmp(tb->field, t->field);
120
- if (res != 0) {
121
- return res;
122
- } else {
123
- return strcmp(tb->text, t->text);
124
- }
125
- }
126
-
127
- TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
128
- {
129
- tb1->field = tb2->field;
130
- strcpy(tb1->text, tb2->text);
131
- return tb1;
132
- }
133
-
134
- TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
135
- {
136
- signed int fnum;
137
- int start = (int)is_read_vint(is);
138
- int length = (int)is_read_vint(is);
139
- int total_length = start + length;
140
- is_read_bytes(is, (uchar *)tb->text, start, length);
141
- tb->text[total_length] = '\0';
142
- fnum = (signed int)is_read_vint(is);
143
- if (fnum < 0)
144
- tb->field = (char *)EMPTY_STRING;
145
- else
146
- tb->field = fis->by_number[fnum]->name;
147
- return tb;
148
- }
149
-
150
- /****************************************************************************
151
- *
152
- * TermInfo
153
- *
154
- ****************************************************************************/
155
-
156
- TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
157
- {
158
- TermInfo *ti = ALLOC(TermInfo);
159
- ti->doc_freq = doc_freq;
160
- ti->freq_pointer = freq_pointer;
161
- ti->prox_pointer = prox_pointer;
162
- ti->skip_offset = skip_offset;
163
- return ti;
164
- }
165
-
166
- TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
167
- {
168
- ti->doc_freq = doc_freq;
169
- ti->freq_pointer = freq_pointer;
170
- ti->prox_pointer = prox_pointer;
171
- ti->skip_offset = skip_offset;
172
- return ti;
173
- }
174
-
175
- void ti_destroy(TermInfo *ti)
176
- {
177
- free(ti);
178
- }
179
-
180
- TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
181
- {
182
- memcpy(ti, other, sizeof(TermInfo));
183
- return ti;
184
- }
185
-
186
- TermInfo *ti_clone(TermInfo *other)
187
- {
188
- return ti_create(other->doc_freq,
189
- other->freq_pointer, other->prox_pointer, other->skip_offset);
190
- }
191
-
192
- int ti_eq(TermInfo *ti, TermInfo *other)
193
- {
194
- return (memcmp(ti, other, sizeof(TermInfo)) == 0);
195
- }
196
-
197
- /****************************************************************************
198
- *
199
- * TermEnum
200
- *
201
- ****************************************************************************/
202
-
203
- TermEnum *te_create()
204
- {
205
- TermEnum *te = ALLOC(TermEnum);
206
- te->tb_curr = tb_create();
207
- te->tb_prev = tb_create();
208
- te->ti_curr = ti_create(0, 0, 0, 0);
209
- return te;
210
- }
211
-
212
- void te_destroy(TermEnum *te)
213
- {
214
- tb_destroy(te->tb_curr);
215
- tb_destroy(te->tb_prev);
216
- ti_destroy(te->ti_curr);
217
- free(te);
218
- }
219
-
220
- Term *te_get_term(TermEnum *te)
221
- {
222
- return tb_get_term(te->tb_curr);
223
- }
224
-
225
- TermInfo *te_get_ti(TermEnum *te)
226
- {
227
- TermInfo *ti = te->ti_curr;
228
- return ti_create(ti->doc_freq, ti->freq_pointer, ti->prox_pointer, ti->skip_offset);
229
- }
230
-
231
- TermBuffer *te_skip_to(TermEnum *te, Term *t)
232
- {
233
- TermBuffer *tb_curr;
234
- if (tb_term_cmp(te->tb_curr, t) == 0)
235
- return te->tb_curr;
236
-
237
- while (((tb_curr = te->next(te)) != NULL) &&
238
- (tb_term_cmp(tb_curr, t) < 0)) {
239
- }
240
- return tb_curr;
241
- }
242
-
243
- /****************************************************************************
244
- *
245
- * SegmentTermEnum
246
- *
247
- ****************************************************************************/
248
-
249
- #define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data
250
-
251
- TermBuffer *ste_next(TermEnum *te)
252
- {
253
- GET_STE;
254
- TermInfo *ti;
255
- InStream *is = ste->is;
256
- ste->pos++;
257
- if (ste->pos > ste->size - 1) {
258
- tb_reset(te->tb_curr);
259
- return NULL;
260
- }
261
-
262
- tb_cpy(te->tb_prev, te->tb_curr);
263
- tb_read(te->tb_curr, is, ste->fis);
264
-
265
- ti = te->ti_curr;
266
- ti->doc_freq = (int)is_read_vint(is); /* read doc freq */
267
- ti->freq_pointer += (int)is_read_vint(is);/* read freq pointer */
268
- ti->prox_pointer += (int)is_read_vint(is);/* read prox pointer */
269
-
270
- if (ste->format == -1) {
271
- /* just read skip_offset in order to increment file pointer
272
- * value is never used since skip_to is switched off */
273
- if (!ste->is_index) {
274
- if (ti->doc_freq > ste->format_m1skip_interval) {
275
- ti->skip_offset = (int)is_read_vint(is);
276
- }
277
- }
278
- } else {
279
- if (ti->doc_freq >= ste->skip_interval) {
280
- ti->skip_offset = (int)is_read_vint(is);
281
- }
282
- }
283
-
284
- if (ste->is_index) {
285
- ste->index_pointer += (int)is_read_vint(is); /* read index pointer */
286
- }
287
-
288
- return te->tb_curr;
289
- }
290
-
291
- TermEnum *ste_clone(TermEnum *other_te);
292
-
293
- TermEnum *ste_allocate()
294
- {
295
- TermEnum *te = te_create();
296
- SegmentTermEnum *ste;
297
-
298
- te->next = &ste_next;
299
- te->close = &ste_close;
300
- te->clone = &ste_clone;
301
- ste = ALLOC(SegmentTermEnum);
302
- te->data = ste;
303
- return te;
304
- }
305
-
306
- TermEnum *ste_clone(TermEnum *other_te)
307
- {
308
- SegmentTermEnum *other_ste = (SegmentTermEnum *)other_te->data;
309
- TermEnum *te = ste_allocate();
310
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
311
-
312
- memcpy(ste, other_ste, sizeof(SegmentTermEnum));
313
- ste->is = is_clone(other_ste->is);
314
- tb_cpy(te->tb_curr, other_te->tb_curr);
315
- tb_cpy(te->tb_prev, other_te->tb_prev);
316
- ti_cpy(te->ti_curr, other_te->ti_curr);
317
- return te;
318
- }
319
-
320
- void ste_close(TermEnum *te)
321
- {
322
- GET_STE;
323
- is_close(ste->is);
324
- free(ste);
325
- te->data = NULL;
326
- te_destroy(te);
327
- }
328
-
329
- TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
330
- {
331
- TermEnum *te = ste_allocate();
332
- GET_STE;
333
- int first_int;
334
-
335
- ste->fis = fis;
336
- ste->is_index = is_index;
337
- ste->is = is;
338
- ste->pos = -1;
339
- ste->index_pointer = 0;
340
- ste->format_m1skip_interval = -1;
341
-
342
- first_int = (int)is_read_int(is);
343
-
344
- if (first_int >= 0) {
345
- /* original-format file, without explicit format version number */
346
- ste->format = 0;
347
- ste->size = first_int;
348
-
349
- /* back-compatible settings */
350
- ste->index_interval = 128;
351
- ste->skip_interval = INT_MAX; /* switch off skip_to optimization */
352
-
353
- } else {
354
- /* check that it is a format we can understand */
355
- if (first_int < TERM_INFO_FORMAT)
356
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
357
-
358
- /* we have a format version number */
359
- ste->format = first_int;
360
-
361
-
362
- ste->size = (int)is_read_long(is); /* read the size */
363
-
364
- if (ste->format == -1) {
365
- if (!ste->is_index) {
366
- ste->index_interval = is_read_int(is);
367
- ste->format_m1skip_interval = is_read_int(is);
368
- }
369
- /* switch off skip_to optimization for file format prior to
370
- * 1.4rc2 in order to avoid a bug in skip_to implementation
371
- * of these versions */
372
- ste->skip_interval = INT_MAX;
373
- } else {
374
- ste->index_interval = is_read_int(is);
375
- ste->skip_interval = is_read_int(is);
376
- }
377
- }
378
- return te;
379
- }
380
-
381
- void ste_seek(TermEnum *te, int pointer, int pos, Term *t, TermInfo *ti)
382
- {
383
- GET_STE;
384
- is_seek(ste->is, pointer);
385
- ste->pos = pos;
386
- tb_set_term(te->tb_curr, t);
387
- tb_reset(te->tb_prev);
388
- ti_cpy(te->ti_curr, ti);
389
- }
390
-
391
- TermInfo *ste_scan_for_term_info(TermEnum *te, Term *t)
392
- {
393
- te_skip_to(te, t);
394
-
395
- if (tb_term_cmp(te->tb_curr, t) == 0) {
396
- return te_get_ti(te);
397
- } else {
398
- return NULL;
399
- }
400
- }
401
-
402
- Term *ste_scan_for_term(TermEnum *te, int pos)
403
- {
404
- GET_STE;
405
- while (ste->pos < pos) {
406
- if (ste_next(te) == NULL)
407
- return NULL;
408
- }
409
-
410
- return te_get_term(te);
411
- }
412
-
413
- /****************************************************************************
414
- *
415
- * MultiTermEnum
416
- *
417
- ****************************************************************************/
418
-
419
- #define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data
420
-
421
- TermBuffer *mte_next(TermEnum *te)
422
- {
423
- GET_MTE;
424
- SegmentMergeInfo *top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
425
-
426
- if (top == NULL) {
427
- tb_reset(te->tb_curr);
428
- return false;
429
- }
430
-
431
- tb_cpy(te->tb_prev, te->tb_curr);
432
- tb_cpy(te->tb_curr, top->tb);
433
-
434
- te->ti_curr->doc_freq = 0;
435
-
436
- while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
437
- pq_pop(mte->smi_queue);
438
- te->ti_curr->doc_freq += top->te->ti_curr->doc_freq;/* increment freq */
439
- if (smi_next(top)) {
440
- pq_push(mte->smi_queue, top); /* restore queue */
441
- } else {
442
- smi_destroy(top); /* done with a segment */
443
- }
444
- top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
445
- }
446
- return te->tb_curr;
447
- }
448
-
449
- void mte_close(TermEnum *te)
450
- {
451
- GET_MTE;
452
- pq_clear(mte->smi_queue);
453
- pq_destroy(mte->smi_queue);
454
- free(mte);
455
- te_destroy(te);
456
- }
457
-
458
- TermEnum *mte_clone(TermEnum *te)
459
- {
460
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
461
- return NULL;
462
- }
463
-
464
- TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
465
- {
466
- int i;
467
- IndexReader *reader;
468
- TermEnum *sub_te;
469
- MultiTermEnum *mte = ALLOC(MultiTermEnum);
470
- TermEnum *te = te_create();
471
- te->next = &mte_next;
472
- te->clone = &mte_clone;
473
- te->close = &mte_close;
474
-
475
- te->data = mte;
476
-
477
- mte->smi_queue = pq_create(rcnt, (lt_ft)&smi_lt);
478
- mte->smi_queue->free_elem = (free_ft)&smi_destroy;
479
-
480
- for (i = 0; i < rcnt; i++) {
481
- SegmentMergeInfo *smi;
482
- reader = readers[i];
483
-
484
- if (t != NULL) {
485
- sub_te = reader->terms_from(reader, t);
486
- } else {
487
- sub_te = reader->terms(reader);
488
- }
489
-
490
- smi = smi_create(starts[i], sub_te, reader);
491
- if (((t == NULL) && smi_next(smi)) ||
492
- (sub_te->tb_curr->field != (char *)EMPTY_STRING)) {
493
- pq_push(mte->smi_queue, smi); /* initialize queue */
494
- } else {
495
- smi_destroy(smi);
496
- }
497
- }
498
-
499
- if ((t != NULL) && (mte->smi_queue->count > 0)) {
500
- mte_next(te);
501
- }
502
-
503
- return te;
504
- }
505
-
506
- /****************************************************************************
507
- *
508
- * TermInfosWriter
509
- *
510
- ****************************************************************************/
511
-
512
- const Term EmptyTerm = {"", ""};
513
-
514
- TermInfosWriter *tiw_open_internal(Store *store,
515
- char *segment,
516
- FieldInfos *fis,
517
- int interval,
518
- int is_index)
519
- {
520
- char fname[SEGMENT_NAME_MAX_LENGTH];
521
- TermInfosWriter *tiw = ALLOC(TermInfosWriter);
522
- OutStream *os;
523
-
524
- tiw->index_interval = interval;
525
- tiw->skip_interval = 16;
526
- tiw->last_index_pointer = 0;
527
- tiw->last_term = (Term *)&EmptyTerm;
528
- tiw->last_term_info = ti_create(0,0,0,0);
529
- tiw->size = 0;
530
- tiw->is_index = is_index;
531
- tiw->fis = fis;
532
- tiw->curr_field = NULL;
533
- tiw->curr_field_num = -1;
534
-
535
- strcpy(fname, segment);
536
- strcat(fname, (is_index ? ".tii" : ".tis"));
537
- os = tiw->os = store->create_output(store, fname);
538
- os_write_int(os, TERM_INFO_FORMAT); /* write format */
539
- os_write_long(os, 0); /* leave space for size */
540
- os_write_int(os, tiw->index_interval); /* write index_interval */
541
- os_write_int(os, tiw->skip_interval); /* write skip_interval */
542
- if (!is_index) {
543
- tiw->other = tiw_open_internal(store, segment, fis, interval, true);
544
- tiw->other->other = tiw;
545
- }
546
- return tiw;
547
- }
548
-
549
- TermInfosWriter *tiw_open(Store *store, char *segment, FieldInfos *fis, int interval)
550
- {
551
- return tiw_open_internal(store, segment, fis, interval, false);
552
- }
553
-
554
- void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
555
- {
556
- //printf("%s, %s\n", tiw->last_term->text, t->text);
557
- int start = hlp_string_diff(tiw->last_term->text, t->text);
558
- int length = (int)strlen(t->text) - start;
559
-
560
- os_write_vint(os, start); /* write shared prefix length */
561
- os_write_vint(os, length); /* write delta length */
562
- os_write_chars(os, t->text, start, length); /* write delta chars */
563
- if (tiw->curr_field != t->field) {
564
- tiw->curr_field = t->field;
565
- tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
566
- }
567
- os_write_vint(os, tiw->curr_field_num);
568
- tiw->last_term = t;
569
- }
570
-
571
- void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
572
- {
573
- #ifdef DEBUG
574
- if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
575
- RAISE(STATE_ERROR, TERM_ORDER_ERROR_MSG);
576
- }
577
- if (ti->freq_pointer < tiw->last_term_info->freq_pointer) {
578
- RAISE(STATE_ERROR, FP_ORDER_ERROR_MSG);
579
- }
580
- if (ti->prox_pointer < tiw->last_term_info->prox_pointer) {
581
- RAISE(STATE_ERROR, PP_ORDER_ERROR_MSG);
582
- }
583
- #endif
584
-
585
- if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0) {
586
- /* add an index term */
587
- tiw_add(tiw->other, tiw->last_term, tiw->last_term_info);
588
- }
589
-
590
- tiw_write_term(tiw, tiw->os, t); /* write term */
591
- os_write_vint(tiw->os, ti->doc_freq); /* write doc freq */
592
- os_write_vint(tiw->os, ti->freq_pointer - tiw->last_term_info->freq_pointer);
593
- os_write_vint(tiw->os, ti->prox_pointer - tiw->last_term_info->prox_pointer);
594
-
595
- if (ti->doc_freq >= tiw->skip_interval) {
596
- os_write_vint(tiw->os, ti->skip_offset);
597
- }
598
-
599
- if (tiw->is_index) {
600
- OutStream *other_os = tiw->other->os;
601
- int other_pos = os_pos(other_os);
602
- os_write_vint(tiw->os, other_pos - tiw->last_index_pointer);
603
- tiw->last_index_pointer = other_pos; /* write pointer */
604
- }
605
-
606
- ti_cpy(tiw->last_term_info, ti);
607
- tiw->size++;
608
- }
609
-
610
- void tiw_close(TermInfosWriter *tiw)
611
- {
612
- OutStream *os = tiw->os;
613
- os_seek(os, 4); /* write @size after format */
614
- os_write_long(os, tiw->size);
615
- os_close(os);
616
-
617
- if (!tiw->is_index)
618
- tiw_close(tiw->other);
619
-
620
- ti_destroy(tiw->last_term_info);
621
- free(tiw);
622
- }
623
-
624
- /****************************************************************************
625
- *
626
- * TermInfosReader
627
- *
628
- ****************************************************************************/
629
-
630
- void tir_close(TermInfosReader *tir)
631
- {
632
- int i;
633
- if (tir->index_terms != NULL) {
634
- for (i = 0; i < tir->index_size; i++) {
635
- term_destroy(tir->index_terms[i]);
636
- ti_destroy(tir->index_term_infos[i]);
637
- }
638
- free(tir->index_terms);
639
- free(tir->index_term_infos);
640
- free(tir->index_pointers);
641
- }
642
- if (tir->orig_te) tir->orig_te->close(tir->orig_te);
643
- thread_key_delete(tir->thread_te);
644
- ary_destroy(tir->te_bucket);
645
- if (tir->index_te) tir->index_te->close(tir->index_te);
646
- mutex_destroy(&tir->mutex);
647
- free(tir);
648
- }
649
-
650
- TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
651
- {
652
- SegmentTermEnum *ste;
653
- TermInfosReader *tir = ALLOC(TermInfosReader);
654
- char fname[SEGMENT_NAME_MAX_LENGTH];
655
- InStream *is;
656
-
657
- mutex_init(&tir->mutex, NULL);
658
- strcpy(fname, segment);
659
- strcpy(fname + strlen(segment), ".tis");
660
- is = store->open_input(store, fname);
661
- tir->orig_te = ste_create(is, fis, false);
662
- thread_key_create(&tir->thread_te, NULL);
663
- tir->te_bucket = ary_create(1, (free_ft)tir->orig_te->close);
664
-
665
- ste = tir->orig_te->data;
666
- tir->size = ste->size;
667
- tir->skip_interval = ste->skip_interval;
668
-
669
- strcpy(fname + strlen(segment), ".tii");
670
- is = store->open_input(store, fname);
671
- tir->index_te = ste_create(is, fis, true);
672
- tir->index_terms = NULL;
673
- tir->index_term_infos = NULL;
674
- tir->index_pointers = NULL;
675
- return tir;
676
- }
677
-
678
- void tir_ensure_index_is_read(TermInfosReader *tir)
679
- {
680
- mutex_lock(&tir->mutex);
681
- if (tir->index_terms == NULL) {
682
- TermEnum *index_te;
683
- SegmentTermEnum *ste;
684
- int i = 0;
685
- int index_size = ((SegmentTermEnum *)tir->index_te->data)->size;
686
- tir->index_size = index_size;
687
-
688
- tir->index_terms = ALLOC_N(Term *, index_size);
689
- tir->index_term_infos = ALLOC_N(TermInfo *, index_size);
690
- tir->index_pointers = ALLOC_N(int, index_size);
691
-
692
- index_te = tir->index_te;
693
- ste = index_te->data;
694
-
695
- TRY
696
- while (ste_next(index_te) != NULL) {
697
- tir->index_terms[i] = te_get_term(index_te);
698
- tir->index_term_infos[i] = te_get_ti(index_te);
699
- tir->index_pointers[i] = ste->index_pointer;
700
- i++;
701
- }
702
- XFINALLY
703
- index_te->close(index_te);
704
- tir->index_te = NULL;
705
- XENDTRY
706
- }
707
- mutex_unlock(&tir->mutex);
708
- }
709
-
710
- static inline TermEnum *tir_enum(TermInfosReader *tir)
711
- {
712
- TermEnum *te;
713
- if ((te = thread_getspecific(tir->thread_te)) == NULL) {
714
- te = tir->orig_te->clone(tir->orig_te);
715
- ary_append(tir->te_bucket, te);
716
- thread_setspecific(tir->thread_te, te);
717
- }
718
- return te;
719
- }
720
-
721
- void tir_seek_enum(TermInfosReader *tir, int ind_offset)
722
- {
723
- TermEnum *te = tir_enum(tir);
724
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
725
- ste_seek(te, tir->index_pointers[ind_offset],
726
- (ind_offset * ste->index_interval) - 1,
727
- tir->index_terms[ind_offset],
728
- tir->index_term_infos[ind_offset]);
729
- }
730
-
731
- int tir_get_index_offset(TermInfosReader *tir, Term *t)
732
- {
733
- int lo = 0; /* binary search tir->index_terms[] */
734
- int hi = tir->index_size - 1;
735
- int mid, delta;
736
- Term **index_terms = tir->index_terms;
737
-
738
- while (hi >= lo) {
739
- mid = (lo + hi) >> 1;
740
- delta = term_cmp(t, index_terms[mid]);
741
- if (delta < 0) {
742
- hi = mid - 1;
743
- } else if (delta > 0) {
744
- lo = mid + 1;
745
- } else {
746
- return mid;
747
- }
748
- }
749
- return hi;
750
- }
751
-
752
- TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
753
- {
754
- TermEnum *te;
755
- SegmentTermEnum *ste;
756
- if (tir->size == 0) {
757
- return NULL;
758
- }
759
-
760
- tir_ensure_index_is_read(tir);
761
-
762
- /* optimize sequential access: first try scanning cached enum w/o seeking */
763
- te = tir_enum(tir);
764
- ste = (SegmentTermEnum *)te->data;
765
- if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
766
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
767
- int enum_offset = (int)(ste->pos / ste->index_interval) + 1;
768
- if (tir->index_size == enum_offset ||
769
- term_cmp(t, tir->index_terms[enum_offset]) < 0) { /* but before end of block */
770
- return ste_scan_for_term_info(te, t); /* no need to seek */
771
- }
772
- }
773
-
774
- /* random-access: must seek */
775
- tir_seek_enum(tir, tir_get_index_offset(tir, t));
776
- return ste_scan_for_term_info(te, t);
777
- }
778
-
779
- Term *tir_get_term(TermInfosReader *tir, int pos)
780
- {
781
- if (tir->size == 0) {
782
- return NULL;
783
- } else {
784
- TermEnum *te = tir_enum(tir);
785
- SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
786
- if (pos >= ste->pos &&
787
- pos < (ste->pos + ste->index_interval)) {
788
- return ste_scan_for_term(te, pos); /* can avoid seek */
789
- }
790
-
791
- tir_seek_enum(tir, (int)(pos / ste->index_interval)); /* must seek */
792
- return ste_scan_for_term(te, pos);
793
- }
794
- }
795
-
796
- int tir_get_term_pos(TermInfosReader *tir, Term *t)
797
- {
798
- if (tir->size == 0) {
799
- return -1;
800
- } else {
801
- TermEnum *te;
802
- int ind_offset;
803
-
804
- tir_ensure_index_is_read(tir);
805
-
806
- ind_offset = tir_get_index_offset(tir, t);
807
- tir_seek_enum(tir, ind_offset);
808
-
809
- te = tir_enum(tir);
810
- while ((tb_term_cmp(te->tb_curr, t) < 0) && (ste_next(te) != NULL)) {
811
- }
812
-
813
- if (tb_term_cmp(te->tb_curr, t) == 0) {
814
- return ((SegmentTermEnum *)te->data)->pos;
815
- } else {
816
- return -1;
817
- }
818
- }
819
- }
820
-