ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/index_io.c DELETED
@@ -1,382 +0,0 @@
1
- #include <store.h>
2
- #include <string.h>
3
-
4
- static char * const STORE_EOF_ERROR_MSG = "EOF Error when trying to refill";
5
-
6
- Buffer *buf_create()
7
- {
8
- Buffer *buf = ALLOC(Buffer);
9
- buf->start = 0;
10
- buf->pos = 0;
11
- buf->len = 0;
12
- return buf;
13
- }
14
-
15
- void buf_destroy(Buffer *buf)
16
- {
17
- free(buf);
18
- }
19
-
20
- OutStream *os_create()
21
- {
22
- OutStream *os = ALLOC(OutStream);
23
- os->buf.start = 0;
24
- os->buf.pos = 0;
25
- os->buf.len = 0;
26
- return os;
27
- }
28
-
29
- inline void os_flush(OutStream *os)
30
- {
31
- os->flush_internal(os, os->buf.buf, os->buf.pos);
32
- os->buf.start += os->buf.pos;
33
- os->buf.pos = 0;
34
- }
35
-
36
- void os_close(OutStream *os)
37
- {
38
- os_flush(os);
39
- os->close_internal(os);
40
- free(os);
41
- }
42
-
43
- int os_pos(OutStream *os)
44
- {
45
- return os->buf.start + os->buf.pos;
46
- }
47
-
48
- void os_seek(OutStream *os, int new_pos)
49
- {
50
- os_flush(os);
51
- os->buf.start = new_pos;
52
- os->seek_internal(os, new_pos);
53
- }
54
-
55
- #define write_byte(os, b) os->buf.buf[os->buf.pos++] = b
56
-
57
- inline void os_write_byte(OutStream *os, uchar b)
58
- {
59
- if (os->buf.pos >= BUFFER_SIZE) {
60
- os_flush(os);
61
- }
62
- write_byte(os, b);
63
- }
64
-
65
- void os_write_bytes(OutStream *os, uchar *b, int len)
66
- {
67
- if (os->buf.pos > 0) { /* flush buffer */
68
- os_flush(os);
69
- }
70
-
71
- if (len < BUFFER_SIZE) {
72
- os->flush_internal(os, b, len);
73
- os->buf.start += len;
74
- } else {
75
- int pos = 0;
76
- int size;
77
- while (pos < len) {
78
- if (len - pos < BUFFER_SIZE) {
79
- size = len - pos;
80
- } else {
81
- size = BUFFER_SIZE;
82
- }
83
- os->flush_internal(os, b + pos, size);
84
- pos += size;
85
- os->buf.start += size;
86
- }
87
- }
88
- }
89
-
90
- InStream *is_create()
91
- {
92
- InStream *is = ALLOC(InStream);
93
- is->buf.start = 0;
94
- is->buf.pos = 0;
95
- is->buf.len = 0;
96
- return is;
97
- }
98
-
99
- void is_refill(InStream *is)
100
- {
101
- int start = is->buf.start + is->buf.pos;
102
- int last = start + BUFFER_SIZE;
103
- int flen = is->length_internal(is);
104
- if (last > flen) { /* don't read past EOF */
105
- last = flen;
106
- }
107
-
108
- is->buf.len = last - start;
109
- if (is->buf.len <= 0) {
110
- RAISE(EOF_ERROR, STORE_EOF_ERROR_MSG);
111
- }
112
-
113
- is->read_internal(is, is->buf.buf, 0, is->buf.len);
114
-
115
- is->buf.start = start;
116
- is->buf.pos = 0;
117
- }
118
-
119
- #define read_byte(is) is->buf.buf[is->buf.pos++]
120
- inline uchar is_read_byte(InStream *is)
121
- {
122
- if (is->buf.pos >= is->buf.len) {
123
- is_refill(is);
124
- }
125
-
126
- return read_byte(is);
127
- }
128
-
129
- int is_pos(InStream *is)
130
- {
131
- return is->buf.start + is->buf.pos;
132
- }
133
-
134
- uchar *is_read_bytes(InStream *is, uchar *b, int offset, int len)
135
- {
136
- int i, start;
137
- if ((offset + len) < BUFFER_SIZE) {
138
- for (i = offset; i < offset + len; i++) {
139
- b[i] = is_read_byte(is);
140
- }
141
- } else { // read all-at-once
142
- start = is_pos(is);
143
- is->seek_internal(is, start);
144
- is->read_internal(is, b, offset, len);
145
-
146
- is->buf.start = start + len; // adjust stream variables
147
- is->buf.pos = 0;
148
- is->buf.len = 0; // trigger refill on read
149
- }
150
- return b;
151
- }
152
-
153
- void is_seek(InStream *is, int pos)
154
- {
155
- if (pos >= is->buf.start && pos < (is->buf.start + is->buf.len)) {
156
- is->buf.pos = pos - is->buf.start; // seek within buffer
157
- } else {
158
- is->buf.start = pos;
159
- is->buf.pos = 0;
160
- is->buf.len = 0; // trigger refill() on read()
161
- is->seek_internal(is, pos);
162
- }
163
- }
164
-
165
- void is_close(InStream *is)
166
- {
167
- is->close_internal(is);
168
- free(is);
169
- }
170
-
171
- InStream *is_clone(InStream *is)
172
- {
173
- InStream *new_index_i = ALLOC(InStream);
174
- memcpy(new_index_i, is, sizeof(InStream));
175
- new_index_i->is_clone = true;
176
- is->clone_internal(is, new_index_i);
177
- return new_index_i;
178
- }
179
-
180
- int
181
- is_read_int(InStream *is)
182
- {
183
- return ((int)is_read_byte(is) << 24) |
184
- ((int)is_read_byte(is) << 16) |
185
- ((int)is_read_byte(is) << 8) |
186
- (int)is_read_byte(is);
187
- }
188
-
189
- llong
190
- is_read_long(InStream *is)
191
- {
192
- return ((llong)is_read_byte(is) << 56) |
193
- ((llong)is_read_byte(is) << 48) |
194
- ((llong)is_read_byte(is) << 40) |
195
- ((llong)is_read_byte(is) << 32) |
196
- ((llong)is_read_byte(is) << 24) |
197
- ((llong)is_read_byte(is) << 16) |
198
- ((llong)is_read_byte(is) << 8) |
199
- (llong)is_read_byte(is);
200
- }
201
-
202
- unsigned int
203
- is_read_uint(InStream *is)
204
- {
205
- return ((unsigned int)is_read_byte(is) << 24) |
206
- ((unsigned int)is_read_byte(is) << 16) |
207
- ((unsigned int)is_read_byte(is) << 8) |
208
- (unsigned int)is_read_byte(is);
209
- }
210
-
211
- ullong
212
- is_read_ulong(InStream *is)
213
- {
214
- return ((ullong)is_read_byte(is) << 56) |
215
- ((ullong)is_read_byte(is) << 48) |
216
- ((ullong)is_read_byte(is) << 40) |
217
- ((ullong)is_read_byte(is) << 32) |
218
- ((ullong)is_read_byte(is) << 24) |
219
- ((ullong)is_read_byte(is) << 16) |
220
- ((ullong)is_read_byte(is) << 8) |
221
- (ullong)is_read_byte(is);
222
- }
223
-
224
- /* optimized to use unchecked read_byte if there is definitely space */
225
- inline ullong
226
- is_read_vint(InStream *is)
227
- {
228
- register ullong res, b;
229
- register int shift = 7;
230
-
231
- if (is->buf.pos > (is->buf.len - VINT_MAX_LEN)) {
232
- b = is_read_byte(is);
233
- res = b & 0x7F; // 0x7F = 0b01111111
234
-
235
- while ((b & 0x80) != 0) {// 0x80 = 0b10000000
236
- b = is_read_byte(is);
237
- res |= (b & 0x7F) << shift;
238
- shift += 7;
239
- }
240
- } else { // unchecked
241
- b = read_byte(is);
242
- res = b & 0x7F; // 0x7F = 0b01111111
243
-
244
- while ((b & 0x80) != 0) {// 0x80 = 0b10000000
245
- b = read_byte(is);
246
- res |= (b & 0x7F) << shift;
247
- shift += 7;
248
- }
249
- }
250
-
251
- return res;
252
- }
253
-
254
- inline void
255
- is_skip_vints(InStream *is, register int cnt)
256
- {
257
- for (; cnt > 0; cnt--) {
258
- while ((is_read_byte(is) & 0x80) != 0) {
259
- }
260
- }
261
- }
262
-
263
- inline void
264
- is_read_chars(InStream *is, char* buffer, int off, int len)
265
- {
266
- int end, i;
267
-
268
- end = off + len;
269
-
270
- for(i = off; i < end; i++) {
271
- buffer[i] = is_read_byte(is);
272
- }
273
- }
274
-
275
- char *
276
- is_read_string(InStream *is)
277
- {
278
- register int length = (int)is_read_vint(is);
279
- char *str = ALLOC_N(char, length + 1);
280
- str[length] = '\0';
281
-
282
- if (is->buf.pos > (is->buf.len - length)) {
283
- register int i;
284
- for(i = 0; i < length; i++) {
285
- str[i] = is_read_byte(is);
286
- }
287
- } else { // unchecked
288
- memcpy(str, is->buf.buf + is->buf.pos, length);
289
- is->buf.pos += length;
290
- }
291
- //is_read_chars(is, str, 0, length);
292
-
293
- return str;
294
- }
295
-
296
- void
297
- os_write_int(OutStream *os, int l)
298
- {
299
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
300
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
301
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
302
- os_write_byte(os, (uchar)(l & 0xFF));
303
- }
304
-
305
- void
306
- os_write_long(OutStream *os, llong l)
307
- {
308
- os_write_byte(os, (uchar)((l >> 56) & 0xFF));
309
- os_write_byte(os, (uchar)((l >> 48) & 0xFF));
310
- os_write_byte(os, (uchar)((l >> 40) & 0xFF));
311
- os_write_byte(os, (uchar)((l >> 32) & 0xFF));
312
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
313
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
314
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
315
- os_write_byte(os, (uchar)(l & 0xFF));
316
- }
317
-
318
- void
319
- os_write_uint(OutStream *os, unsigned int l)
320
- {
321
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
322
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
323
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
324
- os_write_byte(os, (uchar)(l & 0xFF));
325
- }
326
-
327
- void
328
- os_write_ulong(OutStream *os, ullong l)
329
- {
330
- os_write_byte(os, (uchar)((l >> 56) & 0xFF));
331
- os_write_byte(os, (uchar)((l >> 48) & 0xFF));
332
- os_write_byte(os, (uchar)((l >> 40) & 0xFF));
333
- os_write_byte(os, (uchar)((l >> 32) & 0xFF));
334
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
335
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
336
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
337
- os_write_byte(os, (uchar)(l & 0xFF));
338
- }
339
-
340
- /* optimized to use an unchecked write if there is space */
341
- inline void
342
- os_write_vint(OutStream *os, register ullong i)
343
- {
344
- if (os->buf.pos > VINT_END) {
345
- while (i > 127) {
346
- os_write_byte(os, (uchar)((i & 0x7f) | 0x80));
347
- i >>= 7;
348
- }
349
- os_write_byte(os, (uchar)(i));
350
- } else {
351
- while (i > 127) {
352
- write_byte(os, (uchar)((i & 0x7f) | 0x80));
353
- i >>= 7;
354
- }
355
- write_byte(os, (uchar)(i));
356
- }
357
- }
358
-
359
- void
360
- os_write_chars(OutStream *os, char *buf, int start, int length)
361
- {
362
- int i;
363
-
364
- for (i = start; i < start + length; i++) {
365
- os_write_byte(os, buf[i]);
366
- }
367
- }
368
-
369
- void
370
- os_write_string(OutStream *os, char *str)
371
- {
372
- int len = (int)strlen(str);
373
- os_write_vint(os, len);
374
-
375
- os_write_chars(os, str, 0, len);
376
- }
377
-
378
- int file_is_lock(char *filename)
379
- {
380
- int start = (int)strlen(filename) - 4;
381
- return ((start > 0) && (strcmp(".lck", &filename[start]) == 0));
382
- }
data/ext/index_rw.c DELETED
@@ -1,2658 +0,0 @@
1
- #include "index.h"
2
- #include <stdlib.h>
3
- #include <string.h>
4
- #include <array.h>
5
- static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
6
- static char * const WRITE_LOCK_ERROR_MSG = "Could not obtain write lock when trying to write index";
7
- static char * const COMMIT_LOCK_ERROR_MSG = "Could not obtain commit lock when trying to write index";
8
- static char * const DELETED_DOC_ERROR_MSG = "Tried to get doc that has already been deleted";
9
- static char * const INVALID_FIELD_TYPE_MSG = "Invalid field-type";
10
- static char * const DOC_ORDER_ERROR_MSG = "docs out of order curent";
11
- static char * const STALE_READER_ERROR_MSG = "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations";
12
-
13
- const char *INDEX_EXTENSIONS[] = {
14
- "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
15
- "tvx", "tvd", "tvf", "tvp"
16
- };
17
-
18
- const char *COMPOUND_EXTENSIONS[] = {
19
- "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
20
- };
21
-
22
- const char *VECTOR_EXTENSIONS[] = {
23
- "tvx", "tvd", "tvf"
24
- };
25
-
26
- FerretConfig config = {
27
- 10, /* default merge_factor */
28
- 10, /* default min_merge_docs */
29
- INT_MAX, /* default max_merge_docs */
30
- 10000, /* default max_field_length */
31
- 128 /* default term_index_interval */
32
- };
33
-
34
- /***************************************************************************
35
- *
36
- * CacheObject
37
- *
38
- ***************************************************************************/
39
-
40
- unsigned int co_hash(const void *key)
41
- {
42
- return (unsigned int)key;
43
- }
44
-
45
- int co_eq(const void *key1, const void *key2)
46
- {
47
- return (key1 == key2);
48
- }
49
-
50
- void co_destroy(CacheObject *self)
51
- {
52
- h_rem(self->ref_tab1, self->ref2, false);
53
- h_rem(self->ref_tab2, self->ref1, false);
54
- self->destroy(self->obj);
55
- free(self);
56
- }
57
-
58
- CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
59
- void *ref1, void *ref2, free_ft destroy, void *obj)
60
- {
61
- CacheObject *self = ALLOC(CacheObject);
62
- h_set(ref_tab1, ref2, self);
63
- h_set(ref_tab2, ref1, self);
64
- self->ref_tab1 = ref_tab1;
65
- self->ref_tab2 = ref_tab2;
66
- self->ref1 = ref1;
67
- self->ref2 = ref2;
68
- self->destroy = destroy;
69
- self->obj = obj;
70
- return self;
71
- }
72
-
73
- HshTable *co_hsh_create()
74
- {
75
- return h_new(&co_hash, &co_eq, (free_ft)NULL, (free_ft)&co_destroy);
76
- }
77
-
78
- /***************************************************************************
79
- *
80
- * Posting
81
- *
82
- ***************************************************************************/
83
-
84
- Posting *p_create(Term *term, int position, TVOffsetInfo *offset)
85
- {
86
- Posting *self = ALLOC(Posting);
87
- self->freq = 1;
88
- self->size = 1;
89
- self->term = term;
90
- self->positions = ALLOC(int);
91
- self->positions[0] = position;
92
- self->offsets = ALLOC(TVOffsetInfo *);
93
- self->offsets[0] = offset;
94
- return self;
95
- }
96
-
97
- void p_destroy(Posting *self)
98
- {
99
- /* the positions and offsets will be put in a TVTerm so no need to free */
100
- int i;
101
- free(self->positions);
102
- for (i = 0; i < self->freq; i++)
103
- tvoi_destroy(self->offsets[i]);
104
- free(self->offsets);
105
- free(self);
106
- }
107
-
108
- void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset)
109
- {
110
- if (self->freq >= self->size) {
111
- self->size *= 2;
112
- REALLOC_N(self->positions, int, self->size);
113
- REALLOC_N(self->offsets, TVOffsetInfo *, self->size);
114
- }
115
- self->positions[self->freq] = position;
116
- self->offsets[self->freq] = offset;
117
- self->freq++;
118
- }
119
-
120
- inline int p_cmp(const void *const p1, const void *const p2)
121
- {
122
- Term *t1 = (*(Posting **)p1)->term;
123
- Term *t2 = (*(Posting **)p2)->term;
124
- int res = strcmp(t1->field, t2->field);
125
- if (res != 0) {
126
- return res;
127
- } else {
128
- return strcmp(t1->text, t2->text);
129
- }
130
- }
131
-
132
- DocumentWriter *dw_open(Store *store,
133
- Analyzer *analyzer,
134
- Similarity *similarity,
135
- int max_field_length,
136
- int term_index_interval)
137
- {
138
- DocumentWriter *self = ALLOC(DocumentWriter);
139
- self->store = store;
140
- self->analyzer = analyzer;
141
- self->similarity = similarity;
142
- self->fis = NULL;
143
- self->postingtable = h_new(&term_hash, &term_eq,
144
- (free_ft)&term_destroy,
145
- (free_ft)&p_destroy);
146
- self->max_field_length = max_field_length;
147
- self->term_index_interval = term_index_interval;
148
- return self;
149
- }
150
-
151
- void dw_close(DocumentWriter *self)
152
- {
153
- if (self->fis) fis_destroy(self->fis);
154
- h_destroy(self->postingtable);
155
- free(self);
156
- }
157
-
158
- void dw_add_position(DocumentWriter *self, char *field, char *text,
159
- int position, TVOffsetInfo *offset)
160
- {
161
- Term termbuf = {field, text}, *term;
162
- Posting *p = (Posting *)h_get(self->postingtable, &termbuf);
163
-
164
- if (p) { /* word seen before */
165
- if (p->freq >= p->size) {
166
- /* double size of posting to make room for more posts. */
167
- p->size <<= 1;
168
- REALLOC_N(p->positions, int, p->size);
169
- p->offsets = REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
170
- }
171
- p->positions[p->freq] = position; /* add new position */
172
- p->offsets[p->freq] = offset; /* add new offset */
173
- p->freq++; /* update frequency */
174
- } else { /* word not seen before */
175
- term = term_create(field, text);
176
- h_set(self->postingtable, term, p_create(term, position, offset));
177
- }
178
- }
179
-
180
- void dw_invert_doc(DocumentWriter *self, Document *doc)
181
- {
182
- int i;
183
- int dfcnt = doc->dfcnt;
184
- char *field_name, *text;
185
- int field_number, length, position, offset, slen;
186
- TokenStream *stream;
187
- Token *token;
188
- FieldInfo *fi;
189
- char text_buf[MAX_WORD_SIZE];
190
- text_buf[MAX_WORD_SIZE - 1] = '\0';
191
-
192
- DocField **fields = doc->df_arr, *field;
193
- for (i = 0; i < dfcnt; i++) {
194
- field = fields[i];
195
- field_name = field->name;
196
- fi = ((FieldInfo *)ht_get(self->fis->by_name, field_name));
197
- field_number = fi->number;
198
-
199
- length = self->field_lengths[field_number];
200
- offset = self->field_offsets[field_number];
201
- position = self->field_positions[field_number];
202
-
203
- if (fi->is_indexed) {
204
- if (!field->is_tokenized) { /* un-tokenized field */
205
- text = field->data;
206
- slen = (int)strlen(text);
207
- if (slen >= MAX_WORD_SIZE) {
208
- slen = MAX_WORD_SIZE - 1;
209
- text = strncpy(text_buf, text, MAX_WORD_SIZE - 1);
210
- }
211
- if (fi->store_offset) {
212
- dw_add_position(self, field_name, text, position,
213
- tvoi_create(offset, offset+slen));
214
- } else {
215
- dw_add_position(self, field_name, text, position, NULL);
216
- }
217
- offset += slen;
218
- length++;
219
- } else {
220
-
221
- /* Tokenize field and add to posting_table */
222
- stream = a_get_ts(self->analyzer, field_name, field->data);
223
-
224
- while ((token = ts_next(stream)) != NULL) {
225
- position += (token->pos_inc - 1);
226
-
227
- if (fi->store_offset) {
228
- dw_add_position(self,
229
- field_name,
230
- token->text,
231
- position,
232
- tvoi_create(offset + token->start, offset + token->end));
233
- position++;
234
- } else {
235
- dw_add_position(self, field_name, token->text, position, NULL);
236
- position++;
237
- }
238
-
239
- length++;
240
- /* stop if we reach the max field length */
241
- if (length > self->max_field_length) {
242
- break;
243
- }
244
- }
245
-
246
- if (token) {
247
- offset += token->end + 1;
248
- }
249
- }
250
- self->field_lengths[field_number] = length;
251
- self->field_offsets[field_number] = offset;
252
- self->field_positions[field_number] = position;
253
- self->field_boosts[field_number] *= field->boost;
254
- }
255
- }
256
- }
257
-
258
- Posting **dw_sort_posting_table(DocumentWriter *self)
259
- {
260
- HshTable *ht = self->postingtable;
261
- HshEntry *he = ht->table;
262
- Posting **postings;
263
- int i;
264
-
265
- self->pcnt = i = ht->used;
266
- postings = ALLOC_N(Posting *, i);
267
-
268
- while (i > 0) {
269
- if (he->value != NULL) {
270
- i--;
271
- postings[i] = (Posting *)he->value;
272
- }
273
- he++;
274
- }
275
- qsort(postings, self->pcnt, sizeof(Posting *), &p_cmp);
276
- return postings;
277
- }
278
-
279
- void dw_write_postings(DocumentWriter *self, Posting **postings, char *segment)
280
- {
281
- OutStream * volatile freq_out = NULL, * volatile prox_out = NULL;
282
- TermInfosWriter * volatile tiw = NULL;
283
- TermVectorsWriter * volatile tvw = NULL;
284
- Store *store = self->store;
285
- TermInfo * volatile ti = NULL;
286
- Posting *posting;
287
- int i, j, posting_freq, position, last_position;
288
- char fname[SEGMENT_NAME_MAX_LENGTH], *curr_field = NULL, *term_field;
289
- strcpy(fname, segment);
290
-
291
- TRY
292
- /* open files for inverse index storage */
293
- sprintf(fname, "%s.frq", segment);
294
- freq_out = store->create_output(store, fname);
295
- sprintf(fname, "%s.prx", segment);
296
- prox_out = store->create_output(store, fname);
297
- tiw = tiw_open(store, segment, self->fis, self->term_index_interval);
298
- ti = ti_create(0, 0, 0, 0);
299
-
300
- for (i = 0; i < self->pcnt; i++) {
301
- posting = postings[i];
302
-
303
- /* add an entry to dictionary with pointers to prox and freq_out files */
304
- ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
305
- tiw_add(tiw, posting->term, ti);
306
-
307
- /* add an entry to the freq_out file */
308
- posting_freq = posting->freq;
309
- if (posting_freq == 1) { /* optimize freq=1 */
310
- os_write_vint(freq_out, 1); /* set low bit of doc num */
311
- } else {
312
- os_write_vint(freq_out, 0); /* the doc number */
313
- os_write_vint(freq_out, posting_freq); /* frequency in doc */
314
- }
315
-
316
- last_position = 0; /* write positions */
317
-
318
- for (j = 0; j < posting_freq; j++) {
319
- position = posting->positions[j];
320
- os_write_vint(prox_out, position - last_position);
321
- last_position = position;
322
- }
323
-
324
- /* check to see if we switched to a new field */
325
- term_field = posting->term->field;
326
- if (curr_field != term_field) {
327
- FieldInfo *fi;
328
- /* changing field - see if there is something to save */
329
- curr_field = term_field;
330
- fi = (FieldInfo *)ht_get(self->fis->by_name, curr_field);
331
- if (fi->store_tv) {
332
- if (tvw == NULL) {
333
- tvw = tvw_open(store, segment, self->fis);
334
- tvw_open_doc(tvw);
335
- }
336
- tvw_open_field(tvw, curr_field);
337
-
338
- } else if (tvw != NULL) {
339
- tvw_close_field(tvw);
340
- }
341
- }
342
- /* tvw->curr_field != NULL implies field is still open */
343
- if (tvw != NULL && tvw->curr_field != NULL) {
344
- tvw_add_term(tvw, posting->term->text, posting_freq, posting->positions, posting->offsets);
345
- }
346
- }
347
- XFINALLY
348
- if (tvw) {
349
- tvw_close_doc(tvw);
350
- tvw_close(tvw);
351
- }
352
- /* make an effort to close all streams we can but remember and re-raise
353
- * the last exception encountered in this process */
354
- if (freq_out) os_close(freq_out);
355
- if (prox_out) os_close(prox_out);
356
- if (tiw) tiw_close(tiw);
357
- if (ti) ti_destroy(ti);
358
- XENDTRY
359
- }
360
-
361
- void dw_write_norms(DocumentWriter *self, char *segment)
362
- {
363
- int i;
364
- float norm;
365
- OutStream *norms_out;
366
- char fname[SEGMENT_NAME_MAX_LENGTH];
367
- FieldInfos *fis = self->fis;
368
- FieldInfo *fi;
369
-
370
- for (i = 0; i < fis->fcnt; i++) {
371
- fi = fis->by_number[i];
372
-
373
- if (fi->is_indexed && !fi->omit_norms) {
374
- norm = self->field_boosts[i] *
375
- sim_length_norm(self->similarity, fi->name, self->field_lengths[i]);
376
- sprintf(fname, "%s.f%d", segment, i);
377
- norms_out = self->store->create_output(self->store, fname);
378
- TRY
379
- os_write_byte(norms_out, sim_encode_norm(self->similarity, norm));
380
- XFINALLY
381
- os_close(norms_out);
382
- XENDTRY
383
- }
384
- }
385
- }
386
-
387
- void dw_add_doc(DocumentWriter *self, char *segment, Document *doc)
388
- {
389
- Posting **postings;
390
- FieldsWriter *fw;
391
- int i;
392
-
393
- /* write field names */
394
- self->fis = fis_create();
395
- fis_add_doc(self->fis, doc);
396
- fis_write(self->fis, self->store, segment, ".fnm");
397
-
398
- /* write field values */
399
- fw = fw_open(self->store, segment, self->fis);
400
- TRY
401
- fw_add_doc(fw, doc);
402
- XFINALLY
403
- fw_close(fw);
404
- XENDTRY
405
-
406
- /* invert doc into posting_table */
407
-
408
- h_clear(self->postingtable); /* clear posting_table */
409
-
410
- self->field_boosts = ALLOC_N(float, self->fis->fcnt);
411
- self->field_lengths = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
412
- self->field_offsets = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
413
- self->field_positions = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
414
-
415
- for (i = 0; i < self->fis->fcnt; i++) {
416
- self->field_boosts[i] = doc->boost;
417
- }
418
-
419
- dw_invert_doc(self, doc);
420
-
421
- /* sort posting_table into an array */
422
- postings = dw_sort_posting_table(self);
423
-
424
- /* write postings */
425
- dw_write_postings(self, postings, segment);
426
- free(postings);
427
-
428
- /* write norms of indexed fields */
429
- dw_write_norms(self, segment);
430
-
431
- free(self->field_boosts);
432
- free(self->field_lengths);
433
- free(self->field_offsets);
434
- free(self->field_positions);
435
- }
436
-
437
- /****************************************************************************
438
- *
439
- * SegmentInfo
440
- *
441
- ****************************************************************************/
442
-
443
- SegmentInfo *si_create(char *name, int doc_cnt, Store *store)
444
- {
445
- SegmentInfo *si = ALLOC(SegmentInfo);
446
- si->name = name;
447
- si->doc_cnt = doc_cnt;
448
- si->store = store;
449
- return si;
450
- }
451
-
452
- void si_destroy(SegmentInfo *si)
453
- {
454
- free(si->name);
455
- free(si);
456
- }
457
-
458
- bool si_has_deletions(SegmentInfo *si)
459
- {
460
- char del_file_name[SEGMENT_NAME_MAX_LENGTH];
461
- sprintf(del_file_name, "%s.del", si->name);
462
- return si->store->exists(si->store, del_file_name);
463
- }
464
-
465
- bool si_uses_compound_file(SegmentInfo *si)
466
- {
467
- char compound_file_name[SEGMENT_NAME_MAX_LENGTH];
468
- sprintf(compound_file_name, "%s.cfs", si->name);
469
- return si->store->exists(si->store, compound_file_name);
470
- }
471
-
472
- struct NormTester {
473
- bool has_norm_file;
474
- char *segment_name;
475
- };
476
- void is_norm_file(char *fname, void *arg)
477
- {
478
- struct NormTester *nt = (struct NormTester *)arg;
479
- char norm_file_pattern[SEGMENT_NAME_MAX_LENGTH];
480
- sprintf(norm_file_pattern, "%s.s", nt->segment_name);
481
- if (strncmp(fname, norm_file_pattern, strlen(norm_file_pattern)) == 0) {
482
- nt->has_norm_file = true;
483
- }
484
- }
485
-
486
- bool si_has_separate_norms(SegmentInfo *si)
487
- {
488
- struct NormTester nt;
489
- nt.segment_name = si->name;
490
- nt.has_norm_file = false;
491
- si->store->each(si->store, &is_norm_file, &nt);
492
-
493
- return nt.has_norm_file;
494
- }
495
-
496
-
497
- /****************************************************************************
498
- *
499
- * SegmentInfos
500
- *
501
- ****************************************************************************/
502
-
503
- #include <time.h>
504
- #define FORMAT -1
505
- #define SEGMENT_FILENAME "segments"
506
- #define TEMPORARY_SEGMENT_FILENAME "segments.new"
507
-
508
- SegmentInfos *sis_create()
509
- {
510
- SegmentInfos *sis = ALLOC(SegmentInfos);
511
- sis->format = FORMAT;
512
- sis->version = (unsigned int)time(NULL);
513
- sis->scnt = 0;
514
- sis->counter = 0;
515
- sis->size = 4;
516
- sis->segs = ALLOC_N(SegmentInfo *, sis->size);
517
- return sis;
518
- }
519
-
520
- void sis_destroy_not_infos(SegmentInfos *sis)
521
- {
522
- free(sis->segs);
523
- free(sis);
524
- }
525
-
526
- void sis_destroy(SegmentInfos *sis)
527
- {
528
- int i;
529
- for (i = 0; i < sis->scnt; i++)
530
- si_destroy(sis->segs[i]);
531
- free(sis->segs);
532
- free(sis);
533
- }
534
-
535
- void sis_add_si(SegmentInfos *sis, SegmentInfo *si)
536
- {
537
- if (sis->scnt >= sis->size) {
538
- sis->size = sis->scnt * 2;
539
- REALLOC_N(sis->segs, SegmentInfo *, sis->size);
540
- }
541
- sis->segs[sis->scnt] = si;
542
- sis->scnt++;
543
- }
544
-
545
- void sis_del_at(SegmentInfos *sis, int at)
546
- {
547
- int i;
548
- si_destroy(sis->segs[at]);
549
- sis->scnt--;
550
- for (i = at; i < sis->scnt; i++) {
551
- sis->segs[i] = sis->segs[i+1];
552
- }
553
- }
554
-
555
- void sis_del_from_to(SegmentInfos *sis, int from, int to)
556
- {
557
- int i, num_to_del = to - from;
558
- sis->scnt -= num_to_del;
559
- for (i = from; i < to; i++) {
560
- si_destroy(sis->segs[i]);
561
- }
562
- for (i = from; i < sis->scnt; i++) {
563
- sis->segs[i] = sis->segs[i+num_to_del];
564
- }
565
- }
566
-
567
- void sis_clear(SegmentInfos *sis)
568
- {
569
- int i;
570
- for (i = 0; i < sis->scnt; i++) {
571
- si_destroy(sis->segs[i]);
572
- }
573
- sis->scnt = 0;
574
- }
575
-
576
- void sis_read(SegmentInfos *sis, Store *store)
577
- {
578
- int doc_cnt;
579
- int seg_count;
580
- int i;
581
- char *name;
582
- InStream *is = store->open_input(store, SEGMENT_FILENAME);
583
-
584
- TRY
585
-
586
- sis->format = is_read_int(is);
587
- if (sis->format < 0) { /* file contains explicit format info */
588
- /* check that it is a format we can understand */
589
- if (sis->format < FORMAT)
590
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
591
- sis->version = (uint)is_read_long(is);
592
- sis->counter = (int)is_read_int(is);
593
- } else { /* file is in old format without explicit format info */
594
- sis->counter = sis->format;
595
- }
596
-
597
- seg_count = is_read_int(is);
598
- for (i = 0; i < seg_count; i++) {
599
- name = is_read_string(is);
600
- doc_cnt = is_read_int(is);
601
- sis_add_si(sis, si_create(name, doc_cnt, store));
602
- }
603
-
604
- if (sis->format >= 0) {
605
- /* in old format the version number may be at the end of the file */
606
- if (is_pos(is) >= is_length(is)) {
607
- sis->version = 0; /* old file format without version number */
608
- } else {
609
- sis->version = (int)is_read_long(is); /* read version */
610
- }
611
- }
612
- XFINALLY
613
- is_close(is);
614
- XENDTRY
615
- }
616
-
617
- void sis_write(SegmentInfos *sis, Store *store)
618
- {
619
- int i;
620
- SegmentInfo *si;
621
- OutStream *os = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);
622
- TRY
623
- os_write_int(os, FORMAT);
624
- os_write_long(os, ++(sis->version)); /* every write changes the index */
625
- os_write_int(os, sis->counter);
626
- os_write_int(os, sis->scnt);
627
- for (i = 0; i < sis->scnt; i++) {
628
- si = sis->segs[i];
629
- os_write_string(os, si->name);
630
- os_write_int(os, si->doc_cnt);
631
- }
632
-
633
- XFINALLY
634
- os_close(os);
635
- XENDTRY
636
-
637
- /* install new segment info */
638
- store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
639
- }
640
-
641
- int sis_read_current_version(Store *store)
642
- {
643
- InStream *is;
644
- SegmentInfos *sis;
645
- int format = 0;
646
- int version = 0;
647
-
648
- if (!store->exists(store, SEGMENT_FILENAME))
649
- return 0;
650
- is = store->open_input(store, SEGMENT_FILENAME);
651
-
652
- TRY
653
- format = is_read_int(is);
654
- if (format < 0) {
655
- if (format < FORMAT)
656
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
657
- version = (int)is_read_long(is);
658
- }
659
- XFINALLY
660
- is_close(is);
661
- XENDTRY
662
-
663
- if (format < 0)
664
- return version;
665
-
666
- /* We cannot be sure about the format of the file.
667
- * Therefore we have to read the whole file and cannot simply
668
- * seek to the version entry. */
669
-
670
- sis = sis_create();
671
- sis_read(sis, store);
672
- version = sis->version;
673
- sis_destroy(sis);
674
- return version;
675
- }
676
-
677
- /****************************************************************************
678
- *
679
- * IndexWriter
680
- *
681
- ****************************************************************************/
682
-
683
- /**
684
- * Deletes the analyzer by default but leaves the store by default
685
- */
686
- IndexWriter *iw_open(Store *store, Analyzer *analyzer, bool create)
687
- {
688
- IndexWriter *iw = ALLOC(IndexWriter);
689
- if (create)
690
- store->clear_all(store);
691
- mutex_init(&iw->mutex, NULL);
692
- iw->merge_factor = config.merge_factor;
693
- iw->min_merge_docs = config.min_merge_docs;
694
- iw->max_merge_docs = config.max_merge_docs;
695
- iw->max_field_length = config.max_field_length;
696
- iw->term_index_interval = config.term_index_interval;
697
- iw->use_compound_file = true;
698
- iw->store = store;
699
- ref(store);
700
- iw->analyzer = analyzer;
701
- iw->sis = sis_create();
702
- iw->similarity = sim_create_default();
703
- iw->ram_store = open_ram_store();
704
-
705
- mutex_lock(&store->mutex);
706
- /* keep the write_lock obtained until the IndexWriter is closed. */
707
- iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
708
- if (!iw->write_lock->obtain(iw->write_lock)) {
709
- RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
710
- }
711
-
712
- if (create) {
713
- Lock *commit_lock = store->open_lock(store, COMMIT_LOCK_NAME);
714
- if (!commit_lock->obtain(commit_lock)) {
715
- store->close_lock(commit_lock);
716
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
717
- }
718
- TRY
719
- /* commit the index */
720
- store->clear(store);
721
- sis_write(iw->sis, store);
722
- XFINALLY
723
- commit_lock->release(commit_lock);
724
- store->close_lock(commit_lock);
725
- XENDTRY
726
- } else {
727
- sis_read(iw->sis, store);
728
- }
729
- mutex_unlock(&store->mutex);
730
- return iw;
731
- }
732
-
733
- const char base36_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
734
-
735
- char *new_segment_name(int counter)
736
- {
737
- char buf[SEGMENT_NAME_MAX_LENGTH];
738
- int i;
739
-
740
- buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
741
- for (i = SEGMENT_NAME_MAX_LENGTH - 2; ; i--) {
742
- buf[i] = base36_digitmap[counter%36];
743
- counter /= 36;
744
- if (counter == 0) break;
745
- }
746
- i--;
747
- buf[i] = '_';
748
- return estrdup(&buf[i]);
749
- }
750
-
751
- int iw_doc_count(IndexWriter *iw)
752
- {
753
- int i, doc_cnt = 0;
754
- mutex_lock(&iw->mutex);
755
- for (i = 0; i < iw->sis->scnt; i++)
756
- doc_cnt += iw->sis->segs[i]->doc_cnt;
757
- mutex_unlock(&iw->mutex);
758
- return doc_cnt;
759
- }
760
-
761
- void delete_files(Array *file_names, Store *store)
762
- {
763
- int i;
764
- for (i = 0; i < file_names->size; i++) {
765
- store->remove(store, (char *)file_names->elems[i]);
766
- }
767
- ary_destroy(file_names);
768
- }
769
-
770
-
771
- Array *sr_file_names(IndexReader *ir);
772
- void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_cnt)
773
- {
774
- /* The java version keeps a record of files that it couldn't delete. This
775
- * shouldn't be a problem on linux I hope. */
776
- IndexReader *ir;
777
- int i;
778
- for (i = 0; i < del_cnt; i++) {
779
- ir = segment_readers[i];
780
- delete_files(sr_file_names(ir), ir->store);
781
- }
782
- }
783
-
784
- void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merger)
785
- {
786
- Array *files_to_delete;
787
- Lock *commit_lock;
788
- char merged_tmp[SEGMENT_NAME_MAX_LENGTH], merged_cfs[SEGMENT_NAME_MAX_LENGTH];
789
-
790
- mutex_lock(&iw->store->mutex);
791
- sprintf(merged_tmp, "%s.tmp", merged_name);
792
- sprintf(merged_cfs, "%s.cfs", merged_name);
793
-
794
- files_to_delete = sm_create_compound_file(merger, merged_tmp);
795
- commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
796
-
797
- if (!commit_lock->obtain(commit_lock)) {
798
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
799
- }
800
-
801
- /* make compound file visible for SegmentReaders */
802
- iw->store->rename(iw->store, merged_tmp, merged_cfs);
803
-
804
- /* delete now unused files of segment */
805
- delete_files(files_to_delete, iw->store);
806
-
807
- commit_lock->release(commit_lock);
808
- iw->store->close_lock(commit_lock);
809
- mutex_unlock(&iw->store->mutex);
810
- }
811
-
812
- void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segment)
813
- {
814
- int i;
815
- int merged_doc_count;
816
- Lock *commit_lock;
817
- IndexReader **segments_to_delete = ALLOC_N(IndexReader *, max_segment - min_segment);
818
- int del_cnt = 0;
819
-
820
- char *merged_name = new_segment_name(iw->sis->counter++);
821
-
822
- SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
823
- IndexReader *reader;
824
-
825
-
826
- for (i = min_segment; i < max_segment; i++) {
827
- reader = sr_open(iw->sis, i, false);
828
- sm_add(merger, reader);
829
- if ((reader->store == iw->store) || /* if we own the directory */
830
- (reader->store == iw->ram_store)) {
831
- segments_to_delete[del_cnt++] = reader; /* queue segment for deletion */
832
- }
833
- }
834
-
835
- merged_doc_count = sm_merge(merger);
836
-
837
- sis_del_from_to(iw->sis, min_segment, max_segment);
838
-
839
- sis_add_si(iw->sis, si_create(merged_name, merged_doc_count, iw->store));
840
-
841
- /* close readers before we attempt to delete now-obsolete segments */
842
-
843
- mutex_lock(&iw->store->mutex);
844
- commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
845
- if (!commit_lock->obtain(commit_lock)) {
846
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
847
- }
848
- /* commit the index */
849
- sis_write(iw->sis, iw->store);
850
- iw_delete_segments(iw, segments_to_delete, del_cnt);
851
-
852
- commit_lock->release(commit_lock);
853
- iw->store->close_lock(commit_lock);
854
- mutex_unlock(&iw->store->mutex);
855
-
856
- if (iw->use_compound_file) {
857
- make_compound_file(iw, merged_name, merger);
858
- }
859
-
860
- free(segments_to_delete);
861
- sm_destroy(merger);
862
- }
863
-
864
- void iw_merge_segments(IndexWriter *iw, int min_segment)
865
- {
866
- iw_merge_segments_with_max(iw, min_segment, iw->sis->scnt);
867
- }
868
-
869
- void iw_maybe_merge_segments(IndexWriter *iw)
870
- {
871
- int target_merge_docs = iw->min_merge_docs;
872
- int min_segment, merge_docs;
873
- SegmentInfo *si;
874
-
875
- while (target_merge_docs <= iw->max_merge_docs) {
876
- /* find segments smaller than current target size */
877
- min_segment = iw->sis->scnt - 1;
878
- merge_docs = 0;
879
- while (min_segment >= 0) {
880
- si = iw->sis->segs[min_segment];
881
- if (si->doc_cnt >= target_merge_docs) {
882
- break;
883
- }
884
- merge_docs += si->doc_cnt;
885
- min_segment -= 1;
886
- }
887
-
888
- if (merge_docs >= target_merge_docs) { /* found a merge to do */
889
- iw_merge_segments(iw, min_segment + 1);
890
- } else {
891
- break;
892
- }
893
-
894
- target_merge_docs *= iw->merge_factor; /* increase target size */
895
- }
896
- }
897
-
898
- void iw_flush_ram_segments(IndexWriter *iw)
899
- {
900
- int min_segment = iw->sis->scnt-1;
901
- int doc_count = 0;
902
- SegmentInfo **segs = iw->sis->segs;
903
- while ((min_segment >= 0) &&
904
- (segs[min_segment]->store == iw->ram_store)) {
905
- doc_count += segs[min_segment]->doc_cnt;
906
- min_segment--;
907
- }
908
- /* the following if statement is actually incrementing for different
909
- * reasons. If min_segment < 0 then we must increment as we searched
910
- * off the end. If the top segment is not ram_store there are no
911
- * ram segments to flush so we increment so the next check will return
912
- * us from this function. Lastly, the min_segment stopped at a segment
913
- * that wasn't the ram segment. But if it fit's in with the merge
914
- * factor, why not merge it. Otherwise we leave it and increment min_seg
915
- */
916
- if ((min_segment < 0) || /* add one FS segment? */
917
- ((doc_count + segs[min_segment]->doc_cnt) > iw->merge_factor) ||
918
- (segs[iw->sis->scnt - 1]->store != iw->ram_store)) {
919
- min_segment++;
920
- }
921
- if (min_segment >= iw->sis->scnt) {
922
- return;
923
- }
924
- iw_merge_segments(iw, min_segment);
925
- }
926
-
927
- void iw_add_doc(IndexWriter *iw, Document *doc)
928
- {
929
- DocumentWriter *dw;
930
- char *segment_name;
931
-
932
- mutex_lock(&iw->mutex);
933
- dw = dw_open(iw->ram_store,
934
- iw->analyzer,
935
- iw->similarity,
936
- iw->max_field_length,
937
- iw->term_index_interval);
938
- segment_name = new_segment_name(iw->sis->counter++);
939
- dw_add_doc(dw, segment_name, doc);
940
- dw_close(dw);
941
- sis_add_si(iw->sis, si_create(segment_name, 1, iw->ram_store));
942
- iw_maybe_merge_segments(iw);
943
- mutex_unlock(&iw->mutex);
944
- }
945
-
946
- static inline void iw_optimize_internal(IndexWriter *iw)
947
- {
948
- int min_segment;
949
- iw_flush_ram_segments(iw);
950
- while (iw->sis->scnt > 1 ||
951
- (iw->sis->scnt == 1 &&
952
- ( si_has_deletions(iw->sis->segs[0]) ||
953
- (iw->sis->segs[0]->store != iw->store) ||
954
- (iw->use_compound_file &&
955
- (!si_uses_compound_file(iw->sis->segs[0]) ||
956
- si_has_separate_norms(iw->sis->segs[0])))))) {
957
- min_segment = iw->sis->scnt - iw->merge_factor;
958
- iw_merge_segments(iw, min_segment < 0 ? 0 : min_segment);
959
- }
960
- }
961
- void iw_optimize(IndexWriter *iw)
962
- {
963
- mutex_lock(&iw->mutex);
964
- iw_optimize_internal(iw);
965
- mutex_unlock(&iw->mutex);
966
- }
967
-
968
- void iw_close(IndexWriter *iw)
969
- {
970
- mutex_lock(&iw->mutex);
971
- iw_flush_ram_segments(iw);
972
- store_deref(iw->ram_store);
973
- sis_destroy(iw->sis);
974
-
975
- sim_destroy(iw->similarity);
976
- a_deref(iw->analyzer);
977
-
978
- iw->write_lock->release(iw->write_lock);
979
- iw->store->close_lock(iw->write_lock);
980
-
981
- store_deref(iw->store);
982
- mutex_destroy(&iw->mutex);
983
- free(iw);
984
- }
985
-
986
- void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
987
- {
988
- int i, j, end, start;
989
-
990
- mutex_lock(&iw->mutex);
991
- iw_optimize_internal(iw); /* start with zero or 1 seg */
992
-
993
- start = iw->sis->scnt;
994
-
995
- for (i = 0; i < cnt; i++) {
996
- Store *store = stores[i];
997
- SegmentInfos *sis = sis_create(); /* read infos from dir */
998
- sis_read(sis, store);
999
-
1000
- for (j = 0; j < sis->scnt; j++) {
1001
- SegmentInfo *si = sis->segs[j];
1002
- sis_add_si(iw->sis, si);
1003
- }
1004
- sis_destroy_not_infos(sis);
1005
- }
1006
-
1007
- /* merge newly added segments in log(n) passes */
1008
- while (iw->sis->scnt > start + iw->merge_factor) {
1009
- for (i = start + 1; i < iw->sis->scnt; i++) {
1010
- end = MIN(iw->sis->scnt, i + iw->merge_factor);
1011
- if (end - i > 1) {
1012
- iw_merge_segments_with_max(iw, i, end);
1013
- }
1014
- }
1015
- }
1016
-
1017
- /* final cleanup */
1018
- iw_optimize_internal(iw);
1019
- mutex_unlock(&iw->mutex);
1020
- }
1021
-
1022
-
1023
- /**
1024
- * This adds an array of readers to the index leaving the added readers open.
1025
- */
1026
- void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
1027
- {
1028
- IndexReader *ir = NULL;
1029
- int i, del_cnt = 0;
1030
- int doc_count;
1031
- char *merged_name;
1032
- SegmentMerger *merger;
1033
- Lock *commit_lock;
1034
-
1035
- mutex_lock(&iw->mutex);
1036
- iw_optimize_internal(iw); /* start with zero or 1 seg */
1037
-
1038
- merged_name = new_segment_name(iw->sis->counter++);
1039
-
1040
- merger = sm_create(iw->store, merged_name, iw->term_index_interval);
1041
- merger->readers->free_elem = NULL; /* don't close readers */
1042
-
1043
- if (iw->sis->scnt == 1) { /* add existing index, if any */
1044
- ir = sr_open_si(iw->sis->segs[0]);
1045
- sm_add(merger, ir);
1046
- del_cnt = 1;
1047
- }
1048
-
1049
- for (i = 0; i < cnt; i++) {
1050
- sm_add(merger, irs[i]);
1051
- }
1052
-
1053
- doc_count = sm_merge(merger); /* merge 'em */
1054
-
1055
- /* pop old infos and add new ones. */
1056
- sis_clear(iw->sis);
1057
- sis_add_si(iw->sis, si_create(merged_name, doc_count, iw->store));
1058
-
1059
-
1060
- commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
1061
- if (!commit_lock->obtain(commit_lock)) { /* obtain write lock */
1062
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
1063
- }
1064
-
1065
- sis_write(iw->sis, iw->store); /* commit changes */
1066
- iw_delete_segments(iw, &ir, del_cnt);
1067
- if (ir) ir_close(ir);
1068
-
1069
- commit_lock->release(commit_lock);
1070
- iw->store->close_lock(commit_lock);
1071
-
1072
- if (iw->use_compound_file) {
1073
- make_compound_file(iw, merged_name, merger);
1074
- }
1075
-
1076
- iw_optimize_internal(iw);
1077
- sm_destroy(merger);
1078
-
1079
- mutex_unlock(&iw->mutex);
1080
- }
1081
-
1082
- /****************************************************************************
1083
- *
1084
- * Norm
1085
- *
1086
- ****************************************************************************/
1087
-
1088
- Norm *norm_create(InStream *is, int field_num)
1089
- {
1090
- Norm *norm = ALLOC(Norm);
1091
- norm->is = is;
1092
- norm->field_num = field_num;
1093
- norm->bytes = NULL;
1094
- norm->is_dirty = false;
1095
- return norm;
1096
- }
1097
-
1098
- void norm_destroy(Norm *norm)
1099
- {
1100
- is_close(norm->is);
1101
- if (norm->bytes != NULL) {
1102
- free(norm->bytes);
1103
- }
1104
- free(norm);
1105
- }
1106
-
1107
- void norm_rewrite(Norm *norm, Store *store, char *segment,
1108
- int doc_count, Store *cfs_store)
1109
- {
1110
- OutStream *os;
1111
- char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
1112
- char norm_fname[SEGMENT_NAME_MAX_LENGTH];
1113
-
1114
- if (norm->bytes == NULL) {
1115
- return; /* These norms do not need to be rewritten */
1116
- }
1117
-
1118
- sprintf(tmp_fname, "%s.tmp", segment);
1119
- os = store->create_output(store, tmp_fname);
1120
- TRY
1121
- os_write_bytes(os, norm->bytes, doc_count);
1122
- XFINALLY
1123
- os_close(os);
1124
- XENDTRY
1125
- if (cfs_store) {
1126
- sprintf(norm_fname, "%s.s%d", segment, norm->field_num);
1127
- } else {
1128
- sprintf(norm_fname, "%s.f%d", segment, norm->field_num);
1129
- }
1130
- store->rename(store, tmp_fname, norm_fname);
1131
- norm->is_dirty = false;
1132
- }
1133
-
1134
- /****************************************************************************
1135
- *
1136
- * SegmentReader
1137
- *
1138
- ****************************************************************************/
1139
-
1140
- #define GET_SR SegmentReader *sr = (SegmentReader *)ir->data
1141
-
1142
- int sr_max_doc(IndexReader *ir)
1143
- {
1144
- return ((SegmentReader *)ir->data)->fr->len;
1145
- }
1146
-
1147
- static inline void sr_close_norms(SegmentReader *sr)
1148
- {
1149
- h_destroy(sr->norms);
1150
- }
1151
-
1152
- static inline TermVectorsReader *sr_tvr(SegmentReader *sr)
1153
- {
1154
- TermVectorsReader *tvr;
1155
- if ((tvr = thread_getspecific(sr->thread_tvr)) == NULL) {
1156
- tvr = tvr_clone(sr->orig_tvr);
1157
- if (tvr == NULL) printf("scuk\n");
1158
- ary_append(sr->tvr_bucket, tvr);
1159
- thread_setspecific(sr->thread_tvr, tvr);
1160
- }
1161
- return tvr;
1162
- }
1163
-
1164
- void sr_close(IndexReader *ir)
1165
- {
1166
- GET_SR;
1167
- fr_close(sr->fr);
1168
- tir_close(sr->tir);
1169
-
1170
- if (sr->freq_in) is_close(sr->freq_in);
1171
- if (sr->prox_in) is_close(sr->prox_in);
1172
-
1173
- fis_destroy(sr->fis);
1174
- sr_close_norms(sr);
1175
-
1176
- if (sr->orig_tvr) {
1177
- tvr_close(sr->orig_tvr);
1178
- thread_key_delete(sr->thread_tvr);
1179
- ary_destroy(sr->tvr_bucket);
1180
- }
1181
- if (sr->deleted_docs) bv_destroy(sr->deleted_docs);
1182
- if (sr->cfs_store) store_deref(sr->cfs_store);
1183
- if (sr->fake_norms) free(sr->fake_norms);
1184
- free(sr->segment);
1185
- free(sr);
1186
- }
1187
-
1188
- void sr_delete_doc(IndexReader *ir, int doc_num)
1189
- {
1190
- GET_SR;
1191
- if (sr->deleted_docs == NULL)
1192
- sr->deleted_docs = bv_create();
1193
-
1194
- sr->deleted_docs_dirty = true;
1195
- sr->undelete_all = false;
1196
- bv_set(sr->deleted_docs, doc_num);
1197
- }
1198
-
1199
- static inline bool sr_is_deleted_internal(IndexReader *ir, int doc_num)
1200
- {
1201
- GET_SR;
1202
- return (sr->deleted_docs != NULL && bv_get(sr->deleted_docs, doc_num));
1203
- }
1204
-
1205
- bool sr_is_deleted(IndexReader *ir, int doc_num)
1206
- {
1207
- bool is_del;
1208
-
1209
- mutex_lock(&ir->mutex);
1210
- is_del = sr_is_deleted_internal(ir, doc_num);
1211
- mutex_unlock(&ir->mutex);
1212
-
1213
- return is_del;
1214
- }
1215
-
1216
- bool sr_has_norms(IndexReader *ir, char *field)
1217
- {
1218
- GET_SR;
1219
- bool has_norms;
1220
- mutex_lock(&ir->mutex);
1221
- has_norms = h_has_key(sr->norms, field);
1222
- mutex_unlock(&ir->mutex);
1223
-
1224
- return has_norms;
1225
- }
1226
-
1227
- bool sr_has_deletions(IndexReader *ir)
1228
- {
1229
- GET_SR;
1230
- return (sr->deleted_docs != NULL);
1231
- }
1232
-
1233
- void sr_undelete_all(IndexReader *ir)
1234
- {
1235
- GET_SR;
1236
- sr->undelete_all = true;
1237
- sr->deleted_docs_dirty = false;
1238
- if (sr->deleted_docs != NULL) bv_destroy(sr->deleted_docs);
1239
- sr->deleted_docs = NULL;
1240
- }
1241
-
1242
- TermEnum *sr_terms(IndexReader *ir)
1243
- {
1244
- TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
1245
- return te->clone(te);
1246
- }
1247
-
1248
- TermEnum *sr_terms_from(IndexReader *ir, Term *term)
1249
- {
1250
- TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
1251
- TermEnum *ret_te = te->clone(te);
1252
- te_skip_to(ret_te, term);
1253
- return ret_te;
1254
- }
1255
-
1256
- Document *sr_get_doc(IndexReader *ir, int doc_num)
1257
- {
1258
- GET_SR;
1259
- Document *doc;
1260
- mutex_lock(&ir->mutex);
1261
- if (sr_is_deleted_internal(ir, doc_num)) {
1262
- mutex_unlock(&ir->mutex);
1263
- RAISE(STATE_ERROR, DELETED_DOC_ERROR_MSG);
1264
- }
1265
- doc = fr_get_doc(sr->fr, doc_num);
1266
- mutex_unlock(&ir->mutex);
1267
- return doc;
1268
- }
1269
-
1270
- static inline void
1271
- sr_get_norms_into_internal(IndexReader *ir, char *field, uchar *buf, int offset)
1272
- {
1273
- GET_SR;
1274
- Norm *norm = h_get(sr->norms, field);
1275
- if (norm == NULL) {
1276
- memset(buf + offset*sizeof(uchar), 0, sr_max_doc(ir)*sizeof(uchar));
1277
- } else if (norm->bytes != NULL) { /* can copy from cache */
1278
- memcpy(buf + offset*sizeof(uchar), norm->bytes, sr_max_doc(ir)*sizeof(uchar));
1279
- } else {
1280
- InStream *norm_in = is_clone(norm->is);
1281
- /* read from disk */
1282
- is_seek(norm_in, 0);
1283
- is_read_bytes(norm_in, buf, offset, sr_max_doc(ir));
1284
- is_close(norm_in);
1285
- }
1286
- }
1287
-
1288
- void sr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
1289
- {
1290
- mutex_lock(&ir->mutex);
1291
- sr_get_norms_into_internal(ir, field, buf, offset);
1292
- mutex_unlock(&ir->mutex);
1293
- }
1294
-
1295
- static inline uchar *sr_get_norms_internal(IndexReader *ir, char *field)
1296
- {
1297
- GET_SR;
1298
- Norm *norm = h_get(sr->norms, field);
1299
- if (norm == NULL) { /* not an indexed field */
1300
- return NULL;
1301
- }
1302
-
1303
- if (norm->bytes == NULL) { /* value not yet read */
1304
- uchar *bytes = ALLOC_N(uchar, ir->max_doc(ir));
1305
- sr_get_norms_into_internal(ir, field, bytes, 0);
1306
- norm->bytes = bytes; /* cache it */
1307
- }
1308
- return norm->bytes;
1309
- }
1310
-
1311
- uchar *sr_get_norms(IndexReader *ir, char *field)
1312
- {
1313
- uchar *norms;
1314
- mutex_lock(&ir->mutex);
1315
- norms = sr_get_norms_internal(ir, field);
1316
- mutex_unlock(&ir->mutex);
1317
- return norms;
1318
- }
1319
-
1320
- static inline uchar *sr_get_norms_always(IndexReader *ir, char *field)
1321
- {
1322
- GET_SR;
1323
- uchar *bytes;
1324
- mutex_lock(&ir->mutex);
1325
-
1326
- bytes = sr_get_norms_internal(ir, field);
1327
- if (bytes == NULL) {
1328
- if (sr->fake_norms) {
1329
- bytes = sr->fake_norms;
1330
- } else {
1331
- int len = ir->max_doc(ir);
1332
- sr->fake_norms = bytes = ALLOC_N(uchar, len);
1333
- memset(bytes, 0, len);
1334
- }
1335
- }
1336
- mutex_unlock(&ir->mutex);
1337
- return bytes;
1338
- }
1339
-
1340
- void sr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
1341
- {
1342
- GET_SR;
1343
- Norm *norm;
1344
-
1345
- norm = h_get(sr->norms, field);
1346
- if (norm != NULL) { /* an indexed field */
1347
- norm->is_dirty = true; /* mark it dirty */
1348
- sr->norms_dirty = true;
1349
-
1350
- sr_get_norms_internal(ir, field)[doc_num] = val;
1351
- }
1352
- }
1353
-
1354
- int sr_doc_freq(IndexReader *ir, Term *t)
1355
- {
1356
- GET_SR;
1357
- TermInfo *ti = tir_get_ti(sr->tir, t);
1358
- if (ti != NULL) {
1359
- int df = ti->doc_freq;
1360
- ti_destroy(ti);
1361
- return df;
1362
- } else {
1363
- return 0;
1364
- }
1365
- }
1366
-
1367
- Array *sr_file_names(IndexReader *ir)
1368
- {
1369
- GET_SR;
1370
- Array *file_names = ary_create(0, &free);
1371
- FieldInfo *fi;
1372
- int i;
1373
- char fname[SEGMENT_NAME_MAX_LENGTH];
1374
-
1375
- for (i = 0; i < NELEMS(INDEX_EXTENSIONS); i++) {
1376
- sprintf(fname, "%s.%s", sr->segment, INDEX_EXTENSIONS[i]);
1377
- if (ir->store->exists(ir->store, fname))
1378
- ary_append(file_names, estrdup(fname));
1379
- }
1380
-
1381
- for (i = 0; i < sr->fis->fcnt; i++) {
1382
- fi = sr->fis->by_number[i];
1383
- if (fi->is_indexed && !fi->omit_norms) {
1384
- if (sr->cfs_store) {
1385
- sprintf(fname, "%s.s%d", sr->segment, i);
1386
- } else {
1387
- sprintf(fname, "%s.f%d", sr->segment, i);
1388
- }
1389
- if (ir->store->exists(ir->store, fname))
1390
- ary_append(file_names, estrdup(fname));
1391
- }
1392
- }
1393
- return file_names;
1394
- }
1395
-
1396
- HashSet *sr_get_field_names(IndexReader *ir, int field_type)
1397
- {
1398
- GET_SR;
1399
- int i;
1400
- HashSet *field_set = hs_str_create(NULL);
1401
- FieldInfo *fi;
1402
- for (i = 0; i < sr->fis->fcnt; i++) {
1403
- fi = sr->fis->by_number[i];
1404
- switch(field_type) {
1405
- case IR_ALL:
1406
- hs_add(field_set, fi->name);
1407
- break;
1408
- case IR_UNINDEXED:
1409
- if (!fi->is_indexed) hs_add(field_set, fi->name);
1410
- break;
1411
- case IR_INDEXED:
1412
- if (fi->is_indexed) hs_add(field_set, fi->name);
1413
- break;
1414
- case IR_INDEXED_NO_TERM_VECTOR:
1415
- if (fi->is_indexed && !fi->store_tv) hs_add(field_set, fi->name);
1416
- break;
1417
- case IR_TERM_VECTOR:
1418
- if (fi->store_tv && !fi->store_pos && !fi->store_offset)
1419
- hs_add(field_set, fi->name);
1420
- break;
1421
- case IR_INDEXED_WITH_TERM_VECTOR:
1422
- if (fi->is_indexed && fi->store_tv) hs_add(field_set, fi->name);
1423
- break;
1424
- case IR_TERM_VECTOR_WITH_POSITION:
1425
- if (fi->store_pos && !fi->store_offset) hs_add(field_set, fi->name);
1426
- break;
1427
- case IR_TERM_VECTOR_WITH_OFFSET:
1428
- if (!fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1429
- case IR_TERM_VECTOR_WITH_POSITION_OFFSET:
1430
- if (fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1431
- break;
1432
- default:
1433
- RAISE(ARG_ERROR, INVALID_FIELD_TYPE_MSG);
1434
- }
1435
- }
1436
- return field_set;
1437
- }
1438
-
1439
- int sr_num_docs(IndexReader *ir)
1440
- {
1441
- GET_SR;
1442
- int num_docs;
1443
-
1444
- mutex_lock(&ir->mutex);
1445
- num_docs = sr_max_doc(ir);
1446
- if (sr->deleted_docs != NULL)
1447
- num_docs -= sr->deleted_docs->count;
1448
- mutex_unlock(&ir->mutex);
1449
- return num_docs;
1450
- }
1451
-
1452
- TermDocEnum *sr_term_docs(IndexReader *ir)
1453
- {
1454
- return stde_create(ir);
1455
- }
1456
-
1457
- TermDocEnum *sr_term_positions(IndexReader *ir)
1458
- {
1459
- return stpe_create(ir);
1460
- }
1461
-
1462
- void sr_open_norms(IndexReader *ir, Store *cfs_store)
1463
- {
1464
- GET_SR;
1465
- int i;
1466
- FieldInfo *fi;
1467
- Store *tmp_store;
1468
- char fname[SEGMENT_NAME_MAX_LENGTH];
1469
- for (i = 0; i < sr->fis->fcnt; i++) {
1470
- tmp_store = ir->store;
1471
- fi = sr->fis->by_number[i];
1472
- if (fi->is_indexed && !fi->omit_norms) {
1473
- sprintf(fname, "%s.s%d", sr->segment, fi->number);
1474
- if (! tmp_store->exists(tmp_store, fname)) {
1475
- sprintf(fname, "%s.f%d", sr->segment, fi->number);
1476
- tmp_store = cfs_store;
1477
- }
1478
- h_set(sr->norms, fi->name,
1479
- norm_create(tmp_store->open_input(tmp_store, fname), fi->number));
1480
- }
1481
- }
1482
- sr->norms_dirty = false;
1483
- }
1484
-
1485
- TermVector *sr_get_term_vector(IndexReader *ir, int doc_num, char *field)
1486
- {
1487
- GET_SR;
1488
- FieldInfo *fi = (FieldInfo *)ht_get(sr->fis->by_name, field);
1489
- TermVectorsReader *tvr;
1490
-
1491
- if (fi == NULL || !fi->store_tv || !sr->orig_tvr || !(tvr = sr_tvr(sr))) {
1492
- return NULL;
1493
- }
1494
-
1495
- return tvr_get_field_tv(tvr, doc_num, field);
1496
- }
1497
-
1498
- Array *sr_get_term_vectors(IndexReader *ir, int doc_num)
1499
- {
1500
- GET_SR;
1501
- TermVectorsReader *tvr;
1502
- if (sr->orig_tvr == NULL || (tvr = sr_tvr(sr)) == NULL) {
1503
- return NULL;
1504
- }
1505
-
1506
- return tvr_get_tv(tvr, doc_num);
1507
- }
1508
-
1509
- void sr_commit(IndexReader *ir)
1510
- {
1511
- GET_SR;
1512
- char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
1513
- char del_fname[SEGMENT_NAME_MAX_LENGTH];
1514
-
1515
- sprintf(del_fname, "%s.del", sr->segment);
1516
-
1517
- if (sr->deleted_docs_dirty) { /* re-write deleted */
1518
- sprintf(tmp_fname, "%s.tmp", sr->segment);
1519
- bv_write(sr->deleted_docs, ir->store, tmp_fname);
1520
- ir->store->rename(ir->store, tmp_fname, del_fname);
1521
- }
1522
- if (sr->undelete_all && ir->store->exists(ir->store, del_fname))
1523
- ir->store->remove(ir->store, del_fname);
1524
- if (sr->norms_dirty) {/* re-write norms */
1525
- int i;
1526
- FieldInfo *fi;
1527
- for (i = 0; i < sr->fis->fcnt; i++) {
1528
- fi = sr->fis->by_number[i];
1529
- if (fi->is_indexed) {
1530
- norm_rewrite((Norm *)h_get(sr->norms, fi->name), ir->store,
1531
- sr->segment, sr_max_doc(ir), sr->cfs_store);
1532
- }
1533
- }
1534
- }
1535
- sr->deleted_docs_dirty = false;
1536
- sr->norms_dirty = false;
1537
- sr->undelete_all = false;
1538
- }
1539
-
1540
- IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
1541
- {
1542
- Store *store = si->store;
1543
- SegmentReader *sr = ALLOC(SegmentReader);
1544
- char fname[SEGMENT_NAME_MAX_LENGTH];
1545
-
1546
- ir->get_term_vector = &sr_get_term_vector;
1547
- ir->get_term_vectors = &sr_get_term_vectors;
1548
- ir->num_docs = &sr_num_docs;
1549
- ir->max_doc = &sr_max_doc;
1550
- ir->get_doc = &sr_get_doc;
1551
- ir->get_norms_into = &sr_get_norms_into;
1552
- ir->get_norms = &sr_get_norms;
1553
- ir->get_norms_always = &sr_get_norms_always;
1554
- ir->do_set_norm = &sr_set_norm;
1555
- ir->terms = &sr_terms;
1556
- ir->terms_from = &sr_terms_from;
1557
- ir->doc_freq = &sr_doc_freq;
1558
- ir->term_docs = &sr_term_docs;
1559
- ir->term_positions = &sr_term_positions;
1560
- ir->do_delete_doc = &sr_delete_doc;
1561
- ir->is_deleted = &sr_is_deleted;
1562
- ir->has_norms = &sr_has_norms;
1563
- ir->has_deletions = &sr_has_deletions;
1564
- ir->do_undelete_all = &sr_undelete_all;
1565
- ir->get_field_names = &sr_get_field_names;
1566
- ir->do_commit = &sr_commit;
1567
- ir->do_close = &sr_close;
1568
- ir->data = sr;
1569
- sr->segment = estrdup(si->name);
1570
- sr->cfs_store = NULL;
1571
- sr->fake_norms = NULL;
1572
- sprintf(fname, "%s.cfs", sr->segment);
1573
- if (store->exists(store, fname)) {
1574
- sr->cfs_store = open_cmpd_store(store, fname);
1575
- store = sr->cfs_store;
1576
- }
1577
-
1578
- sprintf(fname, "%s.fnm", sr->segment);
1579
-
1580
- sr->fis = fis_open(store, fname);
1581
- sr->fr = fr_open(store, sr->segment, sr->fis);
1582
-
1583
- sr->tir = tir_open(store, sr->segment, sr->fis);
1584
- sr->deleted_docs = NULL;
1585
- sr->deleted_docs_dirty = false;
1586
- sr->undelete_all = false;
1587
- if (si_has_deletions(si)) {
1588
- sprintf(fname, "%s.del", sr->segment);
1589
- sr->deleted_docs = bv_read(si->store, fname);
1590
- }
1591
-
1592
- sprintf(fname, "%s.frq", sr->segment);
1593
- sr->freq_in = store->open_input(store, fname);
1594
- sprintf(fname, "%s.prx", sr->segment);
1595
- sr->prox_in = store->open_input(store, fname);
1596
- sr->norms = h_new_str((free_ft)NULL, (free_ft)&norm_destroy);
1597
- sr_open_norms(ir, store);
1598
-
1599
- if (fis_has_vectors(sr->fis)) {
1600
- sr->orig_tvr = tvr_open(store, sr->segment, sr->fis);
1601
- thread_key_create(&sr->thread_tvr, NULL);
1602
- sr->tvr_bucket = ary_create(1, (free_ft)&tvr_close);
1603
- } else {
1604
- sr->orig_tvr = NULL;
1605
- }
1606
- return ir;
1607
- }
1608
-
1609
- IndexReader *sr_open_si(SegmentInfo *si)
1610
- {
1611
- IndexReader *ir = ir_create(si->store, NULL, false);
1612
- ref(si->store);
1613
- return sr_open_internal(ir, si);
1614
- }
1615
-
1616
- IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner)
1617
- {
1618
- SegmentInfo *si = sis->segs[si_num];
1619
- IndexReader *ir = ir_create(si->store, sis, is_owner);
1620
- ref(si->store);
1621
- return sr_open_internal(ir, si);
1622
- }
1623
-
1624
- /****************************************************************************
1625
- *
1626
- * MultiReader
1627
- *
1628
- ****************************************************************************/
1629
-
1630
- #define GET_MR MultiReader *mr = (MultiReader *)ir->data
1631
- #define GET_READER(doc_num) MultiReader *mr = (MultiReader *)ir->data;\
1632
- int i = mr_reader_index(mr, doc_num);\
1633
- IndexReader *reader = mr->sub_readers[i]
1634
-
1635
-
1636
-
1637
- int mr_reader_index(MultiReader *mr, int doc_num)
1638
- {
1639
- int lo = 0; /* search @starts array */
1640
- int hi = mr->rcnt - 1; /* for first element less */
1641
- int mid;
1642
- int mid_value;
1643
-
1644
- while (hi >= lo) {
1645
- mid = (lo + hi) >> 1;
1646
- mid_value = mr->starts[mid];
1647
- if (doc_num < mid_value) {
1648
- hi = mid - 1;
1649
- } else if (doc_num > mid_value) {
1650
- lo = mid + 1;
1651
- } else { /* found a match */
1652
- while ((mid+1 < mr->rcnt) && (mr->starts[mid+1] == mid_value))
1653
- mid += 1; /* scan to last match in case we have empty segments */
1654
- return mid;
1655
- }
1656
- }
1657
- return hi;
1658
- }
1659
-
1660
- TermVector *mr_get_term_vector(IndexReader *ir, int doc_num, char *field)
1661
- {
1662
- GET_READER(doc_num);
1663
- return reader->get_term_vector(reader, doc_num - mr->starts[i], field);
1664
- }
1665
-
1666
- Array *mr_get_term_vectors(IndexReader *ir, int doc_num)
1667
- {
1668
- GET_READER(doc_num);
1669
- return reader->get_term_vectors(reader, doc_num - mr->starts[i]);
1670
- }
1671
-
1672
- int mr_num_docs(IndexReader *ir)
1673
- {
1674
- int i, num_docs;
1675
- GET_MR;
1676
- mutex_lock(&ir->mutex);
1677
- if (mr->num_docs_cache == -1) {
1678
- IndexReader *reader;
1679
- mr->num_docs_cache = 0;
1680
- for (i = 0; i < mr->rcnt; i++) {
1681
- reader = mr->sub_readers[i];
1682
- mr->num_docs_cache += reader->num_docs(reader);
1683
- }
1684
- }
1685
- num_docs = mr->num_docs_cache;
1686
- mutex_unlock(&ir->mutex);
1687
-
1688
- return num_docs;
1689
- }
1690
-
1691
- int mr_max_doc(IndexReader *ir)
1692
- {
1693
- GET_MR;
1694
- return mr->max_doc;
1695
- }
1696
-
1697
- Document *mr_get_doc(IndexReader *ir, int doc_num)
1698
- {
1699
- GET_READER(doc_num);
1700
- return reader->get_doc(reader, doc_num - mr->starts[i]);
1701
- }
1702
-
1703
- void mr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
1704
- {
1705
- int i;
1706
- uchar *bytes;
1707
- GET_MR;
1708
-
1709
- mutex_lock(&ir->mutex);
1710
- bytes = h_get(mr->norms_cache, field);
1711
- if (bytes != NULL) {
1712
- memcpy(buf + offset, bytes, mr->max_doc);
1713
- } else {
1714
- IndexReader *reader;
1715
- for (i = 0; i < mr->rcnt; i++) {
1716
- reader = mr->sub_readers[i];
1717
- reader->get_norms_into(reader, field, buf, offset + mr->starts[i]);
1718
- }
1719
- }
1720
- mutex_unlock(&ir->mutex);
1721
- }
1722
-
1723
- uchar *mr_get_norms(IndexReader *ir, char *field)
1724
- {
1725
- int i;
1726
- GET_MR;
1727
- uchar *bytes;
1728
- IndexReader *reader;
1729
-
1730
- mutex_lock(&ir->mutex);
1731
- bytes = h_get(mr->norms_cache, field);
1732
- if (bytes == NULL) {
1733
- bytes = ALLOC_N(uchar, mr->max_doc);
1734
-
1735
- for (i = 0; i < mr->rcnt; i++) {
1736
- reader = mr->sub_readers[i];
1737
- reader->get_norms_into(reader, field, bytes, mr->starts[i]);
1738
- }
1739
- h_set(mr->norms_cache, field, bytes); /* update cache */
1740
- }
1741
- mutex_unlock(&ir->mutex);
1742
-
1743
- return bytes;
1744
- }
1745
-
1746
- void mr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
1747
- {
1748
- GET_READER(doc_num);
1749
- h_del(mr->norms_cache, field); /* clear cache */
1750
- ir_set_norm(reader, doc_num - mr->starts[i], field, val);
1751
- }
1752
-
1753
- TermEnum *mr_terms(IndexReader *ir)
1754
- {
1755
- GET_MR;
1756
- return mte_create(mr->sub_readers, mr->starts, mr->rcnt, NULL);
1757
- }
1758
-
1759
- TermEnum *mr_terms_from(IndexReader *ir, Term *term)
1760
- {
1761
- GET_MR;
1762
- return mte_create(mr->sub_readers, mr->starts, mr->rcnt, term);
1763
- }
1764
-
1765
- int mr_doc_freq(IndexReader *ir, Term *t)
1766
- {
1767
- int total = 0, i; /* sum freqs in segments */
1768
- GET_MR;
1769
-
1770
- IndexReader *reader;
1771
- for (i = 0; i < mr->rcnt; i++) {
1772
- reader = mr->sub_readers[i];
1773
- total += reader->doc_freq(reader, t);
1774
- }
1775
- return total;
1776
- }
1777
-
1778
- TermDocEnum *mr_term_docs(IndexReader *ir)
1779
- {
1780
- GET_MR;
1781
- return mtde_create(mr->sub_readers, mr->starts, mr->rcnt);
1782
- }
1783
-
1784
- TermDocEnum *mr_term_positions(IndexReader *ir)
1785
- {
1786
- GET_MR;
1787
- return mtpe_create(mr->sub_readers, mr->starts, mr->rcnt);
1788
- }
1789
-
1790
- void mr_delete_doc(IndexReader *ir, int doc_num)
1791
- {
1792
- GET_READER(doc_num);
1793
- mr->num_docs_cache = -1; /* invalidate cache */
1794
-
1795
- /* dispatch to segment reader */
1796
- reader->do_delete_doc(reader, doc_num - mr->starts[i]);
1797
- mr->has_deletions = true;
1798
- }
1799
-
1800
- bool mr_is_deleted(IndexReader *ir, int doc_num)
1801
- {
1802
- GET_READER(doc_num);
1803
- return reader->is_deleted(reader, doc_num - mr->starts[i]);
1804
- }
1805
-
1806
- bool mr_has_norms(IndexReader *ir, char *field)
1807
- {
1808
- bool has_norms = false;
1809
- int i;
1810
- GET_MR;
1811
-
1812
- IndexReader *reader;
1813
- for (i = 0; i < mr->rcnt; i++) {
1814
- reader = mr->sub_readers[i];
1815
- if (reader->has_norms(reader, field)) {
1816
- has_norms = true;
1817
- break;
1818
- }
1819
- }
1820
-
1821
- return has_norms;
1822
- }
1823
-
1824
- bool mr_has_deletions(IndexReader *ir)
1825
- {
1826
- GET_MR;
1827
- return mr->has_deletions;
1828
- }
1829
-
1830
- void mr_undelete_all(IndexReader *ir)
1831
- {
1832
- int i;
1833
- GET_MR;
1834
- IndexReader *reader;
1835
-
1836
- mr->num_docs_cache = -1; /* invalidate cache */
1837
- for (i = 0; i < mr->rcnt; i++) {
1838
- reader = mr->sub_readers[i];
1839
- reader->do_undelete_all(reader);
1840
- }
1841
- mr->has_deletions = false;
1842
- }
1843
-
1844
- HashSet *mr_get_field_names(IndexReader *ir, int field_type)
1845
- {
1846
- int i;
1847
- GET_MR;
1848
- HashSet *field_set = hs_str_create(NULL);
1849
- IndexReader *reader;
1850
- for (i = 0; i < mr->rcnt; i++) {
1851
- reader = mr->sub_readers[i];
1852
- hs_merge(field_set, reader->get_field_names(reader, field_type));
1853
- }
1854
- return field_set;
1855
- }
1856
-
1857
- void mr_commit(IndexReader *ir)
1858
- {
1859
- GET_MR;
1860
- int i;
1861
- IndexReader *reader;
1862
- for (i = 0; i < mr->rcnt; i++) {
1863
- reader = mr->sub_readers[i];
1864
- reader->do_commit(reader);
1865
- }
1866
- }
1867
-
1868
- void mr_close(IndexReader *ir)
1869
- {
1870
- GET_MR;
1871
- int i;
1872
- IndexReader *reader;
1873
- for (i = 0; i < mr->rcnt; i++) {
1874
- reader = mr->sub_readers[i];
1875
- ir_close(reader);
1876
- }
1877
- free(mr->sub_readers);
1878
- h_destroy(mr->norms_cache);
1879
- free(mr->starts);
1880
- free(mr);
1881
- }
1882
-
1883
- IndexReader *mr_open(Store *store,
1884
- SegmentInfos *sis,
1885
- IndexReader **sub_readers,
1886
- int rcnt)
1887
- {
1888
- int i;
1889
- MultiReader *mr = ALLOC(MultiReader);
1890
- IndexReader *sub_reader;
1891
- IndexReader *ir;
1892
- mr->sub_readers = sub_readers;
1893
- mr->rcnt = rcnt;
1894
-
1895
- mr->max_doc = 0;
1896
- mr->num_docs_cache = -1;
1897
- mr->has_deletions = false;
1898
-
1899
- mr->starts = ALLOC_N(int, (rcnt+1));
1900
- for (i = 0; i < rcnt; i++) {
1901
- sub_reader = sub_readers[i];
1902
- mr->starts[i] = mr->max_doc;
1903
- mr->max_doc += sub_reader->max_doc(sub_reader); /* compute max_docs */
1904
-
1905
- if (sub_reader->has_deletions(sub_reader)) {
1906
- mr->has_deletions = true;
1907
- }
1908
- }
1909
- mr->starts[rcnt] = mr->max_doc;
1910
- mr->norms_cache = h_new_str(NULL, &free);
1911
-
1912
- ir = ir_create(store, sis, true);
1913
- ir->get_term_vector = &mr_get_term_vector;
1914
- ir->get_term_vectors = &mr_get_term_vectors;
1915
- ir->num_docs = &mr_num_docs;
1916
- ir->max_doc = &mr_max_doc;
1917
- ir->get_doc = &mr_get_doc;
1918
- ir->get_norms_into = &mr_get_norms_into;
1919
- ir->get_norms = &mr_get_norms;
1920
- ir->get_norms_always = &mr_get_norms;
1921
- ir->do_set_norm = &mr_set_norm;
1922
- ir->terms = &mr_terms;
1923
- ir->terms_from = &mr_terms_from;
1924
- ir->doc_freq = &mr_doc_freq;
1925
- ir->term_docs = &mr_term_docs;
1926
- ir->term_positions = &mr_term_positions;
1927
- ir->do_delete_doc = &mr_delete_doc;
1928
- ir->is_deleted = &mr_is_deleted;
1929
- ir->has_norms = &mr_has_norms;
1930
- ir->has_deletions = &mr_has_deletions;
1931
- ir->do_undelete_all = &mr_undelete_all;
1932
- ir->get_field_names = &mr_get_field_names;
1933
- ir->do_commit = &mr_commit;
1934
- ir->do_close = &mr_close;
1935
- ir->data = mr;
1936
-
1937
- return ir;
1938
- }
1939
-
1940
- /****************************************************************************
1941
- *
1942
- * SegmentMergeInfo
1943
- *
1944
- ****************************************************************************/
1945
-
1946
- bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2)
1947
- {
1948
- int cmpres = tb_cmp(smi1->tb, smi2->tb);
1949
- if (cmpres == 0) {
1950
- return smi1->base < smi2->base;
1951
- } else {
1952
- return cmpres < 0;
1953
- }
1954
- }
1955
-
1956
- int *smi_load_doc_map(SegmentMergeInfo *smi)
1957
- {
1958
- IndexReader *ir = smi->ir;
1959
- if (ir->has_deletions(ir) && (smi->doc_map == NULL)) {
1960
- int max_doc = ir->max_doc(ir);
1961
- int j = 0, i;
1962
-
1963
- smi->doc_map = ALLOC_N(int, max_doc);
1964
- for (i = 0; i < max_doc; i++) {
1965
- if (ir->is_deleted(ir, i)) {
1966
- smi->doc_map[i] = -1;
1967
- } else {
1968
- smi->doc_map[i] = j++;
1969
- }
1970
- }
1971
- }
1972
- return smi->doc_map;
1973
- }
1974
-
1975
- SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir)
1976
- {
1977
- SegmentMergeInfo *smi = ALLOC(SegmentMergeInfo);
1978
- smi->base = base;
1979
- smi->ir = ir;
1980
- smi->te = te;
1981
- smi->tb = te->tb_curr;
1982
- smi->postings = ir->term_positions(ir);
1983
- smi->doc_map = NULL;
1984
- return smi;
1985
- }
1986
-
1987
- void smi_destroy(SegmentMergeInfo *smi)
1988
- {
1989
- smi->postings->close(smi->postings);
1990
- smi->te->close(smi->te);
1991
- if (smi->doc_map != NULL)
1992
- free(smi->doc_map);
1993
- free(smi);
1994
- }
1995
-
1996
- TermBuffer *smi_next(SegmentMergeInfo *smi)
1997
- {
1998
- return (smi->tb = smi->te->next(smi->te));
1999
- }
2000
-
2001
- /****************************************************************************
2002
- *
2003
- * SegmentMerger
2004
- *
2005
- ****************************************************************************/
2006
-
2007
- SegmentMerger *sm_create(Store *store, char *name, int term_index_interval)
2008
- {
2009
- SegmentMerger *sm = ALLOC(SegmentMerger);
2010
- sm->store = store;
2011
- sm->name = estrdup(name);
2012
- sm->readers = ary_create(config.merge_factor, (free_ft)&ir_close);
2013
- sm->fis = NULL;
2014
- sm->freq_out = NULL;
2015
- sm->prox_out = NULL;
2016
- sm->tiw = NULL;
2017
- sm->queue = NULL;
2018
- sm->ti = ti_create(0, 0, 0, 0);
2019
- sm->term_index_interval = term_index_interval;
2020
- sm->skip_buffer = ram_create_buffer();
2021
- sm->skip_interval = -1;
2022
- return sm;
2023
- }
2024
-
2025
- void sm_close(SegmentMerger *sm)
2026
- {
2027
- int i;
2028
- if (sm->freq_out != NULL) os_close(sm->freq_out);
2029
- if (sm->prox_out != NULL) os_close(sm->prox_out);
2030
- if (sm->tiw != NULL) {
2031
- for (i = 0; i < sm->terms_buf_size; i++) {
2032
- free(sm->terms_buf[i].text);
2033
- }
2034
- free(sm->terms_buf);
2035
- tiw_close(sm->tiw);
2036
- }
2037
- if (sm->queue != NULL) pq_destroy(sm->queue);
2038
- sm->freq_out = NULL;
2039
- sm->prox_out = NULL;
2040
- sm->tiw = NULL;
2041
- sm->queue = NULL;
2042
- }
2043
-
2044
- void sm_destroy(SegmentMerger *sm)
2045
- {
2046
- if (sm->fis != NULL) fis_destroy(sm->fis);
2047
- ary_destroy(sm->readers);
2048
- sm_close(sm);
2049
- free(sm->name);
2050
- ti_destroy(sm->ti);
2051
- ram_destroy_buffer(sm->skip_buffer);
2052
- free(sm);
2053
- }
2054
-
2055
- void sm_add(SegmentMerger *sm, IndexReader *ir)
2056
- {
2057
- ary_append(sm->readers, ir);
2058
- }
2059
-
2060
- static inline void sm_add_indexed(IndexReader *ir,
2061
- FieldInfos *fis,
2062
- HashSet *fields,
2063
- bool store_tv,
2064
- bool store_pos,
2065
- bool store_offset)
2066
- {
2067
- int i;
2068
- char *field;
2069
- for (i = 0; i < fields->size; i++) {
2070
- field = (char *)fields->elems[i];
2071
- fis_add(fis, field, true, store_tv, store_pos, store_offset,
2072
- !ir->has_norms(ir, field));
2073
- }
2074
- hs_destroy(fields);
2075
- }
2076
-
2077
- int sm_merge_fields(SegmentMerger *sm)
2078
- {
2079
- int i, j, maxdoc;
2080
- FieldInfos *fis = sm->fis = fis_create();
2081
- int doc_count = 0;
2082
- Document *doc;
2083
- FieldsWriter *fw;
2084
-
2085
- for (i = 0; i < sm->readers->size; i++) {
2086
- IndexReader *ir = sm->readers->elems[i];
2087
-
2088
- sm_add_indexed(ir, fis,
2089
- ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION_OFFSET),
2090
- true, true, true);
2091
- sm_add_indexed(ir, fis,
2092
- ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION),
2093
- true, true, false);
2094
- sm_add_indexed(ir, fis,
2095
- ir->get_field_names(ir, IR_TERM_VECTOR_WITH_OFFSET),
2096
- true, false, true);
2097
- sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_TERM_VECTOR),
2098
- true, false, false);
2099
- sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_INDEXED),
2100
- false, false, false);
2101
- fis_add_fields(fis, ir->get_field_names(ir, IR_UNINDEXED),
2102
- false, false, false, false, false);
2103
- }
2104
- fis_write(fis, sm->store, sm->name, ".fnm");
2105
-
2106
- /* merge field values */
2107
- fw = fw_open(sm->store, sm->name, fis);
2108
-
2109
- TRY
2110
- for (i = 0; i < sm->readers->size; i++) {
2111
- IndexReader *ir = sm->readers->elems[i];
2112
- maxdoc = ir->max_doc(ir);
2113
- for (j = 0; j < maxdoc; j++) {
2114
- if (!ir->is_deleted(ir, j)) { /* skip deleted docs */
2115
- doc = ir->get_doc(ir, j);
2116
- fw_add_doc(fw, doc);
2117
- doc_destroy(doc);
2118
- doc_count++;
2119
- }
2120
- }
2121
- }
2122
- XFINALLY
2123
- fw_close(fw);
2124
- XENDTRY
2125
- return doc_count;
2126
- }
2127
-
2128
- void sm_reset_skip(SegmentMerger *sm)
2129
- {
2130
- ramo_reset(sm->skip_buffer);
2131
- sm->last_skip_doc = 0;
2132
- sm->last_skip_freq_pointer = os_pos(sm->freq_out);
2133
- sm->last_skip_prox_pointer = os_pos(sm->prox_out);
2134
- }
2135
-
2136
- inline void sm_buffer_skip(SegmentMerger *sm, int doc)
2137
- {
2138
- int freq_pointer = os_pos(sm->freq_out);
2139
- int prox_pointer = os_pos(sm->prox_out);
2140
-
2141
- os_write_vint(sm->skip_buffer, doc - sm->last_skip_doc);
2142
- os_write_vint(sm->skip_buffer, freq_pointer - sm->last_skip_freq_pointer);
2143
- os_write_vint(sm->skip_buffer, prox_pointer - sm->last_skip_prox_pointer);
2144
-
2145
- sm->last_skip_doc = doc;
2146
- sm->last_skip_freq_pointer = freq_pointer;
2147
- sm->last_skip_prox_pointer = prox_pointer;
2148
- }
2149
-
2150
- int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2151
- {
2152
- int i, j;
2153
- int last_doc = 0, base, doc, doc_code, freq, last_position, position;
2154
- int *doc_map = NULL;
2155
- int df = 0; /* number of docs w/ term */
2156
- TermDocEnum *postings;
2157
- SegmentMergeInfo *smi;
2158
- sm_reset_skip(sm);
2159
- for (i = 0; i < cnt; i++) {
2160
- smi = smis[i];
2161
- postings = smi->postings;
2162
- base = smi->base;
2163
- doc_map = smi_load_doc_map(smi);
2164
-
2165
- stde_seek_ti(postings, smi->te->ti_curr);
2166
- while (postings->next(postings)) {
2167
- doc = postings->doc_num(postings);
2168
- if (doc_map != NULL) {
2169
- doc = doc_map[doc]; /* work around deletions */
2170
- }
2171
- doc += base; /* convert to merged space */
2172
-
2173
- if (doc < last_doc) {
2174
- RAISE(STATE_ERROR, DOC_ORDER_ERROR_MSG);
2175
- }
2176
-
2177
- df++;
2178
-
2179
- if ((df % sm->skip_interval) == 0) {
2180
- sm_buffer_skip(sm, last_doc);
2181
- }
2182
-
2183
- doc_code = (doc - last_doc) << 1; /* use low bit to flag freq=1 */
2184
- last_doc = doc;
2185
-
2186
- freq = postings->freq(postings);
2187
- if (freq == 1) {
2188
- os_write_vint(sm->freq_out, doc_code | 1); /* write doc & freq=1 */
2189
- } else {
2190
- os_write_vint(sm->freq_out, doc_code); /* write doc */
2191
- os_write_vint(sm->freq_out, freq); /* write freqency in doc */
2192
- }
2193
-
2194
-
2195
- last_position = 0; /* write position deltas */
2196
- for (j = 0; j < freq; j++) {
2197
- position = postings->next_position(postings);
2198
- os_write_vint(sm->prox_out, position - last_position);
2199
- last_position = position;
2200
- }
2201
- }
2202
- }
2203
- return df;
2204
- }
2205
-
2206
- int sm_write_skip(SegmentMerger *sm)
2207
- {
2208
- int skip_pointer = os_pos(sm->freq_out);
2209
- ramo_write_to(sm->skip_buffer, sm->freq_out);
2210
- return skip_pointer;
2211
- }
2212
-
2213
- Term *sm_tb_to_term(SegmentMerger *sm, TermBuffer *tb)
2214
- {
2215
- int index = sm->terms_buf_pointer % sm->terms_buf_size;
2216
- sm->terms_buf_pointer++;
2217
- sm->terms_buf[index].field = tb->field;
2218
- strcpy(sm->terms_buf[index].text, tb->text);
2219
- return &(sm->terms_buf[index]);
2220
- }
2221
-
2222
- void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2223
- {
2224
- int freq_pointer = os_pos(sm->freq_out);
2225
- int prox_pointer = os_pos(sm->prox_out);
2226
-
2227
- int df = sm_append_postings(sm, smis, cnt); /* append posting data */
2228
-
2229
- int skip_pointer = sm_write_skip(sm);
2230
-
2231
- if (df > 0) {
2232
- /* add an entry to the dictionary with pointers to prox and freq files */
2233
- ti_set(sm->ti, df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer));
2234
- tiw_add(sm->tiw, sm_tb_to_term(sm, smis[0]->tb), sm->ti);
2235
- }
2236
- }
2237
-
2238
- void sm_merge_term_infos(SegmentMerger *sm)
2239
- {
2240
- int base = 0;
2241
- int i, match_size;
2242
- IndexReader *ir;
2243
- TermEnum *te;
2244
- SegmentMergeInfo *smi, *top, **match;
2245
- TermBuffer *tb;
2246
-
2247
- for (i = 0; i < sm->readers->size; i++) {
2248
- ir = sm->readers->elems[i];
2249
- te = ir->terms(ir);
2250
- smi = smi_create(base, te, ir);
2251
- base += ir->num_docs(ir);
2252
- if (smi_next(smi) != NULL) {
2253
- pq_push(sm->queue, smi); /* initialize @queue */
2254
- } else {
2255
- smi_destroy(smi);
2256
- }
2257
- }
2258
-
2259
- match = ALLOC_N(SegmentMergeInfo *, sm->readers->size);
2260
-
2261
- while (sm->queue->count > 0) {
2262
- /*
2263
- for (i = 1; i <= sm->queue->count; i++) {
2264
- printf("<{%s:%s}>", ((SegmentMergeInfo *)sm->queue->heap[i])->tb->field,
2265
- ((SegmentMergeInfo *)sm->queue->heap[i])->tb->text);
2266
- }printf("\n\n");
2267
- */
2268
- match_size = 0; /* pop matching terms */
2269
- match[match_size] = pq_pop(sm->queue);
2270
- match_size++;
2271
- tb = match[0]->tb;
2272
- top = pq_top(sm->queue);
2273
- while ((top != NULL) && (tb_cmp(tb, top->tb) == 0)) {
2274
- match[match_size] = pq_pop(sm->queue);
2275
- match_size++;
2276
- top = pq_top(sm->queue);
2277
- }
2278
-
2279
- /* printf(">%s:%s<\n", match[0]->tb->field, match[0]->tb->text); */
2280
- sm_merge_term_info(sm, match, match_size); /* add new TermInfo */
2281
-
2282
- while (match_size > 0) {
2283
- match_size--;
2284
- smi = match[match_size];
2285
- if (smi_next(smi) != NULL) {
2286
- pq_push(sm->queue, smi); /* restore queue */
2287
- } else {
2288
- smi_destroy(smi); /* done with a segment */
2289
- }
2290
- }
2291
- }
2292
- free(match);
2293
- }
2294
-
2295
- void sm_merge_terms(SegmentMerger *sm)
2296
- {
2297
- int i;
2298
- char fname[SEGMENT_NAME_MAX_LENGTH];
2299
-
2300
- TRY
2301
- sprintf(fname, "%s.frq", sm->name);
2302
- sm->freq_out = sm->store->create_output(sm->store, fname);
2303
- sprintf(fname, "%s.prx", sm->name);
2304
- sm->prox_out = sm->store->create_output(sm->store, fname);
2305
- sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
2306
- /* terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
2307
- * to keep the last index_interval terms so that it can compare the last term
2308
- * put in the index with the next one. So the size of the buffer must by
2309
- * index_interval + 2. */
2310
- sm->terms_buf_pointer = 0;
2311
- sm->terms_buf_size = sm->tiw->index_interval + 2;
2312
- sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
2313
- for (i = 0; i < sm->terms_buf_size; i++) {
2314
- sm->terms_buf[i].field = NULL;
2315
- sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
2316
- }
2317
- sm->skip_interval = sm->tiw->skip_interval;
2318
- sm->queue = pq_create(sm->readers->size, (lt_ft)&smi_lt);
2319
-
2320
- sm_merge_term_infos(sm);
2321
-
2322
- XFINALLY
2323
- sm_close(sm);
2324
- XENDTRY
2325
- }
2326
-
2327
- void sm_merge_norms(SegmentMerger *sm)
2328
- {
2329
- int i, j, k, max_doc;
2330
- uchar *norm_buf;
2331
- FieldInfo *fi;
2332
- OutStream *os;
2333
- char fname[SEGMENT_NAME_MAX_LENGTH];
2334
- IndexReader *ir;
2335
- for (i = 0; i < sm->fis->fcnt; i++) {
2336
- fi = sm->fis->by_number[i];
2337
- if (fi->is_indexed && !fi->omit_norms) {
2338
- sprintf(fname, "%s.f%d", sm->name, i);
2339
- os = sm->store->create_output(sm->store, fname);
2340
- TRY
2341
- for (j = 0; j < sm->readers->size; j++) {
2342
- ir = sm->readers->elems[j];
2343
- max_doc = ir->max_doc(ir);
2344
- norm_buf = ALLOC_N(uchar, max_doc);
2345
- memset(norm_buf, 0, sizeof(uchar) * max_doc);
2346
- ir->get_norms_into(ir, fi->name, norm_buf, 0);
2347
- for (k = 0; k < max_doc; k++) {
2348
- if (!ir->is_deleted(ir, k)) {
2349
- os_write_byte(os, norm_buf[k]);
2350
- }
2351
- }
2352
- free(norm_buf);
2353
- }
2354
- XFINALLY
2355
- os_close(os);
2356
- XENDTRY
2357
- }
2358
- }
2359
- }
2360
-
2361
- void sm_merge_vectors(SegmentMerger *sm)
2362
- {
2363
- int i, j, max_doc;
2364
- TermVectorsWriter *tvw = tvw_open(sm->store, sm->name, sm->fis);
2365
- IndexReader *ir;
2366
- Array *tvs;
2367
- TRY
2368
- for (i = 0; i < sm->readers->size; i++) {
2369
- ir = sm->readers->elems[i];
2370
- max_doc = ir->max_doc(ir);
2371
- for (j = 0; j < max_doc; j++) {
2372
- /* skip deleted docs */
2373
- if (! ir->is_deleted(ir, j)) {
2374
- tvs = ir->get_term_vectors(ir, j);
2375
- if (tvs) {
2376
- tvw_add_all_doc_vectors(tvw, tvs);
2377
- ary_destroy(tvs);
2378
- }
2379
- }
2380
- }
2381
- }
2382
- XFINALLY
2383
- tvw_close(tvw);
2384
- XENDTRY
2385
- }
2386
-
2387
- int sm_merge(SegmentMerger *sm)
2388
- {
2389
- int doc_count = sm_merge_fields(sm);
2390
- sm_merge_terms(sm);
2391
- sm_merge_norms(sm);
2392
- if (fis_has_vectors(sm->fis))
2393
- sm_merge_vectors(sm);
2394
- return doc_count;
2395
- }
2396
-
2397
- Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
2398
- {
2399
- Array *files = ary_create(0, &free);
2400
- CompoundWriter *cw = open_cw(sm->store, file_name);
2401
- FieldInfo *fi;
2402
- char fname[SEGMENT_NAME_MAX_LENGTH];
2403
-
2404
- int i;
2405
- for (i = 0; i < NELEMS(COMPOUND_EXTENSIONS); i++) {
2406
- sprintf(fname, "%s.%s", sm->name, COMPOUND_EXTENSIONS[i]);
2407
- ary_append(files, estrdup(fname));
2408
- }
2409
-
2410
- /* Field norm files */
2411
- for (i = 0; i < sm->fis->fcnt; i++) {
2412
- fi = sm->fis->by_number[i];
2413
- if (fi->is_indexed && !fi->omit_norms) {
2414
- sprintf(fname, "%s.f%d", sm->name, i);
2415
- ary_append(files, estrdup(fname));
2416
- }
2417
- }
2418
-
2419
- /* Vector files */
2420
- if (fis_has_vectors(sm->fis)) {
2421
- for (i = 0; i < NELEMS(VECTOR_EXTENSIONS); i++) {
2422
- sprintf(fname, "%s.%s", sm->name, VECTOR_EXTENSIONS[i]);
2423
- ary_append(files, estrdup(fname));
2424
- }
2425
- }
2426
-
2427
- /* Now merge all added files */
2428
- for (i = 0; i < files->size; i++) {
2429
- cw_add_file(cw, (char *)files->elems[i]);
2430
- }
2431
-
2432
- /* Perform the merge */
2433
- cw_close(cw);
2434
-
2435
- return files;
2436
- }
2437
-
2438
- /****************************************************************************
2439
- *
2440
- * IndexReader
2441
- *
2442
- ****************************************************************************/
2443
-
2444
- void ir_acquire_not_necessary(IndexReader *ir) {}
2445
- void ir_acquire_write_lock(IndexReader *ir)
2446
- {
2447
- if (ir->is_stale)
2448
- RAISE(STATE_ERROR, STALE_READER_ERROR_MSG);
2449
-
2450
- if (ir->write_lock == NULL) {
2451
- ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
2452
- if (!ir->write_lock->obtain(ir->write_lock)) /* obtain write lock */
2453
- RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
2454
-
2455
- /* we have to check whether index has changed since this reader was opened.
2456
- * if so, this reader is no longer valid for deletion */
2457
- if (sis_read_current_version(ir->store) > ir->sis->version) {
2458
- ir->is_stale = true;
2459
- ir->write_lock->release(ir->write_lock);
2460
- ir->store->close_lock(ir->write_lock);
2461
- ir->write_lock = NULL;
2462
- RAISE(STATE_ERROR, STALE_READER_ERROR_MSG);
2463
- }
2464
- }
2465
- }
2466
-
2467
- IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner)
2468
- {
2469
- IndexReader *ir = ALLOC(IndexReader);
2470
-
2471
- mutex_init(&ir->mutex, NULL);
2472
- ir->is_owner = is_owner;
2473
- if (is_owner) {
2474
- ir->acquire_write_lock = &ir_acquire_write_lock;
2475
- } else {
2476
- ir->acquire_write_lock = &ir_acquire_not_necessary;
2477
- }
2478
-
2479
- ir->store = store;
2480
- ir->sis = sis;
2481
- ir->has_changes = false;
2482
- ir->is_stale = false;
2483
- ir->write_lock = NULL;
2484
- ir->cache = NULL;
2485
- ir->sort_cache = NULL;
2486
- return ir;
2487
- }
2488
-
2489
- /**
2490
- * Will keep a reference to the store. To let this method delete the store
2491
- * make sure you deref the store that you pass to it
2492
- */
2493
- IndexReader *ir_open(Store *store)
2494
- {
2495
- int i;
2496
- IndexReader *ir;
2497
- SegmentInfos *sis;
2498
-
2499
- mutex_lock(&store->mutex);
2500
- sis = sis_create();
2501
- sis_read(sis, store);
2502
- if (sis->scnt == 1) {
2503
- ir = sr_open(sis, 0, true);
2504
- } else {
2505
- IndexReader **readers = ALLOC_N(IndexReader *, sis->scnt);
2506
- for (i = 0; i < sis->scnt; i++) {
2507
- readers[i] = sr_open(sis, i, false);
2508
- }
2509
- ref(store);
2510
- ir = mr_open(store, sis, readers, sis->scnt);
2511
- }
2512
- mutex_unlock(&store->mutex);
2513
- return ir;
2514
- }
2515
-
2516
- bool ir_index_exists(Store *store)
2517
- {
2518
- return store->exists(store, "segments");
2519
- }
2520
-
2521
- void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
2522
- {
2523
- mutex_lock(&ir->mutex);
2524
- ir->acquire_write_lock(ir);
2525
- ir->do_set_norm(ir, doc_num, field, val);
2526
- ir->has_changes = true;
2527
- mutex_unlock(&ir->mutex);
2528
- }
2529
-
2530
- void ir_undelete_all(IndexReader *ir)
2531
- {
2532
- mutex_lock(&ir->mutex);
2533
- ir->acquire_write_lock(ir);
2534
- ir->do_undelete_all(ir);
2535
- ir->has_changes = true;
2536
- mutex_unlock(&ir->mutex);
2537
- }
2538
-
2539
- void ir_delete_doc(IndexReader *ir, int doc_num)
2540
- {
2541
- mutex_lock(&ir->mutex);
2542
- ir->acquire_write_lock(ir);
2543
- ir->do_delete_doc(ir, doc_num);
2544
- ir->has_changes = true;
2545
- mutex_unlock(&ir->mutex);
2546
- }
2547
-
2548
- Document *ir_get_doc_with_term(IndexReader *ir, Term *term)
2549
- {
2550
- TermDocEnum *tde = ir_term_docs_for(ir, term);
2551
- Document *doc = NULL;
2552
-
2553
- if (!tde) return NULL;
2554
-
2555
- if (tde->next(tde)) {
2556
- doc = ir->get_doc(ir, tde->doc_num(tde));
2557
- }
2558
- tde->close(tde);
2559
- return doc;
2560
- }
2561
-
2562
- TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term)
2563
- {
2564
- TermDocEnum *tde = ir->term_docs(ir);
2565
- tde->seek(tde, term);
2566
- return tde;
2567
- }
2568
-
2569
- TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term)
2570
- {
2571
- TermDocEnum *tde = ir->term_positions(ir);
2572
- tde->seek(tde, term);
2573
- return tde;
2574
- }
2575
-
2576
- void ir_commit_internal(IndexReader *ir)
2577
- {
2578
- if (ir->has_changes) {
2579
- if (ir->is_owner) {
2580
- Lock *commit_lock;
2581
-
2582
- mutex_lock(&ir->store->mutex);
2583
- commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
2584
- if (!commit_lock->obtain(commit_lock)) { /* obtain write lock */
2585
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
2586
- }
2587
-
2588
- ir->do_commit(ir);
2589
- sis_write(ir->sis, ir->store);
2590
-
2591
- commit_lock->release(commit_lock);
2592
- ir->store->close_lock(commit_lock);
2593
- mutex_unlock(&ir->store->mutex);
2594
-
2595
- if (ir->write_lock != NULL) {
2596
- ir->write_lock->release(ir->write_lock); /* release write lock */
2597
- ir->store->close_lock(ir->write_lock);
2598
- ir->write_lock = NULL;
2599
- }
2600
- } else {
2601
- ir->do_commit(ir);
2602
- }
2603
- ir->has_changes = false;
2604
- }
2605
- }
2606
-
2607
- void ir_commit(IndexReader *ir)
2608
- {
2609
- mutex_lock(&ir->mutex);
2610
- ir_commit_internal(ir);
2611
- mutex_unlock(&ir->mutex);
2612
- }
2613
-
2614
- void ir_close(IndexReader *ir)
2615
- {
2616
- mutex_lock(&ir->mutex);
2617
- ir_commit_internal(ir);
2618
- ir->do_close(ir);
2619
- store_deref(ir->store);
2620
- if (ir->is_owner) {
2621
- sis_destroy(ir->sis);
2622
- }
2623
- if (ir->cache) {
2624
- h_destroy(ir->cache);
2625
- }
2626
- if (ir->sort_cache) {
2627
- h_destroy(ir->sort_cache);
2628
- }
2629
-
2630
- mutex_destroy(&ir->mutex);
2631
- free(ir);
2632
- }
2633
-
2634
- /**
2635
- * Don't call this method if the cache already exists
2636
- **/
2637
- void ir_add_cache(IndexReader *ir)
2638
- {
2639
- ir->cache = co_hsh_create();
2640
- }
2641
-
2642
- bool ir_is_latest(IndexReader *ir)
2643
- {
2644
- bool is_latest = false;
2645
- Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
2646
- if (!commit_lock->obtain(commit_lock)) {
2647
- ir->store->close_lock(commit_lock);
2648
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
2649
- }
2650
- TRY
2651
- is_latest = (sis_read_current_version(ir->store) == ir->sis->version);
2652
- XFINALLY
2653
- commit_lock->release(commit_lock);
2654
- ir->store->close_lock(commit_lock);
2655
- XENDTRY
2656
- return is_latest;
2657
- }
2658
-