ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/index_io.c DELETED
@@ -1,382 +0,0 @@
1
- #include <store.h>
2
- #include <string.h>
3
-
4
- static char * const STORE_EOF_ERROR_MSG = "EOF Error when trying to refill";
5
-
6
- Buffer *buf_create()
7
- {
8
- Buffer *buf = ALLOC(Buffer);
9
- buf->start = 0;
10
- buf->pos = 0;
11
- buf->len = 0;
12
- return buf;
13
- }
14
-
15
- void buf_destroy(Buffer *buf)
16
- {
17
- free(buf);
18
- }
19
-
20
- OutStream *os_create()
21
- {
22
- OutStream *os = ALLOC(OutStream);
23
- os->buf.start = 0;
24
- os->buf.pos = 0;
25
- os->buf.len = 0;
26
- return os;
27
- }
28
-
29
- inline void os_flush(OutStream *os)
30
- {
31
- os->flush_internal(os, os->buf.buf, os->buf.pos);
32
- os->buf.start += os->buf.pos;
33
- os->buf.pos = 0;
34
- }
35
-
36
- void os_close(OutStream *os)
37
- {
38
- os_flush(os);
39
- os->close_internal(os);
40
- free(os);
41
- }
42
-
43
- int os_pos(OutStream *os)
44
- {
45
- return os->buf.start + os->buf.pos;
46
- }
47
-
48
- void os_seek(OutStream *os, int new_pos)
49
- {
50
- os_flush(os);
51
- os->buf.start = new_pos;
52
- os->seek_internal(os, new_pos);
53
- }
54
-
55
- #define write_byte(os, b) os->buf.buf[os->buf.pos++] = b
56
-
57
- inline void os_write_byte(OutStream *os, uchar b)
58
- {
59
- if (os->buf.pos >= BUFFER_SIZE) {
60
- os_flush(os);
61
- }
62
- write_byte(os, b);
63
- }
64
-
65
- void os_write_bytes(OutStream *os, uchar *b, int len)
66
- {
67
- if (os->buf.pos > 0) { /* flush buffer */
68
- os_flush(os);
69
- }
70
-
71
- if (len < BUFFER_SIZE) {
72
- os->flush_internal(os, b, len);
73
- os->buf.start += len;
74
- } else {
75
- int pos = 0;
76
- int size;
77
- while (pos < len) {
78
- if (len - pos < BUFFER_SIZE) {
79
- size = len - pos;
80
- } else {
81
- size = BUFFER_SIZE;
82
- }
83
- os->flush_internal(os, b + pos, size);
84
- pos += size;
85
- os->buf.start += size;
86
- }
87
- }
88
- }
89
-
90
- InStream *is_create()
91
- {
92
- InStream *is = ALLOC(InStream);
93
- is->buf.start = 0;
94
- is->buf.pos = 0;
95
- is->buf.len = 0;
96
- return is;
97
- }
98
-
99
- void is_refill(InStream *is)
100
- {
101
- int start = is->buf.start + is->buf.pos;
102
- int last = start + BUFFER_SIZE;
103
- int flen = is->length_internal(is);
104
- if (last > flen) { /* don't read past EOF */
105
- last = flen;
106
- }
107
-
108
- is->buf.len = last - start;
109
- if (is->buf.len <= 0) {
110
- RAISE(EOF_ERROR, STORE_EOF_ERROR_MSG);
111
- }
112
-
113
- is->read_internal(is, is->buf.buf, 0, is->buf.len);
114
-
115
- is->buf.start = start;
116
- is->buf.pos = 0;
117
- }
118
-
119
- #define read_byte(is) is->buf.buf[is->buf.pos++]
120
- inline uchar is_read_byte(InStream *is)
121
- {
122
- if (is->buf.pos >= is->buf.len) {
123
- is_refill(is);
124
- }
125
-
126
- return read_byte(is);
127
- }
128
-
129
- int is_pos(InStream *is)
130
- {
131
- return is->buf.start + is->buf.pos;
132
- }
133
-
134
- uchar *is_read_bytes(InStream *is, uchar *b, int offset, int len)
135
- {
136
- int i, start;
137
- if ((offset + len) < BUFFER_SIZE) {
138
- for (i = offset; i < offset + len; i++) {
139
- b[i] = is_read_byte(is);
140
- }
141
- } else { // read all-at-once
142
- start = is_pos(is);
143
- is->seek_internal(is, start);
144
- is->read_internal(is, b, offset, len);
145
-
146
- is->buf.start = start + len; // adjust stream variables
147
- is->buf.pos = 0;
148
- is->buf.len = 0; // trigger refill on read
149
- }
150
- return b;
151
- }
152
-
153
- void is_seek(InStream *is, int pos)
154
- {
155
- if (pos >= is->buf.start && pos < (is->buf.start + is->buf.len)) {
156
- is->buf.pos = pos - is->buf.start; // seek within buffer
157
- } else {
158
- is->buf.start = pos;
159
- is->buf.pos = 0;
160
- is->buf.len = 0; // trigger refill() on read()
161
- is->seek_internal(is, pos);
162
- }
163
- }
164
-
165
- void is_close(InStream *is)
166
- {
167
- is->close_internal(is);
168
- free(is);
169
- }
170
-
171
- InStream *is_clone(InStream *is)
172
- {
173
- InStream *new_index_i = ALLOC(InStream);
174
- memcpy(new_index_i, is, sizeof(InStream));
175
- new_index_i->is_clone = true;
176
- is->clone_internal(is, new_index_i);
177
- return new_index_i;
178
- }
179
-
180
- int
181
- is_read_int(InStream *is)
182
- {
183
- return ((int)is_read_byte(is) << 24) |
184
- ((int)is_read_byte(is) << 16) |
185
- ((int)is_read_byte(is) << 8) |
186
- (int)is_read_byte(is);
187
- }
188
-
189
- llong
190
- is_read_long(InStream *is)
191
- {
192
- return ((llong)is_read_byte(is) << 56) |
193
- ((llong)is_read_byte(is) << 48) |
194
- ((llong)is_read_byte(is) << 40) |
195
- ((llong)is_read_byte(is) << 32) |
196
- ((llong)is_read_byte(is) << 24) |
197
- ((llong)is_read_byte(is) << 16) |
198
- ((llong)is_read_byte(is) << 8) |
199
- (llong)is_read_byte(is);
200
- }
201
-
202
- unsigned int
203
- is_read_uint(InStream *is)
204
- {
205
- return ((unsigned int)is_read_byte(is) << 24) |
206
- ((unsigned int)is_read_byte(is) << 16) |
207
- ((unsigned int)is_read_byte(is) << 8) |
208
- (unsigned int)is_read_byte(is);
209
- }
210
-
211
- ullong
212
- is_read_ulong(InStream *is)
213
- {
214
- return ((ullong)is_read_byte(is) << 56) |
215
- ((ullong)is_read_byte(is) << 48) |
216
- ((ullong)is_read_byte(is) << 40) |
217
- ((ullong)is_read_byte(is) << 32) |
218
- ((ullong)is_read_byte(is) << 24) |
219
- ((ullong)is_read_byte(is) << 16) |
220
- ((ullong)is_read_byte(is) << 8) |
221
- (ullong)is_read_byte(is);
222
- }
223
-
224
- /* optimized to use unchecked read_byte if there is definitely space */
225
- inline ullong
226
- is_read_vint(InStream *is)
227
- {
228
- register ullong res, b;
229
- register int shift = 7;
230
-
231
- if (is->buf.pos > (is->buf.len - VINT_MAX_LEN)) {
232
- b = is_read_byte(is);
233
- res = b & 0x7F; // 0x7F = 0b01111111
234
-
235
- while ((b & 0x80) != 0) {// 0x80 = 0b10000000
236
- b = is_read_byte(is);
237
- res |= (b & 0x7F) << shift;
238
- shift += 7;
239
- }
240
- } else { // unchecked
241
- b = read_byte(is);
242
- res = b & 0x7F; // 0x7F = 0b01111111
243
-
244
- while ((b & 0x80) != 0) {// 0x80 = 0b10000000
245
- b = read_byte(is);
246
- res |= (b & 0x7F) << shift;
247
- shift += 7;
248
- }
249
- }
250
-
251
- return res;
252
- }
253
-
254
- inline void
255
- is_skip_vints(InStream *is, register int cnt)
256
- {
257
- for (; cnt > 0; cnt--) {
258
- while ((is_read_byte(is) & 0x80) != 0) {
259
- }
260
- }
261
- }
262
-
263
- inline void
264
- is_read_chars(InStream *is, char* buffer, int off, int len)
265
- {
266
- int end, i;
267
-
268
- end = off + len;
269
-
270
- for(i = off; i < end; i++) {
271
- buffer[i] = is_read_byte(is);
272
- }
273
- }
274
-
275
- char *
276
- is_read_string(InStream *is)
277
- {
278
- register int length = (int)is_read_vint(is);
279
- char *str = ALLOC_N(char, length + 1);
280
- str[length] = '\0';
281
-
282
- if (is->buf.pos > (is->buf.len - length)) {
283
- register int i;
284
- for(i = 0; i < length; i++) {
285
- str[i] = is_read_byte(is);
286
- }
287
- } else { // unchecked
288
- memcpy(str, is->buf.buf + is->buf.pos, length);
289
- is->buf.pos += length;
290
- }
291
- //is_read_chars(is, str, 0, length);
292
-
293
- return str;
294
- }
295
-
296
- void
297
- os_write_int(OutStream *os, int l)
298
- {
299
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
300
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
301
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
302
- os_write_byte(os, (uchar)(l & 0xFF));
303
- }
304
-
305
- void
306
- os_write_long(OutStream *os, llong l)
307
- {
308
- os_write_byte(os, (uchar)((l >> 56) & 0xFF));
309
- os_write_byte(os, (uchar)((l >> 48) & 0xFF));
310
- os_write_byte(os, (uchar)((l >> 40) & 0xFF));
311
- os_write_byte(os, (uchar)((l >> 32) & 0xFF));
312
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
313
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
314
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
315
- os_write_byte(os, (uchar)(l & 0xFF));
316
- }
317
-
318
- void
319
- os_write_uint(OutStream *os, unsigned int l)
320
- {
321
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
322
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
323
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
324
- os_write_byte(os, (uchar)(l & 0xFF));
325
- }
326
-
327
- void
328
- os_write_ulong(OutStream *os, ullong l)
329
- {
330
- os_write_byte(os, (uchar)((l >> 56) & 0xFF));
331
- os_write_byte(os, (uchar)((l >> 48) & 0xFF));
332
- os_write_byte(os, (uchar)((l >> 40) & 0xFF));
333
- os_write_byte(os, (uchar)((l >> 32) & 0xFF));
334
- os_write_byte(os, (uchar)((l >> 24) & 0xFF));
335
- os_write_byte(os, (uchar)((l >> 16) & 0xFF));
336
- os_write_byte(os, (uchar)((l >> 8) & 0xFF));
337
- os_write_byte(os, (uchar)(l & 0xFF));
338
- }
339
-
340
- /* optimized to use an unchecked write if there is space */
341
- inline void
342
- os_write_vint(OutStream *os, register ullong i)
343
- {
344
- if (os->buf.pos > VINT_END) {
345
- while (i > 127) {
346
- os_write_byte(os, (uchar)((i & 0x7f) | 0x80));
347
- i >>= 7;
348
- }
349
- os_write_byte(os, (uchar)(i));
350
- } else {
351
- while (i > 127) {
352
- write_byte(os, (uchar)((i & 0x7f) | 0x80));
353
- i >>= 7;
354
- }
355
- write_byte(os, (uchar)(i));
356
- }
357
- }
358
-
359
- void
360
- os_write_chars(OutStream *os, char *buf, int start, int length)
361
- {
362
- int i;
363
-
364
- for (i = start; i < start + length; i++) {
365
- os_write_byte(os, buf[i]);
366
- }
367
- }
368
-
369
- void
370
- os_write_string(OutStream *os, char *str)
371
- {
372
- int len = (int)strlen(str);
373
- os_write_vint(os, len);
374
-
375
- os_write_chars(os, str, 0, len);
376
- }
377
-
378
- int file_is_lock(char *filename)
379
- {
380
- int start = (int)strlen(filename) - 4;
381
- return ((start > 0) && (strcmp(".lck", &filename[start]) == 0));
382
- }
data/ext/index_rw.c DELETED
@@ -1,2658 +0,0 @@
1
- #include "index.h"
2
- #include <stdlib.h>
3
- #include <string.h>
4
- #include <array.h>
5
- static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
6
- static char * const WRITE_LOCK_ERROR_MSG = "Could not obtain write lock when trying to write index";
7
- static char * const COMMIT_LOCK_ERROR_MSG = "Could not obtain commit lock when trying to write index";
8
- static char * const DELETED_DOC_ERROR_MSG = "Tried to get doc that has already been deleted";
9
- static char * const INVALID_FIELD_TYPE_MSG = "Invalid field-type";
10
- static char * const DOC_ORDER_ERROR_MSG = "docs out of order curent";
11
- static char * const STALE_READER_ERROR_MSG = "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations";
12
-
13
- const char *INDEX_EXTENSIONS[] = {
14
- "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
15
- "tvx", "tvd", "tvf", "tvp"
16
- };
17
-
18
- const char *COMPOUND_EXTENSIONS[] = {
19
- "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
20
- };
21
-
22
- const char *VECTOR_EXTENSIONS[] = {
23
- "tvx", "tvd", "tvf"
24
- };
25
-
26
- FerretConfig config = {
27
- 10, /* default merge_factor */
28
- 10, /* default min_merge_docs */
29
- INT_MAX, /* default max_merge_docs */
30
- 10000, /* default max_field_length */
31
- 128 /* default term_index_interval */
32
- };
33
-
34
- /***************************************************************************
35
- *
36
- * CacheObject
37
- *
38
- ***************************************************************************/
39
-
40
- unsigned int co_hash(const void *key)
41
- {
42
- return (unsigned int)key;
43
- }
44
-
45
- int co_eq(const void *key1, const void *key2)
46
- {
47
- return (key1 == key2);
48
- }
49
-
50
- void co_destroy(CacheObject *self)
51
- {
52
- h_rem(self->ref_tab1, self->ref2, false);
53
- h_rem(self->ref_tab2, self->ref1, false);
54
- self->destroy(self->obj);
55
- free(self);
56
- }
57
-
58
- CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
59
- void *ref1, void *ref2, free_ft destroy, void *obj)
60
- {
61
- CacheObject *self = ALLOC(CacheObject);
62
- h_set(ref_tab1, ref2, self);
63
- h_set(ref_tab2, ref1, self);
64
- self->ref_tab1 = ref_tab1;
65
- self->ref_tab2 = ref_tab2;
66
- self->ref1 = ref1;
67
- self->ref2 = ref2;
68
- self->destroy = destroy;
69
- self->obj = obj;
70
- return self;
71
- }
72
-
73
- HshTable *co_hsh_create()
74
- {
75
- return h_new(&co_hash, &co_eq, (free_ft)NULL, (free_ft)&co_destroy);
76
- }
77
-
78
- /***************************************************************************
79
- *
80
- * Posting
81
- *
82
- ***************************************************************************/
83
-
84
- Posting *p_create(Term *term, int position, TVOffsetInfo *offset)
85
- {
86
- Posting *self = ALLOC(Posting);
87
- self->freq = 1;
88
- self->size = 1;
89
- self->term = term;
90
- self->positions = ALLOC(int);
91
- self->positions[0] = position;
92
- self->offsets = ALLOC(TVOffsetInfo *);
93
- self->offsets[0] = offset;
94
- return self;
95
- }
96
-
97
- void p_destroy(Posting *self)
98
- {
99
- /* the positions and offsets will be put in a TVTerm so no need to free */
100
- int i;
101
- free(self->positions);
102
- for (i = 0; i < self->freq; i++)
103
- tvoi_destroy(self->offsets[i]);
104
- free(self->offsets);
105
- free(self);
106
- }
107
-
108
- void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset)
109
- {
110
- if (self->freq >= self->size) {
111
- self->size *= 2;
112
- REALLOC_N(self->positions, int, self->size);
113
- REALLOC_N(self->offsets, TVOffsetInfo *, self->size);
114
- }
115
- self->positions[self->freq] = position;
116
- self->offsets[self->freq] = offset;
117
- self->freq++;
118
- }
119
-
120
- inline int p_cmp(const void *const p1, const void *const p2)
121
- {
122
- Term *t1 = (*(Posting **)p1)->term;
123
- Term *t2 = (*(Posting **)p2)->term;
124
- int res = strcmp(t1->field, t2->field);
125
- if (res != 0) {
126
- return res;
127
- } else {
128
- return strcmp(t1->text, t2->text);
129
- }
130
- }
131
-
132
- DocumentWriter *dw_open(Store *store,
133
- Analyzer *analyzer,
134
- Similarity *similarity,
135
- int max_field_length,
136
- int term_index_interval)
137
- {
138
- DocumentWriter *self = ALLOC(DocumentWriter);
139
- self->store = store;
140
- self->analyzer = analyzer;
141
- self->similarity = similarity;
142
- self->fis = NULL;
143
- self->postingtable = h_new(&term_hash, &term_eq,
144
- (free_ft)&term_destroy,
145
- (free_ft)&p_destroy);
146
- self->max_field_length = max_field_length;
147
- self->term_index_interval = term_index_interval;
148
- return self;
149
- }
150
-
151
- void dw_close(DocumentWriter *self)
152
- {
153
- if (self->fis) fis_destroy(self->fis);
154
- h_destroy(self->postingtable);
155
- free(self);
156
- }
157
-
158
- void dw_add_position(DocumentWriter *self, char *field, char *text,
159
- int position, TVOffsetInfo *offset)
160
- {
161
- Term termbuf = {field, text}, *term;
162
- Posting *p = (Posting *)h_get(self->postingtable, &termbuf);
163
-
164
- if (p) { /* word seen before */
165
- if (p->freq >= p->size) {
166
- /* double size of posting to make room for more posts. */
167
- p->size <<= 1;
168
- REALLOC_N(p->positions, int, p->size);
169
- p->offsets = REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
170
- }
171
- p->positions[p->freq] = position; /* add new position */
172
- p->offsets[p->freq] = offset; /* add new offset */
173
- p->freq++; /* update frequency */
174
- } else { /* word not seen before */
175
- term = term_create(field, text);
176
- h_set(self->postingtable, term, p_create(term, position, offset));
177
- }
178
- }
179
-
180
- void dw_invert_doc(DocumentWriter *self, Document *doc)
181
- {
182
- int i;
183
- int dfcnt = doc->dfcnt;
184
- char *field_name, *text;
185
- int field_number, length, position, offset, slen;
186
- TokenStream *stream;
187
- Token *token;
188
- FieldInfo *fi;
189
- char text_buf[MAX_WORD_SIZE];
190
- text_buf[MAX_WORD_SIZE - 1] = '\0';
191
-
192
- DocField **fields = doc->df_arr, *field;
193
- for (i = 0; i < dfcnt; i++) {
194
- field = fields[i];
195
- field_name = field->name;
196
- fi = ((FieldInfo *)ht_get(self->fis->by_name, field_name));
197
- field_number = fi->number;
198
-
199
- length = self->field_lengths[field_number];
200
- offset = self->field_offsets[field_number];
201
- position = self->field_positions[field_number];
202
-
203
- if (fi->is_indexed) {
204
- if (!field->is_tokenized) { /* un-tokenized field */
205
- text = field->data;
206
- slen = (int)strlen(text);
207
- if (slen >= MAX_WORD_SIZE) {
208
- slen = MAX_WORD_SIZE - 1;
209
- text = strncpy(text_buf, text, MAX_WORD_SIZE - 1);
210
- }
211
- if (fi->store_offset) {
212
- dw_add_position(self, field_name, text, position,
213
- tvoi_create(offset, offset+slen));
214
- } else {
215
- dw_add_position(self, field_name, text, position, NULL);
216
- }
217
- offset += slen;
218
- length++;
219
- } else {
220
-
221
- /* Tokenize field and add to posting_table */
222
- stream = a_get_ts(self->analyzer, field_name, field->data);
223
-
224
- while ((token = ts_next(stream)) != NULL) {
225
- position += (token->pos_inc - 1);
226
-
227
- if (fi->store_offset) {
228
- dw_add_position(self,
229
- field_name,
230
- token->text,
231
- position,
232
- tvoi_create(offset + token->start, offset + token->end));
233
- position++;
234
- } else {
235
- dw_add_position(self, field_name, token->text, position, NULL);
236
- position++;
237
- }
238
-
239
- length++;
240
- /* stop if we reach the max field length */
241
- if (length > self->max_field_length) {
242
- break;
243
- }
244
- }
245
-
246
- if (token) {
247
- offset += token->end + 1;
248
- }
249
- }
250
- self->field_lengths[field_number] = length;
251
- self->field_offsets[field_number] = offset;
252
- self->field_positions[field_number] = position;
253
- self->field_boosts[field_number] *= field->boost;
254
- }
255
- }
256
- }
257
-
258
- Posting **dw_sort_posting_table(DocumentWriter *self)
259
- {
260
- HshTable *ht = self->postingtable;
261
- HshEntry *he = ht->table;
262
- Posting **postings;
263
- int i;
264
-
265
- self->pcnt = i = ht->used;
266
- postings = ALLOC_N(Posting *, i);
267
-
268
- while (i > 0) {
269
- if (he->value != NULL) {
270
- i--;
271
- postings[i] = (Posting *)he->value;
272
- }
273
- he++;
274
- }
275
- qsort(postings, self->pcnt, sizeof(Posting *), &p_cmp);
276
- return postings;
277
- }
278
-
279
- void dw_write_postings(DocumentWriter *self, Posting **postings, char *segment)
280
- {
281
- OutStream * volatile freq_out = NULL, * volatile prox_out = NULL;
282
- TermInfosWriter * volatile tiw = NULL;
283
- TermVectorsWriter * volatile tvw = NULL;
284
- Store *store = self->store;
285
- TermInfo * volatile ti = NULL;
286
- Posting *posting;
287
- int i, j, posting_freq, position, last_position;
288
- char fname[SEGMENT_NAME_MAX_LENGTH], *curr_field = NULL, *term_field;
289
- strcpy(fname, segment);
290
-
291
- TRY
292
- /* open files for inverse index storage */
293
- sprintf(fname, "%s.frq", segment);
294
- freq_out = store->create_output(store, fname);
295
- sprintf(fname, "%s.prx", segment);
296
- prox_out = store->create_output(store, fname);
297
- tiw = tiw_open(store, segment, self->fis, self->term_index_interval);
298
- ti = ti_create(0, 0, 0, 0);
299
-
300
- for (i = 0; i < self->pcnt; i++) {
301
- posting = postings[i];
302
-
303
- /* add an entry to dictionary with pointers to prox and freq_out files */
304
- ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
305
- tiw_add(tiw, posting->term, ti);
306
-
307
- /* add an entry to the freq_out file */
308
- posting_freq = posting->freq;
309
- if (posting_freq == 1) { /* optimize freq=1 */
310
- os_write_vint(freq_out, 1); /* set low bit of doc num */
311
- } else {
312
- os_write_vint(freq_out, 0); /* the doc number */
313
- os_write_vint(freq_out, posting_freq); /* frequency in doc */
314
- }
315
-
316
- last_position = 0; /* write positions */
317
-
318
- for (j = 0; j < posting_freq; j++) {
319
- position = posting->positions[j];
320
- os_write_vint(prox_out, position - last_position);
321
- last_position = position;
322
- }
323
-
324
- /* check to see if we switched to a new field */
325
- term_field = posting->term->field;
326
- if (curr_field != term_field) {
327
- FieldInfo *fi;
328
- /* changing field - see if there is something to save */
329
- curr_field = term_field;
330
- fi = (FieldInfo *)ht_get(self->fis->by_name, curr_field);
331
- if (fi->store_tv) {
332
- if (tvw == NULL) {
333
- tvw = tvw_open(store, segment, self->fis);
334
- tvw_open_doc(tvw);
335
- }
336
- tvw_open_field(tvw, curr_field);
337
-
338
- } else if (tvw != NULL) {
339
- tvw_close_field(tvw);
340
- }
341
- }
342
- /* tvw->curr_field != NULL implies field is still open */
343
- if (tvw != NULL && tvw->curr_field != NULL) {
344
- tvw_add_term(tvw, posting->term->text, posting_freq, posting->positions, posting->offsets);
345
- }
346
- }
347
- XFINALLY
348
- if (tvw) {
349
- tvw_close_doc(tvw);
350
- tvw_close(tvw);
351
- }
352
- /* make an effort to close all streams we can but remember and re-raise
353
- * the last exception encountered in this process */
354
- if (freq_out) os_close(freq_out);
355
- if (prox_out) os_close(prox_out);
356
- if (tiw) tiw_close(tiw);
357
- if (ti) ti_destroy(ti);
358
- XENDTRY
359
- }
360
-
361
- void dw_write_norms(DocumentWriter *self, char *segment)
362
- {
363
- int i;
364
- float norm;
365
- OutStream *norms_out;
366
- char fname[SEGMENT_NAME_MAX_LENGTH];
367
- FieldInfos *fis = self->fis;
368
- FieldInfo *fi;
369
-
370
- for (i = 0; i < fis->fcnt; i++) {
371
- fi = fis->by_number[i];
372
-
373
- if (fi->is_indexed && !fi->omit_norms) {
374
- norm = self->field_boosts[i] *
375
- sim_length_norm(self->similarity, fi->name, self->field_lengths[i]);
376
- sprintf(fname, "%s.f%d", segment, i);
377
- norms_out = self->store->create_output(self->store, fname);
378
- TRY
379
- os_write_byte(norms_out, sim_encode_norm(self->similarity, norm));
380
- XFINALLY
381
- os_close(norms_out);
382
- XENDTRY
383
- }
384
- }
385
- }
386
-
387
- void dw_add_doc(DocumentWriter *self, char *segment, Document *doc)
388
- {
389
- Posting **postings;
390
- FieldsWriter *fw;
391
- int i;
392
-
393
- /* write field names */
394
- self->fis = fis_create();
395
- fis_add_doc(self->fis, doc);
396
- fis_write(self->fis, self->store, segment, ".fnm");
397
-
398
- /* write field values */
399
- fw = fw_open(self->store, segment, self->fis);
400
- TRY
401
- fw_add_doc(fw, doc);
402
- XFINALLY
403
- fw_close(fw);
404
- XENDTRY
405
-
406
- /* invert doc into posting_table */
407
-
408
- h_clear(self->postingtable); /* clear posting_table */
409
-
410
- self->field_boosts = ALLOC_N(float, self->fis->fcnt);
411
- self->field_lengths = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
412
- self->field_offsets = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
413
- self->field_positions = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
414
-
415
- for (i = 0; i < self->fis->fcnt; i++) {
416
- self->field_boosts[i] = doc->boost;
417
- }
418
-
419
- dw_invert_doc(self, doc);
420
-
421
- /* sort posting_table into an array */
422
- postings = dw_sort_posting_table(self);
423
-
424
- /* write postings */
425
- dw_write_postings(self, postings, segment);
426
- free(postings);
427
-
428
- /* write norms of indexed fields */
429
- dw_write_norms(self, segment);
430
-
431
- free(self->field_boosts);
432
- free(self->field_lengths);
433
- free(self->field_offsets);
434
- free(self->field_positions);
435
- }
436
-
437
- /****************************************************************************
438
- *
439
- * SegmentInfo
440
- *
441
- ****************************************************************************/
442
-
443
- SegmentInfo *si_create(char *name, int doc_cnt, Store *store)
444
- {
445
- SegmentInfo *si = ALLOC(SegmentInfo);
446
- si->name = name;
447
- si->doc_cnt = doc_cnt;
448
- si->store = store;
449
- return si;
450
- }
451
-
452
- void si_destroy(SegmentInfo *si)
453
- {
454
- free(si->name);
455
- free(si);
456
- }
457
-
458
- bool si_has_deletions(SegmentInfo *si)
459
- {
460
- char del_file_name[SEGMENT_NAME_MAX_LENGTH];
461
- sprintf(del_file_name, "%s.del", si->name);
462
- return si->store->exists(si->store, del_file_name);
463
- }
464
-
465
- bool si_uses_compound_file(SegmentInfo *si)
466
- {
467
- char compound_file_name[SEGMENT_NAME_MAX_LENGTH];
468
- sprintf(compound_file_name, "%s.cfs", si->name);
469
- return si->store->exists(si->store, compound_file_name);
470
- }
471
-
472
- struct NormTester {
473
- bool has_norm_file;
474
- char *segment_name;
475
- };
476
- void is_norm_file(char *fname, void *arg)
477
- {
478
- struct NormTester *nt = (struct NormTester *)arg;
479
- char norm_file_pattern[SEGMENT_NAME_MAX_LENGTH];
480
- sprintf(norm_file_pattern, "%s.s", nt->segment_name);
481
- if (strncmp(fname, norm_file_pattern, strlen(norm_file_pattern)) == 0) {
482
- nt->has_norm_file = true;
483
- }
484
- }
485
-
486
- bool si_has_separate_norms(SegmentInfo *si)
487
- {
488
- struct NormTester nt;
489
- nt.segment_name = si->name;
490
- nt.has_norm_file = false;
491
- si->store->each(si->store, &is_norm_file, &nt);
492
-
493
- return nt.has_norm_file;
494
- }
495
-
496
-
497
- /****************************************************************************
498
- *
499
- * SegmentInfos
500
- *
501
- ****************************************************************************/
502
-
503
- #include <time.h>
504
- #define FORMAT -1
505
- #define SEGMENT_FILENAME "segments"
506
- #define TEMPORARY_SEGMENT_FILENAME "segments.new"
507
-
508
- SegmentInfos *sis_create()
509
- {
510
- SegmentInfos *sis = ALLOC(SegmentInfos);
511
- sis->format = FORMAT;
512
- sis->version = (unsigned int)time(NULL);
513
- sis->scnt = 0;
514
- sis->counter = 0;
515
- sis->size = 4;
516
- sis->segs = ALLOC_N(SegmentInfo *, sis->size);
517
- return sis;
518
- }
519
-
520
- void sis_destroy_not_infos(SegmentInfos *sis)
521
- {
522
- free(sis->segs);
523
- free(sis);
524
- }
525
-
526
- void sis_destroy(SegmentInfos *sis)
527
- {
528
- int i;
529
- for (i = 0; i < sis->scnt; i++)
530
- si_destroy(sis->segs[i]);
531
- free(sis->segs);
532
- free(sis);
533
- }
534
-
535
- void sis_add_si(SegmentInfos *sis, SegmentInfo *si)
536
- {
537
- if (sis->scnt >= sis->size) {
538
- sis->size = sis->scnt * 2;
539
- REALLOC_N(sis->segs, SegmentInfo *, sis->size);
540
- }
541
- sis->segs[sis->scnt] = si;
542
- sis->scnt++;
543
- }
544
-
545
- void sis_del_at(SegmentInfos *sis, int at)
546
- {
547
- int i;
548
- si_destroy(sis->segs[at]);
549
- sis->scnt--;
550
- for (i = at; i < sis->scnt; i++) {
551
- sis->segs[i] = sis->segs[i+1];
552
- }
553
- }
554
-
555
- void sis_del_from_to(SegmentInfos *sis, int from, int to)
556
- {
557
- int i, num_to_del = to - from;
558
- sis->scnt -= num_to_del;
559
- for (i = from; i < to; i++) {
560
- si_destroy(sis->segs[i]);
561
- }
562
- for (i = from; i < sis->scnt; i++) {
563
- sis->segs[i] = sis->segs[i+num_to_del];
564
- }
565
- }
566
-
567
- void sis_clear(SegmentInfos *sis)
568
- {
569
- int i;
570
- for (i = 0; i < sis->scnt; i++) {
571
- si_destroy(sis->segs[i]);
572
- }
573
- sis->scnt = 0;
574
- }
575
-
576
- void sis_read(SegmentInfos *sis, Store *store)
577
- {
578
- int doc_cnt;
579
- int seg_count;
580
- int i;
581
- char *name;
582
- InStream *is = store->open_input(store, SEGMENT_FILENAME);
583
-
584
- TRY
585
-
586
- sis->format = is_read_int(is);
587
- if (sis->format < 0) { /* file contains explicit format info */
588
- /* check that it is a format we can understand */
589
- if (sis->format < FORMAT)
590
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
591
- sis->version = (uint)is_read_long(is);
592
- sis->counter = (int)is_read_int(is);
593
- } else { /* file is in old format without explicit format info */
594
- sis->counter = sis->format;
595
- }
596
-
597
- seg_count = is_read_int(is);
598
- for (i = 0; i < seg_count; i++) {
599
- name = is_read_string(is);
600
- doc_cnt = is_read_int(is);
601
- sis_add_si(sis, si_create(name, doc_cnt, store));
602
- }
603
-
604
- if (sis->format >= 0) {
605
- /* in old format the version number may be at the end of the file */
606
- if (is_pos(is) >= is_length(is)) {
607
- sis->version = 0; /* old file format without version number */
608
- } else {
609
- sis->version = (int)is_read_long(is); /* read version */
610
- }
611
- }
612
- XFINALLY
613
- is_close(is);
614
- XENDTRY
615
- }
616
-
617
- void sis_write(SegmentInfos *sis, Store *store)
618
- {
619
- int i;
620
- SegmentInfo *si;
621
- OutStream *os = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);
622
- TRY
623
- os_write_int(os, FORMAT);
624
- os_write_long(os, ++(sis->version)); /* every write changes the index */
625
- os_write_int(os, sis->counter);
626
- os_write_int(os, sis->scnt);
627
- for (i = 0; i < sis->scnt; i++) {
628
- si = sis->segs[i];
629
- os_write_string(os, si->name);
630
- os_write_int(os, si->doc_cnt);
631
- }
632
-
633
- XFINALLY
634
- os_close(os);
635
- XENDTRY
636
-
637
- /* install new segment info */
638
- store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
639
- }
640
-
641
- int sis_read_current_version(Store *store)
642
- {
643
- InStream *is;
644
- SegmentInfos *sis;
645
- int format = 0;
646
- int version = 0;
647
-
648
- if (!store->exists(store, SEGMENT_FILENAME))
649
- return 0;
650
- is = store->open_input(store, SEGMENT_FILENAME);
651
-
652
- TRY
653
- format = is_read_int(is);
654
- if (format < 0) {
655
- if (format < FORMAT)
656
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
657
- version = (int)is_read_long(is);
658
- }
659
- XFINALLY
660
- is_close(is);
661
- XENDTRY
662
-
663
- if (format < 0)
664
- return version;
665
-
666
- /* We cannot be sure about the format of the file.
667
- * Therefore we have to read the whole file and cannot simply
668
- * seek to the version entry. */
669
-
670
- sis = sis_create();
671
- sis_read(sis, store);
672
- version = sis->version;
673
- sis_destroy(sis);
674
- return version;
675
- }
676
-
677
- /****************************************************************************
678
- *
679
- * IndexWriter
680
- *
681
- ****************************************************************************/
682
-
683
- /**
684
- * Deletes the analyzer by default but leaves the store by default
685
- */
686
- IndexWriter *iw_open(Store *store, Analyzer *analyzer, bool create)
687
- {
688
- IndexWriter *iw = ALLOC(IndexWriter);
689
- if (create)
690
- store->clear_all(store);
691
- mutex_init(&iw->mutex, NULL);
692
- iw->merge_factor = config.merge_factor;
693
- iw->min_merge_docs = config.min_merge_docs;
694
- iw->max_merge_docs = config.max_merge_docs;
695
- iw->max_field_length = config.max_field_length;
696
- iw->term_index_interval = config.term_index_interval;
697
- iw->use_compound_file = true;
698
- iw->store = store;
699
- ref(store);
700
- iw->analyzer = analyzer;
701
- iw->sis = sis_create();
702
- iw->similarity = sim_create_default();
703
- iw->ram_store = open_ram_store();
704
-
705
- mutex_lock(&store->mutex);
706
- /* keep the write_lock obtained until the IndexWriter is closed. */
707
- iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
708
- if (!iw->write_lock->obtain(iw->write_lock)) {
709
- RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
710
- }
711
-
712
- if (create) {
713
- Lock *commit_lock = store->open_lock(store, COMMIT_LOCK_NAME);
714
- if (!commit_lock->obtain(commit_lock)) {
715
- store->close_lock(commit_lock);
716
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
717
- }
718
- TRY
719
- /* commit the index */
720
- store->clear(store);
721
- sis_write(iw->sis, store);
722
- XFINALLY
723
- commit_lock->release(commit_lock);
724
- store->close_lock(commit_lock);
725
- XENDTRY
726
- } else {
727
- sis_read(iw->sis, store);
728
- }
729
- mutex_unlock(&store->mutex);
730
- return iw;
731
- }
732
-
733
- const char base36_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
734
-
735
- char *new_segment_name(int counter)
736
- {
737
- char buf[SEGMENT_NAME_MAX_LENGTH];
738
- int i;
739
-
740
- buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
741
- for (i = SEGMENT_NAME_MAX_LENGTH - 2; ; i--) {
742
- buf[i] = base36_digitmap[counter%36];
743
- counter /= 36;
744
- if (counter == 0) break;
745
- }
746
- i--;
747
- buf[i] = '_';
748
- return estrdup(&buf[i]);
749
- }
750
-
751
- int iw_doc_count(IndexWriter *iw)
752
- {
753
- int i, doc_cnt = 0;
754
- mutex_lock(&iw->mutex);
755
- for (i = 0; i < iw->sis->scnt; i++)
756
- doc_cnt += iw->sis->segs[i]->doc_cnt;
757
- mutex_unlock(&iw->mutex);
758
- return doc_cnt;
759
- }
760
-
761
- void delete_files(Array *file_names, Store *store)
762
- {
763
- int i;
764
- for (i = 0; i < file_names->size; i++) {
765
- store->remove(store, (char *)file_names->elems[i]);
766
- }
767
- ary_destroy(file_names);
768
- }
769
-
770
-
771
- Array *sr_file_names(IndexReader *ir);
772
- void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_cnt)
773
- {
774
- /* The java version keeps a record of files that it couldn't delete. This
775
- * shouldn't be a problem on linux I hope. */
776
- IndexReader *ir;
777
- int i;
778
- for (i = 0; i < del_cnt; i++) {
779
- ir = segment_readers[i];
780
- delete_files(sr_file_names(ir), ir->store);
781
- }
782
- }
783
-
784
- void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merger)
785
- {
786
- Array *files_to_delete;
787
- Lock *commit_lock;
788
- char merged_tmp[SEGMENT_NAME_MAX_LENGTH], merged_cfs[SEGMENT_NAME_MAX_LENGTH];
789
-
790
- mutex_lock(&iw->store->mutex);
791
- sprintf(merged_tmp, "%s.tmp", merged_name);
792
- sprintf(merged_cfs, "%s.cfs", merged_name);
793
-
794
- files_to_delete = sm_create_compound_file(merger, merged_tmp);
795
- commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
796
-
797
- if (!commit_lock->obtain(commit_lock)) {
798
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
799
- }
800
-
801
- /* make compound file visible for SegmentReaders */
802
- iw->store->rename(iw->store, merged_tmp, merged_cfs);
803
-
804
- /* delete now unused files of segment */
805
- delete_files(files_to_delete, iw->store);
806
-
807
- commit_lock->release(commit_lock);
808
- iw->store->close_lock(commit_lock);
809
- mutex_unlock(&iw->store->mutex);
810
- }
811
-
812
- void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segment)
813
- {
814
- int i;
815
- int merged_doc_count;
816
- Lock *commit_lock;
817
- IndexReader **segments_to_delete = ALLOC_N(IndexReader *, max_segment - min_segment);
818
- int del_cnt = 0;
819
-
820
- char *merged_name = new_segment_name(iw->sis->counter++);
821
-
822
- SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
823
- IndexReader *reader;
824
-
825
-
826
- for (i = min_segment; i < max_segment; i++) {
827
- reader = sr_open(iw->sis, i, false);
828
- sm_add(merger, reader);
829
- if ((reader->store == iw->store) || /* if we own the directory */
830
- (reader->store == iw->ram_store)) {
831
- segments_to_delete[del_cnt++] = reader; /* queue segment for deletion */
832
- }
833
- }
834
-
835
- merged_doc_count = sm_merge(merger);
836
-
837
- sis_del_from_to(iw->sis, min_segment, max_segment);
838
-
839
- sis_add_si(iw->sis, si_create(merged_name, merged_doc_count, iw->store));
840
-
841
- /* close readers before we attempt to delete now-obsolete segments */
842
-
843
- mutex_lock(&iw->store->mutex);
844
- commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
845
- if (!commit_lock->obtain(commit_lock)) {
846
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
847
- }
848
- /* commit the index */
849
- sis_write(iw->sis, iw->store);
850
- iw_delete_segments(iw, segments_to_delete, del_cnt);
851
-
852
- commit_lock->release(commit_lock);
853
- iw->store->close_lock(commit_lock);
854
- mutex_unlock(&iw->store->mutex);
855
-
856
- if (iw->use_compound_file) {
857
- make_compound_file(iw, merged_name, merger);
858
- }
859
-
860
- free(segments_to_delete);
861
- sm_destroy(merger);
862
- }
863
-
864
- void iw_merge_segments(IndexWriter *iw, int min_segment)
865
- {
866
- iw_merge_segments_with_max(iw, min_segment, iw->sis->scnt);
867
- }
868
-
869
- void iw_maybe_merge_segments(IndexWriter *iw)
870
- {
871
- int target_merge_docs = iw->min_merge_docs;
872
- int min_segment, merge_docs;
873
- SegmentInfo *si;
874
-
875
- while (target_merge_docs <= iw->max_merge_docs) {
876
- /* find segments smaller than current target size */
877
- min_segment = iw->sis->scnt - 1;
878
- merge_docs = 0;
879
- while (min_segment >= 0) {
880
- si = iw->sis->segs[min_segment];
881
- if (si->doc_cnt >= target_merge_docs) {
882
- break;
883
- }
884
- merge_docs += si->doc_cnt;
885
- min_segment -= 1;
886
- }
887
-
888
- if (merge_docs >= target_merge_docs) { /* found a merge to do */
889
- iw_merge_segments(iw, min_segment + 1);
890
- } else {
891
- break;
892
- }
893
-
894
- target_merge_docs *= iw->merge_factor; /* increase target size */
895
- }
896
- }
897
-
898
- void iw_flush_ram_segments(IndexWriter *iw)
899
- {
900
- int min_segment = iw->sis->scnt-1;
901
- int doc_count = 0;
902
- SegmentInfo **segs = iw->sis->segs;
903
- while ((min_segment >= 0) &&
904
- (segs[min_segment]->store == iw->ram_store)) {
905
- doc_count += segs[min_segment]->doc_cnt;
906
- min_segment--;
907
- }
908
- /* the following if statement is actually incrementing for different
909
- * reasons. If min_segment < 0 then we must increment as we searched
910
- * off the end. If the top segment is not ram_store there are no
911
- * ram segments to flush so we increment so the next check will return
912
- * us from this function. Lastly, the min_segment stopped at a segment
913
- * that wasn't the ram segment. But if it fit's in with the merge
914
- * factor, why not merge it. Otherwise we leave it and increment min_seg
915
- */
916
- if ((min_segment < 0) || /* add one FS segment? */
917
- ((doc_count + segs[min_segment]->doc_cnt) > iw->merge_factor) ||
918
- (segs[iw->sis->scnt - 1]->store != iw->ram_store)) {
919
- min_segment++;
920
- }
921
- if (min_segment >= iw->sis->scnt) {
922
- return;
923
- }
924
- iw_merge_segments(iw, min_segment);
925
- }
926
-
927
- void iw_add_doc(IndexWriter *iw, Document *doc)
928
- {
929
- DocumentWriter *dw;
930
- char *segment_name;
931
-
932
- mutex_lock(&iw->mutex);
933
- dw = dw_open(iw->ram_store,
934
- iw->analyzer,
935
- iw->similarity,
936
- iw->max_field_length,
937
- iw->term_index_interval);
938
- segment_name = new_segment_name(iw->sis->counter++);
939
- dw_add_doc(dw, segment_name, doc);
940
- dw_close(dw);
941
- sis_add_si(iw->sis, si_create(segment_name, 1, iw->ram_store));
942
- iw_maybe_merge_segments(iw);
943
- mutex_unlock(&iw->mutex);
944
- }
945
-
946
- static inline void iw_optimize_internal(IndexWriter *iw)
947
- {
948
- int min_segment;
949
- iw_flush_ram_segments(iw);
950
- while (iw->sis->scnt > 1 ||
951
- (iw->sis->scnt == 1 &&
952
- ( si_has_deletions(iw->sis->segs[0]) ||
953
- (iw->sis->segs[0]->store != iw->store) ||
954
- (iw->use_compound_file &&
955
- (!si_uses_compound_file(iw->sis->segs[0]) ||
956
- si_has_separate_norms(iw->sis->segs[0])))))) {
957
- min_segment = iw->sis->scnt - iw->merge_factor;
958
- iw_merge_segments(iw, min_segment < 0 ? 0 : min_segment);
959
- }
960
- }
961
- void iw_optimize(IndexWriter *iw)
962
- {
963
- mutex_lock(&iw->mutex);
964
- iw_optimize_internal(iw);
965
- mutex_unlock(&iw->mutex);
966
- }
967
-
968
- void iw_close(IndexWriter *iw)
969
- {
970
- mutex_lock(&iw->mutex);
971
- iw_flush_ram_segments(iw);
972
- store_deref(iw->ram_store);
973
- sis_destroy(iw->sis);
974
-
975
- sim_destroy(iw->similarity);
976
- a_deref(iw->analyzer);
977
-
978
- iw->write_lock->release(iw->write_lock);
979
- iw->store->close_lock(iw->write_lock);
980
-
981
- store_deref(iw->store);
982
- mutex_destroy(&iw->mutex);
983
- free(iw);
984
- }
985
-
986
- void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
987
- {
988
- int i, j, end, start;
989
-
990
- mutex_lock(&iw->mutex);
991
- iw_optimize_internal(iw); /* start with zero or 1 seg */
992
-
993
- start = iw->sis->scnt;
994
-
995
- for (i = 0; i < cnt; i++) {
996
- Store *store = stores[i];
997
- SegmentInfos *sis = sis_create(); /* read infos from dir */
998
- sis_read(sis, store);
999
-
1000
- for (j = 0; j < sis->scnt; j++) {
1001
- SegmentInfo *si = sis->segs[j];
1002
- sis_add_si(iw->sis, si);
1003
- }
1004
- sis_destroy_not_infos(sis);
1005
- }
1006
-
1007
- /* merge newly added segments in log(n) passes */
1008
- while (iw->sis->scnt > start + iw->merge_factor) {
1009
- for (i = start + 1; i < iw->sis->scnt; i++) {
1010
- end = MIN(iw->sis->scnt, i + iw->merge_factor);
1011
- if (end - i > 1) {
1012
- iw_merge_segments_with_max(iw, i, end);
1013
- }
1014
- }
1015
- }
1016
-
1017
- /* final cleanup */
1018
- iw_optimize_internal(iw);
1019
- mutex_unlock(&iw->mutex);
1020
- }
1021
-
1022
-
1023
- /**
1024
- * This adds an array of readers to the index leaving the added readers open.
1025
- */
1026
- void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
1027
- {
1028
- IndexReader *ir = NULL;
1029
- int i, del_cnt = 0;
1030
- int doc_count;
1031
- char *merged_name;
1032
- SegmentMerger *merger;
1033
- Lock *commit_lock;
1034
-
1035
- mutex_lock(&iw->mutex);
1036
- iw_optimize_internal(iw); /* start with zero or 1 seg */
1037
-
1038
- merged_name = new_segment_name(iw->sis->counter++);
1039
-
1040
- merger = sm_create(iw->store, merged_name, iw->term_index_interval);
1041
- merger->readers->free_elem = NULL; /* don't close readers */
1042
-
1043
- if (iw->sis->scnt == 1) { /* add existing index, if any */
1044
- ir = sr_open_si(iw->sis->segs[0]);
1045
- sm_add(merger, ir);
1046
- del_cnt = 1;
1047
- }
1048
-
1049
- for (i = 0; i < cnt; i++) {
1050
- sm_add(merger, irs[i]);
1051
- }
1052
-
1053
- doc_count = sm_merge(merger); /* merge 'em */
1054
-
1055
- /* pop old infos and add new ones. */
1056
- sis_clear(iw->sis);
1057
- sis_add_si(iw->sis, si_create(merged_name, doc_count, iw->store));
1058
-
1059
-
1060
- commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
1061
- if (!commit_lock->obtain(commit_lock)) { /* obtain write lock */
1062
- RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
1063
- }
1064
-
1065
- sis_write(iw->sis, iw->store); /* commit changes */
1066
- iw_delete_segments(iw, &ir, del_cnt);
1067
- if (ir) ir_close(ir);
1068
-
1069
- commit_lock->release(commit_lock);
1070
- iw->store->close_lock(commit_lock);
1071
-
1072
- if (iw->use_compound_file) {
1073
- make_compound_file(iw, merged_name, merger);
1074
- }
1075
-
1076
- iw_optimize_internal(iw);
1077
- sm_destroy(merger);
1078
-
1079
- mutex_unlock(&iw->mutex);
1080
- }
1081
-
1082
- /****************************************************************************
1083
- *
1084
- * Norm
1085
- *
1086
- ****************************************************************************/
1087
-
1088
- Norm *norm_create(InStream *is, int field_num)
1089
- {
1090
- Norm *norm = ALLOC(Norm);
1091
- norm->is = is;
1092
- norm->field_num = field_num;
1093
- norm->bytes = NULL;
1094
- norm->is_dirty = false;
1095
- return norm;
1096
- }
1097
-
1098
- void norm_destroy(Norm *norm)
1099
- {
1100
- is_close(norm->is);
1101
- if (norm->bytes != NULL) {
1102
- free(norm->bytes);
1103
- }
1104
- free(norm);
1105
- }
1106
-
1107
- void norm_rewrite(Norm *norm, Store *store, char *segment,
1108
- int doc_count, Store *cfs_store)
1109
- {
1110
- OutStream *os;
1111
- char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
1112
- char norm_fname[SEGMENT_NAME_MAX_LENGTH];
1113
-
1114
- if (norm->bytes == NULL) {
1115
- return; /* These norms do not need to be rewritten */
1116
- }
1117
-
1118
- sprintf(tmp_fname, "%s.tmp", segment);
1119
- os = store->create_output(store, tmp_fname);
1120
- TRY
1121
- os_write_bytes(os, norm->bytes, doc_count);
1122
- XFINALLY
1123
- os_close(os);
1124
- XENDTRY
1125
- if (cfs_store) {
1126
- sprintf(norm_fname, "%s.s%d", segment, norm->field_num);
1127
- } else {
1128
- sprintf(norm_fname, "%s.f%d", segment, norm->field_num);
1129
- }
1130
- store->rename(store, tmp_fname, norm_fname);
1131
- norm->is_dirty = false;
1132
- }
1133
-
1134
- /****************************************************************************
1135
- *
1136
- * SegmentReader
1137
- *
1138
- ****************************************************************************/
1139
-
1140
- #define GET_SR SegmentReader *sr = (SegmentReader *)ir->data
1141
-
1142
- int sr_max_doc(IndexReader *ir)
1143
- {
1144
- return ((SegmentReader *)ir->data)->fr->len;
1145
- }
1146
-
1147
- static inline void sr_close_norms(SegmentReader *sr)
1148
- {
1149
- h_destroy(sr->norms);
1150
- }
1151
-
1152
- static inline TermVectorsReader *sr_tvr(SegmentReader *sr)
1153
- {
1154
- TermVectorsReader *tvr;
1155
- if ((tvr = thread_getspecific(sr->thread_tvr)) == NULL) {
1156
- tvr = tvr_clone(sr->orig_tvr);
1157
- if (tvr == NULL) printf("scuk\n");
1158
- ary_append(sr->tvr_bucket, tvr);
1159
- thread_setspecific(sr->thread_tvr, tvr);
1160
- }
1161
- return tvr;
1162
- }
1163
-
1164
- void sr_close(IndexReader *ir)
1165
- {
1166
- GET_SR;
1167
- fr_close(sr->fr);
1168
- tir_close(sr->tir);
1169
-
1170
- if (sr->freq_in) is_close(sr->freq_in);
1171
- if (sr->prox_in) is_close(sr->prox_in);
1172
-
1173
- fis_destroy(sr->fis);
1174
- sr_close_norms(sr);
1175
-
1176
- if (sr->orig_tvr) {
1177
- tvr_close(sr->orig_tvr);
1178
- thread_key_delete(sr->thread_tvr);
1179
- ary_destroy(sr->tvr_bucket);
1180
- }
1181
- if (sr->deleted_docs) bv_destroy(sr->deleted_docs);
1182
- if (sr->cfs_store) store_deref(sr->cfs_store);
1183
- if (sr->fake_norms) free(sr->fake_norms);
1184
- free(sr->segment);
1185
- free(sr);
1186
- }
1187
-
1188
- void sr_delete_doc(IndexReader *ir, int doc_num)
1189
- {
1190
- GET_SR;
1191
- if (sr->deleted_docs == NULL)
1192
- sr->deleted_docs = bv_create();
1193
-
1194
- sr->deleted_docs_dirty = true;
1195
- sr->undelete_all = false;
1196
- bv_set(sr->deleted_docs, doc_num);
1197
- }
1198
-
1199
- static inline bool sr_is_deleted_internal(IndexReader *ir, int doc_num)
1200
- {
1201
- GET_SR;
1202
- return (sr->deleted_docs != NULL && bv_get(sr->deleted_docs, doc_num));
1203
- }
1204
-
1205
- bool sr_is_deleted(IndexReader *ir, int doc_num)
1206
- {
1207
- bool is_del;
1208
-
1209
- mutex_lock(&ir->mutex);
1210
- is_del = sr_is_deleted_internal(ir, doc_num);
1211
- mutex_unlock(&ir->mutex);
1212
-
1213
- return is_del;
1214
- }
1215
-
1216
- bool sr_has_norms(IndexReader *ir, char *field)
1217
- {
1218
- GET_SR;
1219
- bool has_norms;
1220
- mutex_lock(&ir->mutex);
1221
- has_norms = h_has_key(sr->norms, field);
1222
- mutex_unlock(&ir->mutex);
1223
-
1224
- return has_norms;
1225
- }
1226
-
1227
- bool sr_has_deletions(IndexReader *ir)
1228
- {
1229
- GET_SR;
1230
- return (sr->deleted_docs != NULL);
1231
- }
1232
-
1233
- void sr_undelete_all(IndexReader *ir)
1234
- {
1235
- GET_SR;
1236
- sr->undelete_all = true;
1237
- sr->deleted_docs_dirty = false;
1238
- if (sr->deleted_docs != NULL) bv_destroy(sr->deleted_docs);
1239
- sr->deleted_docs = NULL;
1240
- }
1241
-
1242
- TermEnum *sr_terms(IndexReader *ir)
1243
- {
1244
- TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
1245
- return te->clone(te);
1246
- }
1247
-
1248
- TermEnum *sr_terms_from(IndexReader *ir, Term *term)
1249
- {
1250
- TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
1251
- TermEnum *ret_te = te->clone(te);
1252
- te_skip_to(ret_te, term);
1253
- return ret_te;
1254
- }
1255
-
1256
- Document *sr_get_doc(IndexReader *ir, int doc_num)
1257
- {
1258
- GET_SR;
1259
- Document *doc;
1260
- mutex_lock(&ir->mutex);
1261
- if (sr_is_deleted_internal(ir, doc_num)) {
1262
- mutex_unlock(&ir->mutex);
1263
- RAISE(STATE_ERROR, DELETED_DOC_ERROR_MSG);
1264
- }
1265
- doc = fr_get_doc(sr->fr, doc_num);
1266
- mutex_unlock(&ir->mutex);
1267
- return doc;
1268
- }
1269
-
1270
- static inline void
1271
- sr_get_norms_into_internal(IndexReader *ir, char *field, uchar *buf, int offset)
1272
- {
1273
- GET_SR;
1274
- Norm *norm = h_get(sr->norms, field);
1275
- if (norm == NULL) {
1276
- memset(buf + offset*sizeof(uchar), 0, sr_max_doc(ir)*sizeof(uchar));
1277
- } else if (norm->bytes != NULL) { /* can copy from cache */
1278
- memcpy(buf + offset*sizeof(uchar), norm->bytes, sr_max_doc(ir)*sizeof(uchar));
1279
- } else {
1280
- InStream *norm_in = is_clone(norm->is);
1281
- /* read from disk */
1282
- is_seek(norm_in, 0);
1283
- is_read_bytes(norm_in, buf, offset, sr_max_doc(ir));
1284
- is_close(norm_in);
1285
- }
1286
- }
1287
-
1288
- void sr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
1289
- {
1290
- mutex_lock(&ir->mutex);
1291
- sr_get_norms_into_internal(ir, field, buf, offset);
1292
- mutex_unlock(&ir->mutex);
1293
- }
1294
-
1295
- static inline uchar *sr_get_norms_internal(IndexReader *ir, char *field)
1296
- {
1297
- GET_SR;
1298
- Norm *norm = h_get(sr->norms, field);
1299
- if (norm == NULL) { /* not an indexed field */
1300
- return NULL;
1301
- }
1302
-
1303
- if (norm->bytes == NULL) { /* value not yet read */
1304
- uchar *bytes = ALLOC_N(uchar, ir->max_doc(ir));
1305
- sr_get_norms_into_internal(ir, field, bytes, 0);
1306
- norm->bytes = bytes; /* cache it */
1307
- }
1308
- return norm->bytes;
1309
- }
1310
-
1311
- uchar *sr_get_norms(IndexReader *ir, char *field)
1312
- {
1313
- uchar *norms;
1314
- mutex_lock(&ir->mutex);
1315
- norms = sr_get_norms_internal(ir, field);
1316
- mutex_unlock(&ir->mutex);
1317
- return norms;
1318
- }
1319
-
1320
- static inline uchar *sr_get_norms_always(IndexReader *ir, char *field)
1321
- {
1322
- GET_SR;
1323
- uchar *bytes;
1324
- mutex_lock(&ir->mutex);
1325
-
1326
- bytes = sr_get_norms_internal(ir, field);
1327
- if (bytes == NULL) {
1328
- if (sr->fake_norms) {
1329
- bytes = sr->fake_norms;
1330
- } else {
1331
- int len = ir->max_doc(ir);
1332
- sr->fake_norms = bytes = ALLOC_N(uchar, len);
1333
- memset(bytes, 0, len);
1334
- }
1335
- }
1336
- mutex_unlock(&ir->mutex);
1337
- return bytes;
1338
- }
1339
-
1340
- void sr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
1341
- {
1342
- GET_SR;
1343
- Norm *norm;
1344
-
1345
- norm = h_get(sr->norms, field);
1346
- if (norm != NULL) { /* an indexed field */
1347
- norm->is_dirty = true; /* mark it dirty */
1348
- sr->norms_dirty = true;
1349
-
1350
- sr_get_norms_internal(ir, field)[doc_num] = val;
1351
- }
1352
- }
1353
-
1354
- int sr_doc_freq(IndexReader *ir, Term *t)
1355
- {
1356
- GET_SR;
1357
- TermInfo *ti = tir_get_ti(sr->tir, t);
1358
- if (ti != NULL) {
1359
- int df = ti->doc_freq;
1360
- ti_destroy(ti);
1361
- return df;
1362
- } else {
1363
- return 0;
1364
- }
1365
- }
1366
-
1367
- Array *sr_file_names(IndexReader *ir)
1368
- {
1369
- GET_SR;
1370
- Array *file_names = ary_create(0, &free);
1371
- FieldInfo *fi;
1372
- int i;
1373
- char fname[SEGMENT_NAME_MAX_LENGTH];
1374
-
1375
- for (i = 0; i < NELEMS(INDEX_EXTENSIONS); i++) {
1376
- sprintf(fname, "%s.%s", sr->segment, INDEX_EXTENSIONS[i]);
1377
- if (ir->store->exists(ir->store, fname))
1378
- ary_append(file_names, estrdup(fname));
1379
- }
1380
-
1381
- for (i = 0; i < sr->fis->fcnt; i++) {
1382
- fi = sr->fis->by_number[i];
1383
- if (fi->is_indexed && !fi->omit_norms) {
1384
- if (sr->cfs_store) {
1385
- sprintf(fname, "%s.s%d", sr->segment, i);
1386
- } else {
1387
- sprintf(fname, "%s.f%d", sr->segment, i);
1388
- }
1389
- if (ir->store->exists(ir->store, fname))
1390
- ary_append(file_names, estrdup(fname));
1391
- }
1392
- }
1393
- return file_names;
1394
- }
1395
-
1396
- HashSet *sr_get_field_names(IndexReader *ir, int field_type)
1397
- {
1398
- GET_SR;
1399
- int i;
1400
- HashSet *field_set = hs_str_create(NULL);
1401
- FieldInfo *fi;
1402
- for (i = 0; i < sr->fis->fcnt; i++) {
1403
- fi = sr->fis->by_number[i];
1404
- switch(field_type) {
1405
- case IR_ALL:
1406
- hs_add(field_set, fi->name);
1407
- break;
1408
- case IR_UNINDEXED:
1409
- if (!fi->is_indexed) hs_add(field_set, fi->name);
1410
- break;
1411
- case IR_INDEXED:
1412
- if (fi->is_indexed) hs_add(field_set, fi->name);
1413
- break;
1414
- case IR_INDEXED_NO_TERM_VECTOR:
1415
- if (fi->is_indexed && !fi->store_tv) hs_add(field_set, fi->name);
1416
- break;
1417
- case IR_TERM_VECTOR:
1418
- if (fi->store_tv && !fi->store_pos && !fi->store_offset)
1419
- hs_add(field_set, fi->name);
1420
- break;
1421
- case IR_INDEXED_WITH_TERM_VECTOR:
1422
- if (fi->is_indexed && fi->store_tv) hs_add(field_set, fi->name);
1423
- break;
1424
- case IR_TERM_VECTOR_WITH_POSITION:
1425
- if (fi->store_pos && !fi->store_offset) hs_add(field_set, fi->name);
1426
- break;
1427
- case IR_TERM_VECTOR_WITH_OFFSET:
1428
- if (!fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1429
- case IR_TERM_VECTOR_WITH_POSITION_OFFSET:
1430
- if (fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1431
- break;
1432
- default:
1433
- RAISE(ARG_ERROR, INVALID_FIELD_TYPE_MSG);
1434
- }
1435
- }
1436
- return field_set;
1437
- }
1438
-
1439
- int sr_num_docs(IndexReader *ir)
1440
- {
1441
- GET_SR;
1442
- int num_docs;
1443
-
1444
- mutex_lock(&ir->mutex);
1445
- num_docs = sr_max_doc(ir);
1446
- if (sr->deleted_docs != NULL)
1447
- num_docs -= sr->deleted_docs->count;
1448
- mutex_unlock(&ir->mutex);
1449
- return num_docs;
1450
- }
1451
-
1452
- TermDocEnum *sr_term_docs(IndexReader *ir)
1453
- {
1454
- return stde_create(ir);
1455
- }
1456
-
1457
- TermDocEnum *sr_term_positions(IndexReader *ir)
1458
- {
1459
- return stpe_create(ir);
1460
- }
1461
-
1462
- void sr_open_norms(IndexReader *ir, Store *cfs_store)
1463
- {
1464
- GET_SR;
1465
- int i;
1466
- FieldInfo *fi;
1467
- Store *tmp_store;
1468
- char fname[SEGMENT_NAME_MAX_LENGTH];
1469
- for (i = 0; i < sr->fis->fcnt; i++) {
1470
- tmp_store = ir->store;
1471
- fi = sr->fis->by_number[i];
1472
- if (fi->is_indexed && !fi->omit_norms) {
1473
- sprintf(fname, "%s.s%d", sr->segment, fi->number);
1474
- if (! tmp_store->exists(tmp_store, fname)) {
1475
- sprintf(fname, "%s.f%d", sr->segment, fi->number);
1476
- tmp_store = cfs_store;
1477
- }
1478
- h_set(sr->norms, fi->name,
1479
- norm_create(tmp_store->open_input(tmp_store, fname), fi->number));
1480
- }
1481
- }
1482
- sr->norms_dirty = false;
1483
- }
1484
-
1485
- TermVector *sr_get_term_vector(IndexReader *ir, int doc_num, char *field)
1486
- {
1487
- GET_SR;
1488
- FieldInfo *fi = (FieldInfo *)ht_get(sr->fis->by_name, field);
1489
- TermVectorsReader *tvr;
1490
-
1491
- if (fi == NULL || !fi->store_tv || !sr->orig_tvr || !(tvr = sr_tvr(sr))) {
1492
- return NULL;
1493
- }
1494
-
1495
- return tvr_get_field_tv(tvr, doc_num, field);
1496
- }
1497
-
1498
- Array *sr_get_term_vectors(IndexReader *ir, int doc_num)
1499
- {
1500
- GET_SR;
1501
- TermVectorsReader *tvr;
1502
- if (sr->orig_tvr == NULL || (tvr = sr_tvr(sr)) == NULL) {
1503
- return NULL;
1504
- }
1505
-
1506
- return tvr_get_tv(tvr, doc_num);
1507
- }
1508
-
1509
- void sr_commit(IndexReader *ir)
1510
- {
1511
- GET_SR;
1512
- char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
1513
- char del_fname[SEGMENT_NAME_MAX_LENGTH];
1514
-
1515
- sprintf(del_fname, "%s.del", sr->segment);
1516
-
1517
- if (sr->deleted_docs_dirty) { /* re-write deleted */
1518
- sprintf(tmp_fname, "%s.tmp", sr->segment);
1519
- bv_write(sr->deleted_docs, ir->store, tmp_fname);
1520
- ir->store->rename(ir->store, tmp_fname, del_fname);
1521
- }
1522
- if (sr->undelete_all && ir->store->exists(ir->store, del_fname))
1523
- ir->store->remove(ir->store, del_fname);
1524
- if (sr->norms_dirty) {/* re-write norms */
1525
- int i;
1526
- FieldInfo *fi;
1527
- for (i = 0; i < sr->fis->fcnt; i++) {
1528
- fi = sr->fis->by_number[i];
1529
- if (fi->is_indexed) {
1530
- norm_rewrite((Norm *)h_get(sr->norms, fi->name), ir->store,
1531
- sr->segment, sr_max_doc(ir), sr->cfs_store);
1532
- }
1533
- }
1534
- }
1535
- sr->deleted_docs_dirty = false;
1536
- sr->norms_dirty = false;
1537
- sr->undelete_all = false;
1538
- }
1539
-
1540
- IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
1541
- {
1542
- Store *store = si->store;
1543
- SegmentReader *sr = ALLOC(SegmentReader);
1544
- char fname[SEGMENT_NAME_MAX_LENGTH];
1545
-
1546
- ir->get_term_vector = &sr_get_term_vector;
1547
- ir->get_term_vectors = &sr_get_term_vectors;
1548
- ir->num_docs = &sr_num_docs;
1549
- ir->max_doc = &sr_max_doc;
1550
- ir->get_doc = &sr_get_doc;
1551
- ir->get_norms_into = &sr_get_norms_into;
1552
- ir->get_norms = &sr_get_norms;
1553
- ir->get_norms_always = &sr_get_norms_always;
1554
- ir->do_set_norm = &sr_set_norm;
1555
- ir->terms = &sr_terms;
1556
- ir->terms_from = &sr_terms_from;
1557
- ir->doc_freq = &sr_doc_freq;
1558
- ir->term_docs = &sr_term_docs;
1559
- ir->term_positions = &sr_term_positions;
1560
- ir->do_delete_doc = &sr_delete_doc;
1561
- ir->is_deleted = &sr_is_deleted;
1562
- ir->has_norms = &sr_has_norms;
1563
- ir->has_deletions = &sr_has_deletions;
1564
- ir->do_undelete_all = &sr_undelete_all;
1565
- ir->get_field_names = &sr_get_field_names;
1566
- ir->do_commit = &sr_commit;
1567
- ir->do_close = &sr_close;
1568
- ir->data = sr;
1569
- sr->segment = estrdup(si->name);
1570
- sr->cfs_store = NULL;
1571
- sr->fake_norms = NULL;
1572
- sprintf(fname, "%s.cfs", sr->segment);
1573
- if (store->exists(store, fname)) {
1574
- sr->cfs_store = open_cmpd_store(store, fname);
1575
- store = sr->cfs_store;
1576
- }
1577
-
1578
- sprintf(fname, "%s.fnm", sr->segment);
1579
-
1580
- sr->fis = fis_open(store, fname);
1581
- sr->fr = fr_open(store, sr->segment, sr->fis);
1582
-
1583
- sr->tir = tir_open(store, sr->segment, sr->fis);
1584
- sr->deleted_docs = NULL;
1585
- sr->deleted_docs_dirty = false;
1586
- sr->undelete_all = false;
1587
- if (si_has_deletions(si)) {
1588
- sprintf(fname, "%s.del", sr->segment);
1589
- sr->deleted_docs = bv_read(si->store, fname);
1590
- }
1591
-
1592
- sprintf(fname, "%s.frq", sr->segment);
1593
- sr->freq_in = store->open_input(store, fname);
1594
- sprintf(fname, "%s.prx", sr->segment);
1595
- sr->prox_in = store->open_input(store, fname);
1596
- sr->norms = h_new_str((free_ft)NULL, (free_ft)&norm_destroy);
1597
- sr_open_norms(ir, store);
1598
-
1599
- if (fis_has_vectors(sr->fis)) {
1600
- sr->orig_tvr = tvr_open(store, sr->segment, sr->fis);
1601
- thread_key_create(&sr->thread_tvr, NULL);
1602
- sr->tvr_bucket = ary_create(1, (free_ft)&tvr_close);
1603
- } else {
1604
- sr->orig_tvr = NULL;
1605
- }
1606
- return ir;
1607
- }
1608
-
1609
- IndexReader *sr_open_si(SegmentInfo *si)
1610
- {
1611
- IndexReader *ir = ir_create(si->store, NULL, false);
1612
- ref(si->store);
1613
- return sr_open_internal(ir, si);
1614
- }
1615
-
1616
- IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner)
1617
- {
1618
- SegmentInfo *si = sis->segs[si_num];
1619
- IndexReader *ir = ir_create(si->store, sis, is_owner);
1620
- ref(si->store);
1621
- return sr_open_internal(ir, si);
1622
- }
1623
-
1624
- /****************************************************************************
1625
- *
1626
- * MultiReader
1627
- *
1628
- ****************************************************************************/
1629
-
1630
- #define GET_MR MultiReader *mr = (MultiReader *)ir->data
1631
- #define GET_READER(doc_num) MultiReader *mr = (MultiReader *)ir->data;\
1632
- int i = mr_reader_index(mr, doc_num);\
1633
- IndexReader *reader = mr->sub_readers[i]
1634
-
1635
-
1636
-
1637
- int mr_reader_index(MultiReader *mr, int doc_num)
1638
- {
1639
- int lo = 0; /* search @starts array */
1640
- int hi = mr->rcnt - 1; /* for first element less */
1641
- int mid;
1642
- int mid_value;
1643
-
1644
- while (hi >= lo) {
1645
- mid = (lo + hi) >> 1;
1646
- mid_value = mr->starts[mid];
1647
- if (doc_num < mid_value) {
1648
- hi = mid - 1;
1649
- } else if (doc_num > mid_value) {
1650
- lo = mid + 1;
1651
- } else { /* found a match */
1652
- while ((mid+1 < mr->rcnt) && (mr->starts[mid+1] == mid_value))
1653
- mid += 1; /* scan to last match in case we have empty segments */
1654
- return mid;
1655
- }
1656
- }
1657
- return hi;
1658
- }
1659
-
1660
- TermVector *mr_get_term_vector(IndexReader *ir, int doc_num, char *field)
1661
- {
1662
- GET_READER(doc_num);
1663
- return reader->get_term_vector(reader, doc_num - mr->starts[i], field);
1664
- }
1665
-
1666
- Array *mr_get_term_vectors(IndexReader *ir, int doc_num)
1667
- {
1668
- GET_READER(doc_num);
1669
- return reader->get_term_vectors(reader, doc_num - mr->starts[i]);
1670
- }
1671
-
1672
- int mr_num_docs(IndexReader *ir)
1673
- {
1674
- int i, num_docs;
1675
- GET_MR;
1676
- mutex_lock(&ir->mutex);
1677
- if (mr->num_docs_cache == -1) {
1678
- IndexReader *reader;
1679
- mr->num_docs_cache = 0;
1680
- for (i = 0; i < mr->rcnt; i++) {
1681
- reader = mr->sub_readers[i];
1682
- mr->num_docs_cache += reader->num_docs(reader);
1683
- }
1684
- }
1685
- num_docs = mr->num_docs_cache;
1686
- mutex_unlock(&ir->mutex);
1687
-
1688
- return num_docs;
1689
- }
1690
-
1691
- int mr_max_doc(IndexReader *ir)
1692
- {
1693
- GET_MR;
1694
- return mr->max_doc;
1695
- }
1696
-
1697
- Document *mr_get_doc(IndexReader *ir, int doc_num)
1698
- {
1699
- GET_READER(doc_num);
1700
- return reader->get_doc(reader, doc_num - mr->starts[i]);
1701
- }
1702
-
1703
- void mr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
1704
- {
1705
- int i;
1706
- uchar *bytes;
1707
- GET_MR;
1708
-
1709
- mutex_lock(&ir->mutex);
1710
- bytes = h_get(mr->norms_cache, field);
1711
- if (bytes != NULL) {
1712
- memcpy(buf + offset, bytes, mr->max_doc);
1713
- } else {
1714
- IndexReader *reader;
1715
- for (i = 0; i < mr->rcnt; i++) {
1716
- reader = mr->sub_readers[i];
1717
- reader->get_norms_into(reader, field, buf, offset + mr->starts[i]);
1718
- }
1719
- }
1720
- mutex_unlock(&ir->mutex);
1721
- }
1722
-
1723
- uchar *mr_get_norms(IndexReader *ir, char *field)
1724
- {
1725
- int i;
1726
- GET_MR;
1727
- uchar *bytes;
1728
- IndexReader *reader;
1729
-
1730
- mutex_lock(&ir->mutex);
1731
- bytes = h_get(mr->norms_cache, field);
1732
- if (bytes == NULL) {
1733
- bytes = ALLOC_N(uchar, mr->max_doc);
1734
-
1735
- for (i = 0; i < mr->rcnt; i++) {
1736
- reader = mr->sub_readers[i];
1737
- reader->get_norms_into(reader, field, bytes, mr->starts[i]);
1738
- }
1739
- h_set(mr->norms_cache, field, bytes); /* update cache */
1740
- }
1741
- mutex_unlock(&ir->mutex);
1742
-
1743
- return bytes;
1744
- }
1745
-
1746
- void mr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
1747
- {
1748
- GET_READER(doc_num);
1749
- h_del(mr->norms_cache, field); /* clear cache */
1750
- ir_set_norm(reader, doc_num - mr->starts[i], field, val);
1751
- }
1752
-
1753
- TermEnum *mr_terms(IndexReader *ir)
1754
- {
1755
- GET_MR;
1756
- return mte_create(mr->sub_readers, mr->starts, mr->rcnt, NULL);
1757
- }
1758
-
1759
- TermEnum *mr_terms_from(IndexReader *ir, Term *term)
1760
- {
1761
- GET_MR;
1762
- return mte_create(mr->sub_readers, mr->starts, mr->rcnt, term);
1763
- }
1764
-
1765
- int mr_doc_freq(IndexReader *ir, Term *t)
1766
- {
1767
- int total = 0, i; /* sum freqs in segments */
1768
- GET_MR;
1769
-
1770
- IndexReader *reader;
1771
- for (i = 0; i < mr->rcnt; i++) {
1772
- reader = mr->sub_readers[i];
1773
- total += reader->doc_freq(reader, t);
1774
- }
1775
- return total;
1776
- }
1777
-
1778
- TermDocEnum *mr_term_docs(IndexReader *ir)
1779
- {
1780
- GET_MR;
1781
- return mtde_create(mr->sub_readers, mr->starts, mr->rcnt);
1782
- }
1783
-
1784
- TermDocEnum *mr_term_positions(IndexReader *ir)
1785
- {
1786
- GET_MR;
1787
- return mtpe_create(mr->sub_readers, mr->starts, mr->rcnt);
1788
- }
1789
-
1790
- void mr_delete_doc(IndexReader *ir, int doc_num)
1791
- {
1792
- GET_READER(doc_num);
1793
- mr->num_docs_cache = -1; /* invalidate cache */
1794
-
1795
- /* dispatch to segment reader */
1796
- reader->do_delete_doc(reader, doc_num - mr->starts[i]);
1797
- mr->has_deletions = true;
1798
- }
1799
-
1800
- bool mr_is_deleted(IndexReader *ir, int doc_num)
1801
- {
1802
- GET_READER(doc_num);
1803
- return reader->is_deleted(reader, doc_num - mr->starts[i]);
1804
- }
1805
-
1806
- bool mr_has_norms(IndexReader *ir, char *field)
1807
- {
1808
- bool has_norms = false;
1809
- int i;
1810
- GET_MR;
1811
-
1812
- IndexReader *reader;
1813
- for (i = 0; i < mr->rcnt; i++) {
1814
- reader = mr->sub_readers[i];
1815
- if (reader->has_norms(reader, field)) {
1816
- has_norms = true;
1817
- break;
1818
- }
1819
- }
1820
-
1821
- return has_norms;
1822
- }
1823
-
1824
- bool mr_has_deletions(IndexReader *ir)
1825
- {
1826
- GET_MR;
1827
- return mr->has_deletions;
1828
- }
1829
-
1830
- void mr_undelete_all(IndexReader *ir)
1831
- {
1832
- int i;
1833
- GET_MR;
1834
- IndexReader *reader;
1835
-
1836
- mr->num_docs_cache = -1; /* invalidate cache */
1837
- for (i = 0; i < mr->rcnt; i++) {
1838
- reader = mr->sub_readers[i];
1839
- reader->do_undelete_all(reader);
1840
- }
1841
- mr->has_deletions = false;
1842
- }
1843
-
1844
- HashSet *mr_get_field_names(IndexReader *ir, int field_type)
1845
- {
1846
- int i;
1847
- GET_MR;
1848
- HashSet *field_set = hs_str_create(NULL);
1849
- IndexReader *reader;
1850
- for (i = 0; i < mr->rcnt; i++) {
1851
- reader = mr->sub_readers[i];
1852
- hs_merge(field_set, reader->get_field_names(reader, field_type));
1853
- }
1854
- return field_set;
1855
- }
1856
-
1857
- void mr_commit(IndexReader *ir)
1858
- {
1859
- GET_MR;
1860
- int i;
1861
- IndexReader *reader;
1862
- for (i = 0; i < mr->rcnt; i++) {
1863
- reader = mr->sub_readers[i];
1864
- reader->do_commit(reader);
1865
- }
1866
- }
1867
-
1868
- void mr_close(IndexReader *ir)
1869
- {
1870
- GET_MR;
1871
- int i;
1872
- IndexReader *reader;
1873
- for (i = 0; i < mr->rcnt; i++) {
1874
- reader = mr->sub_readers[i];
1875
- ir_close(reader);
1876
- }
1877
- free(mr->sub_readers);
1878
- h_destroy(mr->norms_cache);
1879
- free(mr->starts);
1880
- free(mr);
1881
- }
1882
-
1883
- IndexReader *mr_open(Store *store,
1884
- SegmentInfos *sis,
1885
- IndexReader **sub_readers,
1886
- int rcnt)
1887
- {
1888
- int i;
1889
- MultiReader *mr = ALLOC(MultiReader);
1890
- IndexReader *sub_reader;
1891
- IndexReader *ir;
1892
- mr->sub_readers = sub_readers;
1893
- mr->rcnt = rcnt;
1894
-
1895
- mr->max_doc = 0;
1896
- mr->num_docs_cache = -1;
1897
- mr->has_deletions = false;
1898
-
1899
- mr->starts = ALLOC_N(int, (rcnt+1));
1900
- for (i = 0; i < rcnt; i++) {
1901
- sub_reader = sub_readers[i];
1902
- mr->starts[i] = mr->max_doc;
1903
- mr->max_doc += sub_reader->max_doc(sub_reader); /* compute max_docs */
1904
-
1905
- if (sub_reader->has_deletions(sub_reader)) {
1906
- mr->has_deletions = true;
1907
- }
1908
- }
1909
- mr->starts[rcnt] = mr->max_doc;
1910
- mr->norms_cache = h_new_str(NULL, &free);
1911
-
1912
- ir = ir_create(store, sis, true);
1913
- ir->get_term_vector = &mr_get_term_vector;
1914
- ir->get_term_vectors = &mr_get_term_vectors;
1915
- ir->num_docs = &mr_num_docs;
1916
- ir->max_doc = &mr_max_doc;
1917
- ir->get_doc = &mr_get_doc;
1918
- ir->get_norms_into = &mr_get_norms_into;
1919
- ir->get_norms = &mr_get_norms;
1920
- ir->get_norms_always = &mr_get_norms;
1921
- ir->do_set_norm = &mr_set_norm;
1922
- ir->terms = &mr_terms;
1923
- ir->terms_from = &mr_terms_from;
1924
- ir->doc_freq = &mr_doc_freq;
1925
- ir->term_docs = &mr_term_docs;
1926
- ir->term_positions = &mr_term_positions;
1927
- ir->do_delete_doc = &mr_delete_doc;
1928
- ir->is_deleted = &mr_is_deleted;
1929
- ir->has_norms = &mr_has_norms;
1930
- ir->has_deletions = &mr_has_deletions;
1931
- ir->do_undelete_all = &mr_undelete_all;
1932
- ir->get_field_names = &mr_get_field_names;
1933
- ir->do_commit = &mr_commit;
1934
- ir->do_close = &mr_close;
1935
- ir->data = mr;
1936
-
1937
- return ir;
1938
- }
1939
-
1940
- /****************************************************************************
1941
- *
1942
- * SegmentMergeInfo
1943
- *
1944
- ****************************************************************************/
1945
-
1946
- bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2)
1947
- {
1948
- int cmpres = tb_cmp(smi1->tb, smi2->tb);
1949
- if (cmpres == 0) {
1950
- return smi1->base < smi2->base;
1951
- } else {
1952
- return cmpres < 0;
1953
- }
1954
- }
1955
-
1956
- int *smi_load_doc_map(SegmentMergeInfo *smi)
1957
- {
1958
- IndexReader *ir = smi->ir;
1959
- if (ir->has_deletions(ir) && (smi->doc_map == NULL)) {
1960
- int max_doc = ir->max_doc(ir);
1961
- int j = 0, i;
1962
-
1963
- smi->doc_map = ALLOC_N(int, max_doc);
1964
- for (i = 0; i < max_doc; i++) {
1965
- if (ir->is_deleted(ir, i)) {
1966
- smi->doc_map[i] = -1;
1967
- } else {
1968
- smi->doc_map[i] = j++;
1969
- }
1970
- }
1971
- }
1972
- return smi->doc_map;
1973
- }
1974
-
1975
- SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir)
1976
- {
1977
- SegmentMergeInfo *smi = ALLOC(SegmentMergeInfo);
1978
- smi->base = base;
1979
- smi->ir = ir;
1980
- smi->te = te;
1981
- smi->tb = te->tb_curr;
1982
- smi->postings = ir->term_positions(ir);
1983
- smi->doc_map = NULL;
1984
- return smi;
1985
- }
1986
-
1987
- void smi_destroy(SegmentMergeInfo *smi)
1988
- {
1989
- smi->postings->close(smi->postings);
1990
- smi->te->close(smi->te);
1991
- if (smi->doc_map != NULL)
1992
- free(smi->doc_map);
1993
- free(smi);
1994
- }
1995
-
1996
- TermBuffer *smi_next(SegmentMergeInfo *smi)
1997
- {
1998
- return (smi->tb = smi->te->next(smi->te));
1999
- }
2000
-
2001
- /****************************************************************************
2002
- *
2003
- * SegmentMerger
2004
- *
2005
- ****************************************************************************/
2006
-
2007
- SegmentMerger *sm_create(Store *store, char *name, int term_index_interval)
2008
- {
2009
- SegmentMerger *sm = ALLOC(SegmentMerger);
2010
- sm->store = store;
2011
- sm->name = estrdup(name);
2012
- sm->readers = ary_create(config.merge_factor, (free_ft)&ir_close);
2013
- sm->fis = NULL;
2014
- sm->freq_out = NULL;
2015
- sm->prox_out = NULL;
2016
- sm->tiw = NULL;
2017
- sm->queue = NULL;
2018
- sm->ti = ti_create(0, 0, 0, 0);
2019
- sm->term_index_interval = term_index_interval;
2020
- sm->skip_buffer = ram_create_buffer();
2021
- sm->skip_interval = -1;
2022
- return sm;
2023
- }
2024
-
2025
- void sm_close(SegmentMerger *sm)
2026
- {
2027
- int i;
2028
- if (sm->freq_out != NULL) os_close(sm->freq_out);
2029
- if (sm->prox_out != NULL) os_close(sm->prox_out);
2030
- if (sm->tiw != NULL) {
2031
- for (i = 0; i < sm->terms_buf_size; i++) {
2032
- free(sm->terms_buf[i].text);
2033
- }
2034
- free(sm->terms_buf);
2035
- tiw_close(sm->tiw);
2036
- }
2037
- if (sm->queue != NULL) pq_destroy(sm->queue);
2038
- sm->freq_out = NULL;
2039
- sm->prox_out = NULL;
2040
- sm->tiw = NULL;
2041
- sm->queue = NULL;
2042
- }
2043
-
2044
- void sm_destroy(SegmentMerger *sm)
2045
- {
2046
- if (sm->fis != NULL) fis_destroy(sm->fis);
2047
- ary_destroy(sm->readers);
2048
- sm_close(sm);
2049
- free(sm->name);
2050
- ti_destroy(sm->ti);
2051
- ram_destroy_buffer(sm->skip_buffer);
2052
- free(sm);
2053
- }
2054
-
2055
- void sm_add(SegmentMerger *sm, IndexReader *ir)
2056
- {
2057
- ary_append(sm->readers, ir);
2058
- }
2059
-
2060
- static inline void sm_add_indexed(IndexReader *ir,
2061
- FieldInfos *fis,
2062
- HashSet *fields,
2063
- bool store_tv,
2064
- bool store_pos,
2065
- bool store_offset)
2066
- {
2067
- int i;
2068
- char *field;
2069
- for (i = 0; i < fields->size; i++) {
2070
- field = (char *)fields->elems[i];
2071
- fis_add(fis, field, true, store_tv, store_pos, store_offset,
2072
- !ir->has_norms(ir, field));
2073
- }
2074
- hs_destroy(fields);
2075
- }
2076
-
2077
- int sm_merge_fields(SegmentMerger *sm)
2078
- {
2079
- int i, j, maxdoc;
2080
- FieldInfos *fis = sm->fis = fis_create();
2081
- int doc_count = 0;
2082
- Document *doc;
2083
- FieldsWriter *fw;
2084
-
2085
- for (i = 0; i < sm->readers->size; i++) {
2086
- IndexReader *ir = sm->readers->elems[i];
2087
-
2088
- sm_add_indexed(ir, fis,
2089
- ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION_OFFSET),
2090
- true, true, true);
2091
- sm_add_indexed(ir, fis,
2092
- ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION),
2093
- true, true, false);
2094
- sm_add_indexed(ir, fis,
2095
- ir->get_field_names(ir, IR_TERM_VECTOR_WITH_OFFSET),
2096
- true, false, true);
2097
- sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_TERM_VECTOR),
2098
- true, false, false);
2099
- sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_INDEXED),
2100
- false, false, false);
2101
- fis_add_fields(fis, ir->get_field_names(ir, IR_UNINDEXED),
2102
- false, false, false, false, false);
2103
- }
2104
- fis_write(fis, sm->store, sm->name, ".fnm");
2105
-
2106
- /* merge field values */
2107
- fw = fw_open(sm->store, sm->name, fis);
2108
-
2109
- TRY
2110
- for (i = 0; i < sm->readers->size; i++) {
2111
- IndexReader *ir = sm->readers->elems[i];
2112
- maxdoc = ir->max_doc(ir);
2113
- for (j = 0; j < maxdoc; j++) {
2114
- if (!ir->is_deleted(ir, j)) { /* skip deleted docs */
2115
- doc = ir->get_doc(ir, j);
2116
- fw_add_doc(fw, doc);
2117
- doc_destroy(doc);
2118
- doc_count++;
2119
- }
2120
- }
2121
- }
2122
- XFINALLY
2123
- fw_close(fw);
2124
- XENDTRY
2125
- return doc_count;
2126
- }
2127
-
2128
- void sm_reset_skip(SegmentMerger *sm)
2129
- {
2130
- ramo_reset(sm->skip_buffer);
2131
- sm->last_skip_doc = 0;
2132
- sm->last_skip_freq_pointer = os_pos(sm->freq_out);
2133
- sm->last_skip_prox_pointer = os_pos(sm->prox_out);
2134
- }
2135
-
2136
- inline void sm_buffer_skip(SegmentMerger *sm, int doc)
2137
- {
2138
- int freq_pointer = os_pos(sm->freq_out);
2139
- int prox_pointer = os_pos(sm->prox_out);
2140
-
2141
- os_write_vint(sm->skip_buffer, doc - sm->last_skip_doc);
2142
- os_write_vint(sm->skip_buffer, freq_pointer - sm->last_skip_freq_pointer);
2143
- os_write_vint(sm->skip_buffer, prox_pointer - sm->last_skip_prox_pointer);
2144
-
2145
- sm->last_skip_doc = doc;
2146
- sm->last_skip_freq_pointer = freq_pointer;
2147
- sm->last_skip_prox_pointer = prox_pointer;
2148
- }
2149
-
2150
- int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2151
- {
2152
- int i, j;
2153
- int last_doc = 0, base, doc, doc_code, freq, last_position, position;
2154
- int *doc_map = NULL;
2155
- int df = 0; /* number of docs w/ term */
2156
- TermDocEnum *postings;
2157
- SegmentMergeInfo *smi;
2158
- sm_reset_skip(sm);
2159
- for (i = 0; i < cnt; i++) {
2160
- smi = smis[i];
2161
- postings = smi->postings;
2162
- base = smi->base;
2163
- doc_map = smi_load_doc_map(smi);
2164
-
2165
- stde_seek_ti(postings, smi->te->ti_curr);
2166
- while (postings->next(postings)) {
2167
- doc = postings->doc_num(postings);
2168
- if (doc_map != NULL) {
2169
- doc = doc_map[doc]; /* work around deletions */
2170
- }
2171
- doc += base; /* convert to merged space */
2172
-
2173
- if (doc < last_doc) {
2174
- RAISE(STATE_ERROR, DOC_ORDER_ERROR_MSG);
2175
- }
2176
-
2177
- df++;
2178
-
2179
- if ((df % sm->skip_interval) == 0) {
2180
- sm_buffer_skip(sm, last_doc);
2181
- }
2182
-
2183
- doc_code = (doc - last_doc) << 1; /* use low bit to flag freq=1 */
2184
- last_doc = doc;
2185
-
2186
- freq = postings->freq(postings);
2187
- if (freq == 1) {
2188
- os_write_vint(sm->freq_out, doc_code | 1); /* write doc & freq=1 */
2189
- } else {
2190
- os_write_vint(sm->freq_out, doc_code); /* write doc */
2191
- os_write_vint(sm->freq_out, freq); /* write freqency in doc */
2192
- }
2193
-
2194
-
2195
- last_position = 0; /* write position deltas */
2196
- for (j = 0; j < freq; j++) {
2197
- position = postings->next_position(postings);
2198
- os_write_vint(sm->prox_out, position - last_position);
2199
- last_position = position;
2200
- }
2201
- }
2202
- }
2203
- return df;
2204
- }
2205
-
2206
- int sm_write_skip(SegmentMerger *sm)
2207
- {
2208
- int skip_pointer = os_pos(sm->freq_out);
2209
- ramo_write_to(sm->skip_buffer, sm->freq_out);
2210
- return skip_pointer;
2211
- }
2212
-
2213
- Term *sm_tb_to_term(SegmentMerger *sm, TermBuffer *tb)
2214
- {
2215
- int index = sm->terms_buf_pointer % sm->terms_buf_size;
2216
- sm->terms_buf_pointer++;
2217
- sm->terms_buf[index].field = tb->field;
2218
- strcpy(sm->terms_buf[index].text, tb->text);
2219
- return &(sm->terms_buf[index]);
2220
- }
2221
-
2222
- void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2223
- {
2224
- int freq_pointer = os_pos(sm->freq_out);
2225
- int prox_pointer = os_pos(sm->prox_out);
2226
-
2227
- int df = sm_append_postings(sm, smis, cnt); /* append posting data */
2228
-
2229
- int skip_pointer = sm_write_skip(sm);
2230
-
2231
- if (df > 0) {
2232
- /* add an entry to the dictionary with pointers to prox and freq files */
2233
- ti_set(sm->ti, df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer));
2234
- tiw_add(sm->tiw, sm_tb_to_term(sm, smis[0]->tb), sm->ti);
2235
- }
2236
- }
2237
-
2238
- void sm_merge_term_infos(SegmentMerger *sm)
2239
- {
2240
- int base = 0;
2241
- int i, match_size;
2242
- IndexReader *ir;
2243
- TermEnum *te;
2244
- SegmentMergeInfo *smi, *top, **match;
2245
- TermBuffer *tb;
2246
-
2247
- for (i = 0; i < sm->readers->size; i++) {
2248
- ir = sm->readers->elems[i];
2249
- te = ir->terms(ir);
2250
- smi = smi_create(base, te, ir);
2251
- base += ir->num_docs(ir);
2252
- if (smi_next(smi) != NULL) {
2253
- pq_push(sm->queue, smi); /* initialize @queue */
2254
- } else {
2255
- smi_destroy(smi);
2256
- }
2257
- }
2258
-
2259
- match = ALLOC_N(SegmentMergeInfo *, sm->readers->size);
2260
-
2261
- while (sm->queue->count > 0) {
2262
- /*
2263
- for (i = 1; i <= sm->queue->count; i++) {
2264
- printf("<{%s:%s}>", ((SegmentMergeInfo *)sm->queue->heap[i])->tb->field,
2265
- ((SegmentMergeInfo *)sm->queue->heap[i])->tb->text);
2266
- }printf("\n\n");
2267
- */
2268
- match_size = 0; /* pop matching terms */
2269
- match[match_size] = pq_pop(sm->queue);
2270
- match_size++;
2271
- tb = match[0]->tb;
2272
- top = pq_top(sm->queue);
2273
- while ((top != NULL) && (tb_cmp(tb, top->tb) == 0)) {
2274
- match[match_size] = pq_pop(sm->queue);
2275
- match_size++;
2276
- top = pq_top(sm->queue);
2277
- }
2278
-
2279
- /* printf(">%s:%s<\n", match[0]->tb->field, match[0]->tb->text); */
2280
- sm_merge_term_info(sm, match, match_size); /* add new TermInfo */
2281
-
2282
- while (match_size > 0) {
2283
- match_size--;
2284
- smi = match[match_size];
2285
- if (smi_next(smi) != NULL) {
2286
- pq_push(sm->queue, smi); /* restore queue */
2287
- } else {
2288
- smi_destroy(smi); /* done with a segment */
2289
- }
2290
- }
2291
- }
2292
- free(match);
2293
- }
2294
-
2295
- void sm_merge_terms(SegmentMerger *sm)
2296
- {
2297
- int i;
2298
- char fname[SEGMENT_NAME_MAX_LENGTH];
2299
-
2300
- TRY
2301
- sprintf(fname, "%s.frq", sm->name);
2302
- sm->freq_out = sm->store->create_output(sm->store, fname);
2303
- sprintf(fname, "%s.prx", sm->name);
2304
- sm->prox_out = sm->store->create_output(sm->store, fname);
2305
- sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
2306
- /* terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
2307
- * to keep the last index_interval terms so that it can compare the last term
2308
- * put in the index with the next one. So the size of the buffer must by
2309
- * index_interval + 2. */
2310
- sm->terms_buf_pointer = 0;
2311
- sm->terms_buf_size = sm->tiw->index_interval + 2;
2312
- sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
2313
- for (i = 0; i < sm->terms_buf_size; i++) {
2314
- sm->terms_buf[i].field = NULL;
2315
- sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
2316
- }
2317
- sm->skip_interval = sm->tiw->skip_interval;
2318
- sm->queue = pq_create(sm->readers->size, (lt_ft)&smi_lt);
2319
-
2320
- sm_merge_term_infos(sm);
2321
-
2322
- XFINALLY
2323
- sm_close(sm);
2324
- XENDTRY
2325
- }
2326
-
2327
- void sm_merge_norms(SegmentMerger *sm)
2328
- {
2329
- int i, j, k, max_doc;
2330
- uchar *norm_buf;
2331
- FieldInfo *fi;
2332
- OutStream *os;
2333
- char fname[SEGMENT_NAME_MAX_LENGTH];
2334
- IndexReader *ir;
2335
- for (i = 0; i < sm->fis->fcnt; i++) {
2336
- fi = sm->fis->by_number[i];
2337
- if (fi->is_indexed && !fi->omit_norms) {
2338
- sprintf(fname, "%s.f%d", sm->name, i);
2339
- os = sm->store->create_output(sm->store, fname);
2340
- TRY
2341
- for (j = 0; j < sm->readers->size; j++) {
2342
- ir = sm->readers->elems[j];
2343
- max_doc = ir->max_doc(ir);
2344
- norm_buf = ALLOC_N(uchar, max_doc);
2345
- memset(norm_buf, 0, sizeof(uchar) * max_doc);
2346
- ir->get_norms_into(ir, fi->name, norm_buf, 0);
2347
- for (k = 0; k < max_doc; k++) {
2348
- if (!ir->is_deleted(ir, k)) {
2349
- os_write_byte(os, norm_buf[k]);
2350
- }
2351
- }
2352
- free(norm_buf);
2353
- }
2354
- XFINALLY
2355
- os_close(os);
2356
- XENDTRY
2357
- }
2358
- }
2359
- }
2360
-
2361
- void sm_merge_vectors(SegmentMerger *sm)
2362
- {
2363
- int i, j, max_doc;
2364
- TermVectorsWriter *tvw = tvw_open(sm->store, sm->name, sm->fis);
2365
- IndexReader *ir;
2366
- Array *tvs;
2367
- TRY
2368
- for (i = 0; i < sm->readers->size; i++) {
2369
- ir = sm->readers->elems[i];
2370
- max_doc = ir->max_doc(ir);
2371
- for (j = 0; j < max_doc; j++) {
2372
- /* skip deleted docs */
2373
- if (! ir->is_deleted(ir, j)) {
2374
- tvs = ir->get_term_vectors(ir, j);
2375
- if (tvs) {
2376
- tvw_add_all_doc_vectors(tvw, tvs);
2377
- ary_destroy(tvs);
2378
- }
2379
- }
2380
- }
2381
- }
2382
- XFINALLY
2383
- tvw_close(tvw);
2384
- XENDTRY
2385
- }
2386
-
2387
- int sm_merge(SegmentMerger *sm)
2388
- {
2389
- int doc_count = sm_merge_fields(sm);
2390
- sm_merge_terms(sm);
2391
- sm_merge_norms(sm);
2392
- if (fis_has_vectors(sm->fis))
2393
- sm_merge_vectors(sm);
2394
- return doc_count;
2395
- }
2396
-
2397
- Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
2398
- {
2399
- Array *files = ary_create(0, &free);
2400
- CompoundWriter *cw = open_cw(sm->store, file_name);
2401
- FieldInfo *fi;
2402
- char fname[SEGMENT_NAME_MAX_LENGTH];
2403
-
2404
- int i;
2405
- for (i = 0; i < NELEMS(COMPOUND_EXTENSIONS); i++) {
2406
- sprintf(fname, "%s.%s", sm->name, COMPOUND_EXTENSIONS[i]);
2407
- ary_append(files, estrdup(fname));
2408
- }
2409
-
2410
- /* Field norm files */
2411
- for (i = 0; i < sm->fis->fcnt; i++) {
2412
- fi = sm->fis->by_number[i];
2413
- if (fi->is_indexed && !fi->omit_norms) {
2414
- sprintf(fname, "%s.f%d", sm->name, i);
2415
- ary_append(files, estrdup(fname));
2416
- }
2417
- }
2418
-
2419
- /* Vector files */
2420
- if (fis_has_vectors(sm->fis)) {
2421
- for (i = 0; i < NELEMS(VECTOR_EXTENSIONS); i++) {
2422
- sprintf(fname, "%s.%s", sm->name, VECTOR_EXTENSIONS[i]);
2423
- ary_append(files, estrdup(fname));
2424
- }
2425
- }
2426
-
2427
- /* Now merge all added files */
2428
- for (i = 0; i < files->size; i++) {
2429
- cw_add_file(cw, (char *)files->elems[i]);
2430
- }
2431
-
2432
- /* Perform the merge */
2433
- cw_close(cw);
2434
-
2435
- return files;
2436
- }
2437
-
2438
- /****************************************************************************
2439
- *
2440
- * IndexReader
2441
- *
2442
- ****************************************************************************/
2443
-
2444
- void ir_acquire_not_necessary(IndexReader *ir) {}
2445
- void ir_acquire_write_lock(IndexReader *ir)
2446
- {
2447
- if (ir->is_stale)
2448
- RAISE(STATE_ERROR, STALE_READER_ERROR_MSG);
2449
-
2450
- if (ir->write_lock == NULL) {
2451
- ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
2452
- if (!ir->write_lock->obtain(ir->write_lock)) /* obtain write lock */
2453
- RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
2454
-
2455
- /* we have to check whether index has changed since this reader was opened.
2456
- * if so, this reader is no longer valid for deletion */
2457
- if (sis_read_current_version(ir->store) > ir->sis->version) {
2458
- ir->is_stale = true;
2459
- ir->write_lock->release(ir->write_lock);
2460
- ir->store->close_lock(ir->write_lock);
2461
- ir->write_lock = NULL;
2462
- RAISE(STATE_ERROR, STALE_READER_ERROR_MSG);
2463
- }
2464
- }
2465
- }
2466
-
2467
- IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner)
2468
- {
2469
- IndexReader *ir = ALLOC(IndexReader);
2470
-
2471
- mutex_init(&ir->mutex, NULL);
2472
- ir->is_owner = is_owner;
2473
- if (is_owner) {
2474
- ir->acquire_write_lock = &ir_acquire_write_lock;
2475
- } else {
2476
- ir->acquire_write_lock = &ir_acquire_not_necessary;
2477
- }
2478
-
2479
- ir->store = store;
2480
- ir->sis = sis;
2481
- ir->has_changes = false;
2482
- ir->is_stale = false;
2483
- ir->write_lock = NULL;
2484
- ir->cache = NULL;
2485
- ir->sort_cache = NULL;
2486
- return ir;
2487
- }
2488
-
2489
- /**
2490
- * Will keep a reference to the store. To let this method delete the store
2491
- * make sure you deref the store that you pass to it
2492
- */
2493
- IndexReader *ir_open(Store *store)
2494
- {
2495
- int i;
2496
- IndexReader *ir;
2497
- SegmentInfos *sis;
2498
-
2499
- mutex_lock(&store->mutex);
2500
- sis = sis_create();
2501
- sis_read(sis, store);
2502
- if (sis->scnt == 1) {
2503
- ir = sr_open(sis, 0, true);
2504
- } else {
2505
- IndexReader **readers = ALLOC_N(IndexReader *, sis->scnt);
2506
- for (i = 0; i < sis->scnt; i++) {
2507
- readers[i] = sr_open(sis, i, false);
2508
- }
2509
- ref(store);
2510
- ir = mr_open(store, sis, readers, sis->scnt);
2511
- }
2512
- mutex_unlock(&store->mutex);
2513
- return ir;
2514
- }
2515
-
2516
/* Return true iff `store` contains an index, i.e. a "segments" file. */
bool ir_index_exists(Store *store)
{
    return store->exists(store, "segments");
}
/* Set the norm byte of `field` for document `doc_num` to `val`.
 * Runs under the reader's mutex and (for owning readers) the index write
 * lock -- acquire_write_lock raises STATE_ERROR on a stale reader or
 * unobtainable lock.  Marks the reader as having uncommitted changes. */
void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
{
    mutex_lock(&ir->mutex);
    ir->acquire_write_lock(ir);
    ir->do_set_norm(ir, doc_num, field, val);
    ir->has_changes = true;
    mutex_unlock(&ir->mutex);
}
/* Undo all uncommitted deletions in this reader.
 * Runs under the reader's mutex and (for owning readers) the index write
 * lock; marks the reader as having uncommitted changes. */
void ir_undelete_all(IndexReader *ir)
{
    mutex_lock(&ir->mutex);
    ir->acquire_write_lock(ir);
    ir->do_undelete_all(ir);
    ir->has_changes = true;
    mutex_unlock(&ir->mutex);
}
/* Mark document `doc_num` as deleted.  The deletion only becomes visible
 * to other readers once the change is committed (ir_commit/ir_close).
 * Runs under the reader's mutex and (for owning readers) the index write
 * lock; marks the reader as having uncommitted changes. */
void ir_delete_doc(IndexReader *ir, int doc_num)
{
    mutex_lock(&ir->mutex);
    ir->acquire_write_lock(ir);
    ir->do_delete_doc(ir, doc_num);
    ir->has_changes = true;
    mutex_unlock(&ir->mutex);
}
- Document *ir_get_doc_with_term(IndexReader *ir, Term *term)
2549
- {
2550
- TermDocEnum *tde = ir_term_docs_for(ir, term);
2551
- Document *doc = NULL;
2552
-
2553
- if (!tde) return NULL;
2554
-
2555
- if (tde->next(tde)) {
2556
- doc = ir->get_doc(ir, tde->doc_num(tde));
2557
- }
2558
- tde->close(tde);
2559
- return doc;
2560
- }
2561
-
2562
- TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term)
2563
- {
2564
- TermDocEnum *tde = ir->term_docs(ir);
2565
- tde->seek(tde, term);
2566
- return tde;
2567
- }
2568
-
2569
- TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term)
2570
- {
2571
- TermDocEnum *tde = ir->term_positions(ir);
2572
- tde->seek(tde, term);
2573
- return tde;
2574
- }
2575
-
2576
/* Flush pending changes (deletions / norm updates) to the store.
 * Caller must hold ir->mutex.  The owning reader commits under the
 * store's commit lock and rewrites the segment infos, then releases the
 * index write lock if it held one; a non-owner just delegates to
 * do_commit.  No-op when there are no changes. */
void ir_commit_internal(IndexReader *ir)
{
    if (ir->has_changes) {
        if (ir->is_owner) {
            Lock *commit_lock;

            mutex_lock(&ir->store->mutex);
            commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
            if (!commit_lock->obtain(commit_lock)) { /* obtain write lock */
                /* NOTE(review): commit_lock object and store mutex are not
                 * released on this RAISE path -- verify intended */
                RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
            }

            ir->do_commit(ir);
            sis_write(ir->sis, ir->store);

            commit_lock->release(commit_lock);
            ir->store->close_lock(commit_lock);
            mutex_unlock(&ir->store->mutex);

            /* the write lock is only held while changes were pending;
             * drop it now that they are committed */
            if (ir->write_lock != NULL) {
                ir->write_lock->release(ir->write_lock); /* release write lock */
                ir->store->close_lock(ir->write_lock);
                ir->write_lock = NULL;
            }
        } else {
            /* non-owners don't manage the segment infos or locks */
            ir->do_commit(ir);
        }
        ir->has_changes = false;
    }
}
/* Thread-safe commit: flush any pending changes while holding the
 * reader's mutex.  See ir_commit_internal for the actual work. */
void ir_commit(IndexReader *ir)
{
    mutex_lock(&ir->mutex);
    ir_commit_internal(ir);
    mutex_unlock(&ir->mutex);
}
/* Commit any pending changes, then tear the reader down: close the
 * concrete reader (do_close), drop this reader's reference to the store,
 * destroy the segment infos (owning readers only) and both caches, then
 * free the reader itself. */
void ir_close(IndexReader *ir)
{
    mutex_lock(&ir->mutex);
    ir_commit_internal(ir);
    ir->do_close(ir);
    store_deref(ir->store);
    if (ir->is_owner) {
        /* only the owning reader is responsible for the segment infos */
        sis_destroy(ir->sis);
    }
    if (ir->cache) {
        h_destroy(ir->cache);
    }
    if (ir->sort_cache) {
        h_destroy(ir->sort_cache);
    }

    /* NOTE(review): the mutex is destroyed while still locked (no
     * mutex_unlock above) -- presumably tolerated by this threading
     * layer, but confirm */
    mutex_destroy(&ir->mutex);
    free(ir);
}
/**
 * Create the reader's field cache hash.
 * Don't call this method if the cache already exists -- the old cache
 * would be overwritten without being destroyed.
 **/
void ir_add_cache(IndexReader *ir)
{
    ir->cache = co_hsh_create();
}
/* Return true iff this reader reflects the newest committed version of
 * the index.  The version check runs under the store's commit lock so a
 * concurrent commit cannot race it; raises STATE_ERROR if the lock
 * cannot be obtained (after closing the lock object). */
bool ir_is_latest(IndexReader *ir)
{
    bool is_latest = false;
    Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
    if (!commit_lock->obtain(commit_lock)) {
        ir->store->close_lock(commit_lock);
        RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
    }
    TRY
        is_latest = (sis_read_current_version(ir->store) == ir->sis->version);
    XFINALLY
        /* release and close the lock even if reading the version raised */
        commit_lock->release(commit_lock);
        ir->store->close_lock(commit_lock);
    XENDTRY
    return is_latest;
}