ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/index.h CHANGED
@@ -1,35 +1,40 @@
1
1
  #ifndef FRT_INDEX_H
2
2
  #define FRT_INDEX_H
3
3
 
4
- #include <limits.h>
5
4
  #include "global.h"
6
- #include "array.h"
7
- #include "bitvector.h"
8
- #include "hashset.h"
9
- #include "priorityqueue.h"
10
- #include "hash.h"
11
- #include "store.h"
12
5
  #include "document.h"
13
6
  #include "analysis.h"
7
+ #include "hash.h"
8
+ #include "hashset.h"
9
+ #include "store.h"
10
+ #include "mem_pool.h"
14
11
  #include "similarity.h"
12
+ #include "bitvector.h"
13
+ #include "priorityqueue.h"
15
14
 
15
+ typedef struct IndexReader IndexReader;
16
+ typedef struct MultiReader MultiReader;
16
17
 
17
- #define SEGMENT_NAME_MAX_LENGTH 100
18
- #define NOT_A_FIELD 0xFFFFFFFF
19
-
20
- typedef struct Config {
21
- int merge_factor;
22
- int min_merge_docs;
23
- int max_merge_docs;
24
- int max_field_length;
25
- int term_index_interval;
26
- } FerretConfig;
18
+ /****************************************************************************
19
+ *
20
+ * Config
21
+ *
22
+ ****************************************************************************/
27
23
 
28
- extern FerretConfig config;
24
+ typedef struct Config
25
+ {
26
+ int chunk_size;
27
+ int max_buffer_memory;
28
+ int index_interval;
29
+ int skip_interval;
30
+ int merge_factor;
31
+ int max_buffered_docs;
32
+ int max_merge_docs;
33
+ int max_field_length;
34
+ bool use_compound_file;
35
+ } Config;
29
36
 
30
- typedef struct IndexReader IndexReader;
31
- typedef struct IndexWriter IndexWriter;
32
- typedef struct SegmentReader SegmentReader;
37
+ extern const Config default_config;
33
38
 
34
39
  /***************************************************************************
35
40
  *
@@ -38,20 +43,18 @@ typedef struct SegmentReader SegmentReader;
38
43
  ***************************************************************************/
39
44
 
40
45
  typedef struct CacheObject {
41
- HshTable *ref_tab1;
42
- HshTable *ref_tab2;
43
- void *ref1;
44
- void *ref2;
45
- void *obj;
46
- void (*destroy)(void *p);
46
+ HashTable *ref_tab1;
47
+ HashTable *ref_tab2;
48
+ void *ref1;
49
+ void *ref2;
50
+ void *obj;
51
+ void (*destroy)(void *p);
47
52
  } CacheObject;
48
53
 
49
- void cache_destroy(CacheObject *co);
50
- CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
51
- void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
52
- unsigned int co_hash(const void *key);
53
- int co_eq(const void *key1, const void *key2);
54
- HshTable *co_hsh_create();
54
+ extern void cache_destroy(CacheObject *co);
55
+ extern CacheObject *co_create(HashTable *ref_tab1, HashTable *ref_tab2,
56
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
57
+ extern HashTable *co_hash_create();
55
58
 
56
59
  /****************************************************************************
57
60
  *
@@ -59,384 +62,310 @@ HshTable *co_hsh_create();
59
62
  *
60
63
  ****************************************************************************/
61
64
 
62
- typedef struct FieldInfo {
63
- char *name;
64
- int number;
65
- bool is_indexed : 1;
66
- bool store_tv : 1;
67
- bool store_offset : 1;
68
- bool store_pos : 1;
69
- bool omit_norms : 1;
70
- } FieldInfo;
71
-
72
- FieldInfo *fi_create(char *name,
73
- int number,
74
- bool is_indexed,
75
- bool store_tv,
76
- bool store_pos,
77
- bool store_offset,
78
- bool omit_norms);
79
- void fi_destroy(FieldInfo *fi);
80
-
81
- /****************************************************************************
82
- *
83
- * FieldInfos
84
- *
85
- ****************************************************************************/
86
-
87
- typedef struct FieldInfos {
88
- HashEntry **by_name;
89
- FieldInfo **by_number;
90
- int fcnt;
91
- } FieldInfos;
65
+ enum StoreValues
66
+ {
67
+ STORE_NO = 0,
68
+ STORE_YES = 1,
69
+ STORE_COMPRESS = 2
70
+ };
92
71
 
93
- FieldInfos *fis_create();
94
- FieldInfos *fis_open(Store *store, char *filename);
95
- void fis_destroy(FieldInfos *fis);
96
- FieldInfo *fis_add(FieldInfos *fis,
97
- char *name,
98
- bool is_indexed,
99
- bool store_tv,
100
- bool store_offset,
101
- bool store_pos,
102
- bool omit_norms);
103
-
104
- void fis_add_fields(FieldInfos *fis,
105
- HashSet *field_names,
106
- bool is_indexed,
107
- bool store_tv,
108
- bool store_offset,
109
- bool store_pos,
110
- bool omit_norms);
111
- bool fis_has_vectors(FieldInfos *fis);
112
- void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
113
- FieldInfos *fis_read(FieldInfos *fis, InStream *is);
114
- FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
115
- ullong fis_get_number(FieldInfos *fis, char *name);
116
- FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
117
- bool fis_reorder_required(FieldInfos *fis, Document *doc);
72
+ enum IndexValues
73
+ {
74
+ INDEX_NO = 0,
75
+ INDEX_YES = 1,
76
+ INDEX_UNTOKENIZED = 3,
77
+ INDEX_YES_OMIT_NORMS = 5,
78
+ INDEX_UNTOKENIZED_OMIT_NORMS = 7
79
+ };
118
80
 
119
- /****************************************************************************
120
- *
121
- * TermBuffer
122
- *
123
- ****************************************************************************/
81
+ enum TermVectorValues
82
+ {
83
+ TERM_VECTOR_NO = 0,
84
+ TERM_VECTOR_YES = 1,
85
+ TERM_VECTOR_WITH_POSITIONS = 3,
86
+ TERM_VECTOR_WITH_OFFSETS = 5,
87
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
88
+ };
124
89
 
125
- typedef struct TermBuffer {
126
- char *field;
127
- char text[MAX_WORD_SIZE];
128
- } TermBuffer;
90
+ #define FI_IS_STORED_BM 0x001
91
+ #define FI_IS_COMPRESSED_BM 0x002
92
+ #define FI_IS_INDEXED_BM 0x004
93
+ #define FI_IS_TOKENIZED_BM 0x008
94
+ #define FI_OMIT_NORMS_BM 0x010
95
+ #define FI_STORE_TERM_VECTOR_BM 0x020
96
+ #define FI_STORE_POSITIONS_BM 0x040
97
+ #define FI_STORE_OFFSETS_BM 0x080
98
+
99
+ typedef struct FieldInfo
100
+ {
101
+ char *name;
102
+ float boost;
103
+ unsigned int bits;
104
+ int number;
105
+ int ref_cnt;
106
+ } FieldInfo;
129
107
 
130
- TermBuffer *tb_create();
131
- void tb_destroy(TermBuffer *tb);
132
- TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
133
- Term *tb_get_term(TermBuffer *tb);
134
- int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
135
- int tb_term_cmp(TermBuffer *tb, Term *t);
136
- TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2);
137
- TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis);
108
+ extern FieldInfo *fi_new(const char *name,
109
+ enum StoreValues store,
110
+ enum IndexValues index,
111
+ enum TermVectorValues term_vector);
112
+ extern char *fi_to_s(FieldInfo *fi);
113
+ extern void fi_deref(FieldInfo *fi);
114
+
115
+ #define fi_is_stored(fi) (((fi)->bits & FI_IS_STORED_BM) != 0)
116
+ #define fi_is_compressed(fi) (((fi)->bits & FI_IS_COMPRESSED_BM) != 0)
117
+ #define fi_is_indexed(fi) (((fi)->bits & FI_IS_INDEXED_BM) != 0)
118
+ #define fi_is_tokenized(fi) (((fi)->bits & FI_IS_TOKENIZED_BM) != 0)
119
+ #define fi_omit_norms(fi) (((fi)->bits & FI_OMIT_NORMS_BM) != 0)
120
+ #define fi_store_term_vector(fi) (((fi)->bits & FI_STORE_TERM_VECTOR_BM) != 0)
121
+ #define fi_store_positions(fi) (((fi)->bits & FI_STORE_POSITIONS_BM) != 0)
122
+ #define fi_store_offsets(fi) (((fi)->bits & FI_STORE_OFFSETS_BM) != 0)
123
+ #define fi_has_norms(fi)\
124
+ (((fi)->bits & (FI_OMIT_NORMS_BM|FI_IS_INDEXED_BM)) == FI_IS_INDEXED_BM)
138
125
 
139
126
  /****************************************************************************
140
127
  *
141
- * TermInfo
128
+ * FieldInfos
142
129
  *
143
130
  ****************************************************************************/
144
131
 
145
- typedef struct TermInfo {
146
- int doc_freq;
147
- int freq_pointer;
148
- int prox_pointer;
149
- int skip_offset;
150
- } TermInfo;
132
+ #define FIELD_INFOS_INIT_CAPA 4
133
+ typedef struct FieldInfos
134
+ {
135
+ int store;
136
+ int index;
137
+ int term_vector;
138
+ int size;
139
+ int capa;
140
+ FieldInfo **fields;
141
+ HashTable *field_dict;
142
+ int ref_cnt;
143
+ } FieldInfos;
151
144
 
152
- TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
153
- TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
154
- void ti_destroy(TermInfo *ti);
155
- TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
156
- TermInfo *ti_clone(TermInfo *other);
157
- int ti_eq(TermInfo *ti1, TermInfo *ti2);
145
+ extern FieldInfos *fis_new(int store, int index, int term_vector);
146
+ extern FieldInfo *fis_add_field(FieldInfos *fis, FieldInfo *fi);
147
+ extern FieldInfo *fis_get_field(FieldInfos *fis, const char *name);
148
+ extern int fis_get_field_num(FieldInfos *fis, const char *name);
149
+ extern FieldInfo *fis_get_or_add_field(FieldInfos *fis, const char *name);
150
+ extern void fis_write(FieldInfos *fis, Store *store);
151
+ extern FieldInfos *fis_read(Store *store);
152
+ extern char *fis_to_s(FieldInfos *fis);
153
+ extern void fis_deref(FieldInfos *fis);
158
154
 
159
155
  /****************************************************************************
160
156
  *
161
- * TermEnum
157
+ * SegmentInfo
162
158
  *
163
159
  ****************************************************************************/
164
160
 
165
- typedef struct TermEnum TermEnum;
166
- struct TermEnum {
167
- void *data;
168
- TermBuffer *(*next)(TermEnum *te);
169
- void (*close)(TermEnum *te);
170
- TermEnum *(*clone)(TermEnum *te);
171
- TermBuffer *tb_curr;
172
- TermBuffer *tb_prev;
173
- TermInfo *ti_curr;
174
- };
175
-
176
- TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
177
-
178
- Term *te_get_term(struct TermEnum *te);
179
- TermInfo *te_get_ti(struct TermEnum *te);
180
-
181
- /* * SegmentTermEnum * */
182
-
183
- typedef struct SegmentTermEnum {
184
- FieldInfos *fis;
185
- int is_index;
186
- InStream *is;
187
- int size;
188
- int pos;
189
- int index_pointer;
190
- int index_interval;
191
- int skip_interval;
192
- int format_m1skip_interval;
193
- int format;
194
- } SegmentTermEnum;
195
-
196
-
197
- TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
198
- TermBuffer *ste_next(struct TermEnum *te);
199
- void ste_close(struct TermEnum *te);
200
-
201
- /* * MultiTermEnum * */
161
+ #define SEGMENT_NAME_MAX_LENGTH 100
202
162
 
203
- typedef struct MultiTermEnum {
204
- int doc_freq;
205
- PriorityQueue *smi_queue;
206
- } MultiTermEnum;
163
+ typedef struct SegmentInfo
164
+ {
165
+ char *name;
166
+ int doc_cnt;
167
+ Store *store;
168
+ } SegmentInfo;
207
169
 
208
- TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *term);
170
+ extern SegmentInfo *si_new(char *name, int doc_cnt, Store *store);
171
+ extern void si_destroy(SegmentInfo *si);
172
+ extern bool si_has_deletions(SegmentInfo *si);
173
+ extern bool si_uses_compound_file(SegmentInfo *si);
174
+ extern bool si_has_separate_norms(SegmentInfo *si);
209
175
 
210
176
  /****************************************************************************
211
177
  *
212
- * TermInfosWriter
178
+ * SegmentInfos
213
179
  *
214
180
  ****************************************************************************/
215
181
 
216
- #define TERM_INFO_FORMAT -2
217
-
218
- typedef struct TermInfosWriter {
219
- int index_interval;
220
- int skip_interval;
221
- int size;
222
- int last_index_pointer;
223
- bool is_index;
224
- OutStream *os;
225
- struct TermInfosWriter *other;
226
- Term *last_term;
227
- TermInfo *last_term_info;
228
- FieldInfos *fis;
229
- char *curr_field;
230
- ullong curr_field_num;
231
- } TermInfosWriter;
182
+ typedef struct SegmentInfos
183
+ {
184
+ f_u64 counter;
185
+ f_u64 version;
186
+ f_u32 format;
187
+ Store *store;
188
+ SegmentInfo **segs;
189
+ int size;
190
+ int capa;
191
+ } SegmentInfos;
232
192
 
233
- TermInfosWriter *tiw_open(Store *store,
234
- char *segment,
235
- FieldInfos *fis,
236
- int interval);
237
- void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti);
238
- void tiw_close(TermInfosWriter *tiw);
193
+ extern SegmentInfos *sis_new();
194
+ extern SegmentInfo *sis_new_segment(SegmentInfos *sis, int dcnt, Store *store);
195
+ extern SegmentInfo *sis_add_si(SegmentInfos *sis, SegmentInfo *si);
196
+ extern void sis_del_at(SegmentInfos *sis, int at);
197
+ extern void sis_del_from_to(SegmentInfos *sis, int from, int to);
198
+ extern void sis_clear(SegmentInfos *sis);
199
+ extern SegmentInfos *sis_read(Store *store);
200
+ extern void sis_write(SegmentInfos *sis, Store *store);
201
+ extern f_u64 sis_read_current_version(Store *store);
202
+ extern void sis_destroy(SegmentInfos *sis);
239
203
 
240
204
  /****************************************************************************
241
205
  *
242
- * TermInfosReader
206
+ * TermInfo
243
207
  *
244
208
  ****************************************************************************/
245
209
 
246
- typedef struct TermInfosReader {
247
- mutex_t mutex;
248
- TermEnum *orig_te;
249
- thread_key_t thread_te;
250
- Array *te_bucket;
251
- TermEnum *index_te;
252
- int size;
253
- int skip_interval;
254
- int index_size;
255
- Term **index_terms;
256
- TermInfo **index_term_infos;
257
- int *index_pointers;
258
- } TermInfosReader;
210
+ typedef struct TermInfo
211
+ {
212
+ int doc_freq;
213
+ off_t frq_ptr;
214
+ off_t prx_ptr;
215
+ off_t skip_offset;
216
+ } TermInfo;
259
217
 
260
- TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis);
261
- void tir_close(TermInfosReader *tir);
262
- Term *tir_get_term(TermInfosReader *tir, int position);
263
- int tir_get_term_pos(TermInfosReader *tir, Term *t);
264
- TermInfo *tir_get_ti(TermInfosReader *tir, Term *t);
218
+ #define ti_set(ti, mdf, mfp, mpp, mso) do {\
219
+ (ti).doc_freq = mdf;\
220
+ (ti).frq_ptr = mfp;\
221
+ (ti).prx_ptr = mpp;\
222
+ (ti).skip_offset = mso;\
223
+ } while (0)
265
224
 
266
225
  /****************************************************************************
267
226
  *
268
- * TVOffsetInfo
227
+ * TermEnum
269
228
  *
270
229
  ****************************************************************************/
271
230
 
272
- typedef struct TVOffsetInfo {
273
- int start;
274
- int end;
275
- } TVOffsetInfo;
276
-
277
- TVOffsetInfo *tvoi_create(int start, int end);
278
- void tvoi_destroy(void *p);
279
-
280
- /****************************************************************************
281
- *
282
- * TVField
283
- *
284
- ****************************************************************************/
231
+ typedef struct TermEnum TermEnum;
285
232
 
286
- typedef struct TVField {
287
- int tvf_pointer;
288
- int number;
289
- unsigned int store_positions : 1;
290
- unsigned int store_offsets : 1;
291
- } TVField;
233
+ struct TermEnum
234
+ {
235
+ char curr_term[MAX_WORD_SIZE];
236
+ char prev_term[MAX_WORD_SIZE];
237
+ TermInfo curr_ti;
238
+ int curr_term_len;
239
+ int field_num;
240
+ TermEnum *(*set_field)(TermEnum *te, int field_num);
241
+ char *(*next)(TermEnum *te);
242
+ char *(*skip_to)(TermEnum *te, const char *term);
243
+ void (*close)(TermEnum *te);
244
+ TermEnum *(*clone)(TermEnum *te);
245
+ };
292
246
 
293
- TVField *tvf_create(int number, int store_positions, int store_offsets);
294
- void tvf_destroy(void *p);
247
+ char *te_get_term(struct TermEnum *te);
248
+ TermInfo *te_get_ti(struct TermEnum *te);
295
249
 
296
250
  /****************************************************************************
297
251
  *
298
- * TVTerm
252
+ * SegmentTermEnum
299
253
  *
300
254
  ****************************************************************************/
301
255
 
302
- typedef struct TVTerm {
303
- int field_num;
304
- char *text;
305
- int freq;
306
- int *positions;
307
- TVOffsetInfo **offsets;
308
- } TVTerm;
256
+ /* * SegmentTermIndex * */
309
257
 
310
- TVTerm *tvt_create(char *text,
311
- int freq,
312
- int *positions,
313
- TVOffsetInfo **offsets);
314
- void tvt_destroy(void *p);
258
+ typedef struct SegmentTermIndex
259
+ {
260
+ off_t index_ptr;
261
+ off_t ptr;
262
+ int index_size;
263
+ int size;
264
+ char **index_terms;
265
+ int *index_term_lens;
266
+ TermInfo *index_term_infos;
267
+ off_t *index_ptrs;
268
+ } SegmentTermIndex;
315
269
 
316
- /****************************************************************************
317
- *
318
- * TermVector
319
- *
320
- ****************************************************************************/
270
+ /* * SegmentFieldIndex * */
321
271
 
322
- typedef struct TermVector {
323
- char *field;
324
- char **terms;
325
- int tcnt;
326
- int *freqs;
327
- int **positions;
328
- TVOffsetInfo ***offsets;
329
- } TermVector;
272
+ typedef struct SegmentTermEnum SegmentTermEnum;
330
273
 
331
- TermVector *tv_create(const char *field,
332
- char **terms,
333
- int tcnt,
334
- int *freqs,
335
- int **positions,
336
- TVOffsetInfo ***offsets);
337
- void tv_destroy(TermVector *tv);
274
+ typedef struct SegmentFieldIndex
275
+ {
276
+ mutex_t mutex;
277
+ int skip_interval;
278
+ int index_interval;
279
+ off_t index_ptr;
280
+ TermEnum *index_te;
281
+ HashTable *field_dict;
282
+ } SegmentFieldIndex;
338
283
 
339
- /****************************************************************************
340
- *
341
- * TermVectorsWriter
342
- *
343
- ****************************************************************************/
284
+ extern SegmentFieldIndex *sfi_open(Store *store, const char *segment);
285
+ extern void sfi_close(SegmentFieldIndex *sfi);
344
286
 
345
- #define STORE_POSITIONS_WITH_TERMVECTOR 0x1
346
- #define STORE_OFFSET_WITH_TERMVECTOR 0x2
347
287
 
348
- #define FORMAT_VERSION 2
349
- #define FORMAT_SIZE 4
288
+ /* * SegmentTermEnum * */
289
+ struct SegmentTermEnum
290
+ {
291
+ TermEnum te;
292
+ InStream *is;
293
+ int size;
294
+ int pos;
295
+ int skip_interval;
296
+ SegmentFieldIndex *sfi;
297
+ };
350
298
 
351
- #define TVX_EXTENSION ".tvx"
352
- #define TVD_EXTENSION ".tvd"
353
- #define TVF_EXTENSION ".tvf"
299
+ extern void ste_close(TermEnum *te);
300
+ extern TermEnum *ste_clone(TermEnum *te);
301
+ extern TermEnum *ste_new(InStream *is, SegmentFieldIndex *sfi);
354
302
 
355
- typedef struct TermVectorsWriter {
356
- TVField *curr_field;
357
- int curr_doc_pointer;
358
- OutStream *tvx;
359
- OutStream *tvd;
360
- OutStream *tvf;
361
- FieldInfos *fis;
362
- TVField **fields;
363
- int fcnt;
364
- int fsize;
365
- TVTerm **terms;
366
- int tcnt;
367
- int tsize;
368
- } TermVectorsWriter;
369
-
370
- TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
371
- void tvw_close(TermVectorsWriter *tvw);
372
- void tvw_open_doc(TermVectorsWriter *tvw);
373
- void tvw_close_doc(TermVectorsWriter *tvw);
374
- void tvw_open_field(TermVectorsWriter *tvw, char *field);
375
- void tvw_close_field(TermVectorsWriter *tvw);
376
- void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
377
- void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
303
+ /* * MultiTermEnum * */
378
304
 
305
+ extern TermEnum *mte_new(MultiReader *mr, int field_num, const char *term);
379
306
 
380
307
  /****************************************************************************
381
308
  *
382
- * TermVectorsReader
309
+ * TermInfosReader
383
310
  *
384
311
  ****************************************************************************/
385
312
 
386
- typedef struct TermVectorsReader {
387
- int size;
388
- InStream *tvx;
389
- InStream *tvd;
390
- InStream *tvf;
391
- FieldInfos *fis;
392
- int tvd_format;
393
- int tvf_format;
394
- } TermVectorsReader;
395
-
396
- TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
397
- TermVectorsReader *tvr_clone(TermVectorsReader *orig);
398
- void tvr_close(TermVectorsReader *tvr);
399
- TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
400
- char *field, int tvf_pointer);
401
- Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
402
- TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
313
+ #define TE_BUCKET_INIT_CAPA 1
403
314
 
404
- /****************************************************************************
405
- *
406
- * FieldsReader
407
- *
408
- ****************************************************************************/
409
-
410
- typedef struct FieldsReader {
411
- int len;
412
- FieldInfos *fis;
413
- InStream *fields_in;
414
- InStream *index_in;
415
- } FieldsReader;
315
+ typedef struct TermInfosReader
316
+ {
317
+ thread_key_t thread_te;
318
+ void **te_bucket;
319
+ TermEnum *orig_te;
320
+ int field_num;
321
+ } TermInfosReader;
416
322
 
417
- FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
418
- void fr_close(FieldsReader *fr);
419
- Document *fr_get_doc(FieldsReader *fr, int doc_num);
323
+ extern TermInfosReader *tir_open(Store *store,
324
+ SegmentFieldIndex *sfi,
325
+ const char *segment);
326
+ extern TermInfosReader *tir_set_field(TermInfosReader *tir, int field_num);
327
+ extern TermInfo *tir_get_ti(TermInfosReader *tir, const char *term);
328
+ extern char *tir_get_term(TermInfosReader *tir, int pos);
329
+ extern void tir_close(TermInfosReader *tir);
420
330
 
421
331
  /****************************************************************************
422
332
  *
423
- * FieldsWriter
333
+ * TermInfosWriter
424
334
  *
425
335
  ****************************************************************************/
426
336
 
427
- #define FIELD_IS_TOKENIZED 0X1
428
- #define FIELD_IS_BINARY 0X2
429
- #define FIELD_IS_COMPRESSED 0X4
337
+ #define INDEX_INTERVAL 128
338
+ #define SKIP_INTERVAL 16
430
339
 
431
- typedef struct FieldsWriter {
432
- FieldInfos *fis;
433
- OutStream *fields_out;
434
- OutStream *index_out;
435
- } FieldsWriter;
340
+ typedef struct TermWriter
341
+ {
342
+ int counter;
343
+ const char *last_term;
344
+ TermInfo last_term_info;
345
+ OutStream *os;
346
+ } TermWriter;
347
+
348
+ typedef struct TermInfosWriter
349
+ {
350
+ int field_count;
351
+ int index_interval;
352
+ int skip_interval;
353
+ off_t last_index_ptr;
354
+ OutStream *tfx_out;
355
+ TermWriter *tix_writer;
356
+ TermWriter *tis_writer;
357
+ } TermInfosWriter;
436
358
 
437
- FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
438
- void fw_close(FieldsWriter *fw);
439
- void fw_add_doc(FieldsWriter *fw, Document *doc);
359
+ extern TermInfosWriter *tiw_open(Store *store,
360
+ const char *segment,
361
+ int index_interval,
362
+ int skip_interval);
363
+ extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
364
+ extern void tiw_add(TermInfosWriter *tiw,
365
+ const char *term,
366
+ int t_len,
367
+ TermInfo *ti);
368
+ extern void tiw_close(TermInfosWriter *tiw);
440
369
 
441
370
  /****************************************************************************
442
371
  *
@@ -445,9 +374,10 @@ void fw_add_doc(FieldsWriter *fw, Document *doc);
445
374
  ****************************************************************************/
446
375
 
447
376
  typedef struct TermDocEnum TermDocEnum;
448
- struct TermDocEnum {
449
- void *data;
450
- void (*seek)(TermDocEnum *tde, Term *term);
377
+ struct TermDocEnum
378
+ {
379
+ void (*seek)(TermDocEnum *tde, int field_num, const char *term);
380
+ void (*seek_te)(TermDocEnum *tde, TermEnum *te);
451
381
  int (*doc_num)(TermDocEnum *tde);
452
382
  int (*freq)(TermDocEnum *tde);
453
383
  bool (*next)(TermDocEnum *tde);
@@ -460,71 +390,72 @@ struct TermDocEnum {
460
390
  /* * SegmentTermDocEnum * */
461
391
 
462
392
  typedef struct SegmentTermDocEnum SegmentTermDocEnum;
463
-
464
- struct SegmentTermDocEnum {
465
- SegmentReader *parent;
466
- InStream *freq_in;
393
+ struct SegmentTermDocEnum
394
+ {
395
+ TermDocEnum tde;
396
+ void (*seek_prox)(SegmentTermDocEnum *stde, int prx_ptr);
397
+ void (*skip_prox)(SegmentTermDocEnum *stde);
398
+ TermInfosReader *tir;
399
+ InStream *frq_in;
400
+ InStream *prx_in;
401
+ InStream *skip_in;
402
+ BitVector *deleted_docs;
467
403
  int count; /* number of docs for this term skipped */
468
404
  int doc_freq; /* number of doc this term appears in */
469
- BitVector *deleted_docs;
470
405
  int doc_num;
471
406
  int freq;
472
- int skip_interval;
473
407
  int num_skips;
408
+ int skip_interval;
474
409
  int skip_count;
475
- InStream *skip_in;
476
410
  int skip_doc;
477
- int freq_pointer;
478
- int prox_pointer;
479
- int skip_pointer;
480
- unsigned int have_skipped : 1;
481
- void (*skip_prox)(SegmentTermDocEnum *stde);
482
- InStream *prox_in;
483
- int prox_cnt;
411
+ int frq_ptr;
412
+ int prx_ptr;
413
+ int skip_ptr;
414
+ int prx_cnt;
484
415
  int position;
485
- void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
416
+ bool have_skipped : 1;
486
417
  };
487
418
 
488
- TermDocEnum *stde_create(IndexReader *ir);
489
- void stde_seek_ti(TermDocEnum *tde, TermInfo *ti);
419
+ extern TermDocEnum *stde_new(TermInfosReader *tir, InStream *frq_in,
420
+ BitVector *deleted_docs, int skip_interval);
490
421
 
491
- /* * SegmentTermPosEnum * */
492
- TermDocEnum *stpe_create(IndexReader *ir);
493
-
494
- /* * MultiTermDocEnum * */
495
- typedef struct MultiTermDocEnum MultiTermDocEnum;
496
- struct MultiTermDocEnum {
497
- IndexReader **irs;
498
- int *starts;
499
- int ir_cnt;
500
- Term *term;
501
- int base;
502
- int pointer;
503
- TermDocEnum **irs_tde;
504
- TermDocEnum *curr_tde;
505
- TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
506
- };
422
+ /* * SegmentTermDocEnum * */
423
+ extern TermDocEnum *stpe_new(TermInfosReader *tir, InStream *frq_in,
424
+ InStream *prx_in, BitVector *deleted_docs,
425
+ int skip_interval);
507
426
 
508
- TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
427
+ /****************************************************************************
428
+ * MultipleTermDocPosEnum
429
+ ****************************************************************************/
509
430
 
510
- /* * MultiTermPosEnum * */
511
- TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
431
+ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
432
+ int t_cnt);
512
433
 
513
434
  /****************************************************************************
514
- * MultipleTermDocPosEnum
435
+ *
436
+ * Offset
437
+ *
515
438
  ****************************************************************************/
516
439
 
517
- #define MTDPE_POS_QUEUE_INIT_CAPA 8
518
- typedef struct {
519
- int doc_num;
520
- int freq;
521
- PriorityQueue *pq;
522
- int *pos_queue;
523
- int pos_queue_index;
524
- int pos_queue_capa;
525
- } MultipleTermDocPosEnum;
440
+ typedef struct Offset
441
+ {
442
+ int start;
443
+ int end;
444
+ } Offset;
526
445
 
527
- TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
446
+ extern Offset *offset_new(int start, int end);
447
+
448
+ /****************************************************************************
449
+ *
450
+ * Occurence
451
+ *
452
+ ****************************************************************************/
453
+
454
+ typedef struct Occurence
455
+ {
456
+ struct Occurence *next;
457
+ int pos;
458
+ } Occurence;
528
459
 
529
460
  /****************************************************************************
530
461
  *
@@ -532,283 +463,388 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
532
463
  *
533
464
  ****************************************************************************/
534
465
 
535
- typedef struct Posting {
536
- Term *term;
466
+ typedef struct Posting
467
+ {
537
468
  int freq;
538
- int size;
539
- int *positions;
540
- TVOffsetInfo **offsets;
469
+ int doc_num;
470
+ Occurence *first_occ;
471
+ struct Posting *next;
541
472
  } Posting;
542
473
 
543
- Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
544
- void p_destroy(Posting *self);
545
- void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset);
474
+ extern inline Posting *p_new(MemoryPool *mp, int doc_num, int pos);
546
475
 
476
+ /****************************************************************************
477
+ *
478
+ * PostingList
479
+ *
480
+ ****************************************************************************/
481
+
482
+ typedef struct PostingList
483
+ {
484
+ const char *term;
485
+ int term_len;
486
+ Posting *first;
487
+ Posting *last;
488
+ Occurence *last_occ;
489
+ } PostingList;
490
+
491
+ extern PostingList *pl_new(MemoryPool *mp, const char *term,
492
+ int term_len, Posting *p);
493
+ extern void pl_add_occ(MemoryPool *mp, PostingList *pl, int pos);
547
494
 
548
495
  /****************************************************************************
549
496
  *
550
- * DocumentWriter
497
+ * TVField
551
498
  *
552
499
  ****************************************************************************/
553
500
 
554
- typedef struct DocumentWriter {
555
- Store *store;
556
- Analyzer *analyzer;
557
- Similarity *similarity;
558
- HshTable *postingtable;
559
- int pcnt;
560
- FieldInfos *fis;
561
- float *field_boosts;
562
- int *field_lengths;
563
- int *field_positions;
564
- int *field_offsets;
565
- int max_field_length;
566
- int term_index_interval;
567
- } DocumentWriter;
501
+ typedef struct TVField
502
+ {
503
+ int field_num;
504
+ int size;
505
+ } TVField;
506
+
507
+ /****************************************************************************
508
+ *
509
+ * TVTerm
510
+ *
511
+ ****************************************************************************/
568
512
 
569
- DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
570
- Similarity *similarity, int max_field_length, int term_index_interval);
571
- void dw_close(DocumentWriter *dw);
572
- void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
513
+ typedef struct TVTerm
514
+ {
515
+ char *text;
516
+ int freq;
517
+ int *positions;
518
+ } TVTerm;
573
519
 
574
520
  /****************************************************************************
575
521
  *
576
- * SegmentInfo
522
+ * TermVector
577
523
  *
578
524
  ****************************************************************************/
579
525
 
580
- typedef struct SegmentInfo {
581
- char *name;
582
- int doc_cnt;
583
- Store *store;
584
- } SegmentInfo;
526
+ typedef struct TermVector
527
+ {
528
+ int field_num;
529
+ char *field;
530
+ int term_cnt;
531
+ TVTerm *terms;
532
+ int offset_cnt;
533
+ Offset *offsets;
534
+ } TermVector;
585
535
 
586
- SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
587
- void si_destroy(SegmentInfo *si);
588
- bool si_has_deletions(SegmentInfo *si);
589
- bool si_uses_compound_file(SegmentInfo *si);
590
- bool si_has_separate_norms(SegmentInfo *si);
536
+ extern void tv_destroy(TermVector *tv);
537
+ extern int tv_get_tv_term_index(TermVector *tv, const char *term);
538
+ extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term);
591
539
 
592
540
  /****************************************************************************
593
541
  *
594
- * SegmentInfos
542
+ * TermVectorsWriter
595
543
  *
596
544
  ****************************************************************************/
597
545
 
598
- typedef struct SegmentInfos {
599
- Store *store;
600
- SegmentInfo **segs;
601
- int scnt;
602
- int size;
603
- int counter;
604
- int version;
605
- int format;
606
- } SegmentInfos;
546
+ #define TV_FIELD_INIT_CAPA 8
547
+
548
+ typedef struct TermVectorsWriter
549
+ {
550
+ OutStream *tvx_out;
551
+ OutStream *tvd_out;
552
+ FieldInfos *fis;
553
+ TVField *fields;
554
+ off_t tvd_ptr;
555
+ } TermVectorsWriter;
607
556
 
608
- SegmentInfos *sis_create();
609
- void sis_destroy(SegmentInfos *sis);
610
- void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
611
- void sis_del_at(SegmentInfos *sis, int at);
612
- void sis_del_from_to(SegmentInfos *sis, int from, int to);
613
- void sis_clear(SegmentInfos *sis);
614
- void sis_read(SegmentInfos *sis, Store *store);
615
- void sis_write(SegmentInfos *sis, Store *store);
616
- int sis_read_current_version(Store *store);
557
+ extern TermVectorsWriter *tvw_open(Store *store,
558
+ const char *segment,
559
+ FieldInfos *fis);
560
+ extern void tvw_open_doc(TermVectorsWriter *tvw);
561
+ extern void tvw_close_doc(TermVectorsWriter *tvw);
562
+ extern void tvw_add_postings(TermVectorsWriter *tvw,
563
+ int field_num,
564
+ PostingList **plists,
565
+ int posting_count,
566
+ Offset *offsets,
567
+ int offset_count);
568
+ extern void tvw_close(TermVectorsWriter *tvw);
617
569
 
618
570
  /****************************************************************************
619
571
  *
620
- * IndexReader
572
+ * TermVectorsReader
621
573
  *
622
574
  ****************************************************************************/
623
575
 
624
- enum FIELD_TYPE {
625
- /* all fields */
626
- IR_ALL,
627
- /* all indexed fields */
628
- IR_INDEXED,
629
- /* all fields which are not indexed */
630
- IR_UNINDEXED,
631
- /* all fields which are indexed with termvectors enables */
632
- IR_INDEXED_WITH_TERM_VECTOR,
633
- /* all fields which are indexed but don't have termvectors enabled */
634
- IR_INDEXED_NO_TERM_VECTOR,
635
- /* all fields where termvectors are enabled. Please note that only standard */
636
- /* termvector fields are returned */
637
- IR_TERM_VECTOR,
638
- /* all field with termvectors wiht positions enabled */
639
- IR_TERM_VECTOR_WITH_POSITION,
640
- /* all fields where termvectors with offset position are set */
641
- IR_TERM_VECTOR_WITH_OFFSET,
642
- /* all fields where termvectors with offset and position values set */
643
- IR_TERM_VECTOR_WITH_POSITION_OFFSET
644
- };
576
+ typedef struct TermVectorsReader
577
+ {
578
+ int size;
579
+ InStream *tvx_in;
580
+ InStream *tvd_in;
581
+ FieldInfos *fis;
582
+ } TermVectorsReader;
645
583
 
646
- struct IndexReader {
647
- mutex_t mutex;
648
- HshTable *cache;
649
- HshTable *sort_cache;
650
- void *data;
651
- Store *store;
652
- Lock *write_lock;
653
- SegmentInfos *sis;
654
- bool has_changes : 1;
655
- bool is_stale : 1;
656
- bool is_owner : 1;
657
- TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
658
- Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
659
- int (*num_docs)(IndexReader *ir);
660
- int (*max_doc)(IndexReader *ir);
661
- Document *(*get_doc)(IndexReader *ir, int doc_num);
662
- uchar *(*get_norms)(IndexReader *ir, char *field);
663
- uchar *(*get_norms_always)(IndexReader *ir, char *field);
664
- void (*do_set_norm)(IndexReader *ir, int doc_num, char *field,
665
- uchar val);
666
- void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf,
667
- int offset);
668
- TermEnum *(*terms)(IndexReader *ir);
669
- TermEnum *(*terms_from)(IndexReader *ir, Term *term);
670
- int (*doc_freq)(IndexReader *ir, Term *t);
671
- TermDocEnum *(*term_docs)(IndexReader *ir);
672
- TermDocEnum *(*term_positions)(IndexReader *ir);
673
- void (*do_delete_doc)(IndexReader *ir, int doc_num);
674
- void (*do_undelete_all)(IndexReader *ir);
675
- bool (*is_deleted)(IndexReader *ir, int doc_num);
676
- bool (*has_deletions)(IndexReader *ir);
677
- bool (*has_norms)(IndexReader *ir, char *field);
678
- HashSet *(*get_field_names)(IndexReader *ir, int field_type);
679
- void (*do_commit)(IndexReader *ir);
680
- void (*do_close)(IndexReader *ir);
681
- void (*acquire_write_lock)(IndexReader *ir);
584
+ extern TermVectorsReader *tvr_open(Store *store,
585
+ const char *segment,
586
+ FieldInfos *fis);
587
+ extern TermVectorsReader *tvr_clone(TermVectorsReader *orig);
588
+ extern void tvr_close(TermVectorsReader *tvr);
589
+ extern HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
590
+ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
591
+ int doc_num,
592
+ int field_num);
593
+
594
+ /****************************************************************************
595
+ *
596
+ * LazyDoc
597
+ *
598
+ ****************************************************************************/
599
+
600
+ /* * * LazyDocField * * */
601
+ typedef struct LazyDocFieldData
602
+ {
603
+ int start;
604
+ int length;
605
+ char *text;
606
+ } LazyDocFieldData;
607
+
608
+ typedef struct LazyDoc LazyDoc;
609
+ typedef struct LazyDocField
610
+ {
611
+ char *name;
612
+ int size; /* number of data elements */
613
+ LazyDocFieldData *data;
614
+ int len; /* length of data elements concatenated */
615
+ LazyDoc *doc;
616
+ } LazyDocField;
617
+
618
+ extern char *lazy_df_get_data(LazyDocField *self, int i);
619
+ extern void lazy_df_get_bytes(LazyDocField *self, char *buf,
620
+ int start, int len);
621
+
622
+ /* * * LazyDoc * * */
623
+ struct LazyDoc
624
+ {
625
+ HashTable *field_dict;
626
+ int size;
627
+ LazyDocField **fields;
628
+ InStream *fields_in;
682
629
  };
683
630
 
684
- IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
685
- IndexReader *ir_open(Store *store);
686
- bool ir_index_exists(Store *store);
687
- void ir_close(IndexReader *ir);
688
- void ir_commit(IndexReader *ir);
689
- void ir_delete_doc(IndexReader *ir, int doc_num);
690
- void ir_undelete_all(IndexReader *ir);
691
- void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
692
- void ir_destroy(IndexReader *self);
693
- Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
694
- TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
695
- TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
696
- void ir_add_cache(IndexReader *ir);
697
- bool ir_is_latest(IndexReader *ir);
631
+ extern void lazy_doc_close(LazyDoc *self);
698
632
 
699
633
  /****************************************************************************
700
634
  *
701
- * Norm
635
+ * FieldsReader
702
636
  *
703
637
  ****************************************************************************/
704
638
 
705
- typedef struct Norm {
706
- bool is_dirty : 1;
707
- int field_num;
708
- InStream *is;
709
- uchar *bytes;
710
- } Norm;
639
+ typedef struct FieldsReader
640
+ {
641
+ int size;
642
+ FieldInfos *fis;
643
+ Store *store;
644
+ InStream *fdx_in;
645
+ InStream *fdt_in;
646
+ } FieldsReader;
647
+
648
+ extern FieldsReader *fr_open(Store *store,
649
+ const char *segment, FieldInfos *fis);
650
+ extern FieldsReader *fr_clone(FieldsReader *orig);
651
+ extern void fr_close(FieldsReader *fr);
652
+ extern Document *fr_get_doc(FieldsReader *fr, int doc_num);
653
+ extern LazyDoc *fr_get_lazy_doc(FieldsReader *fr, int doc_num);
654
+ extern HashTable *fr_get_tv(FieldsReader *fr, int doc_num);
655
+ extern TermVector *fr_get_field_tv(FieldsReader *fr, int doc_num,
656
+ int field_num);
711
657
 
712
658
  /****************************************************************************
713
659
  *
714
- * SegmentReader
660
+ * FieldsWriter
715
661
  *
716
662
  ****************************************************************************/
717
663
 
718
- struct SegmentReader {
664
+ typedef struct FieldsWriter
665
+ {
719
666
  FieldInfos *fis;
720
- FieldsReader *fr;
721
- char *segment;
722
- BitVector *deleted_docs;
723
- bool deleted_docs_dirty : 1;
724
- bool undelete_all : 1;
725
- bool norms_dirty : 1;
726
- InStream *freq_in;
727
- InStream *prox_in;
728
- TermInfosReader *tir;
729
- TermVectorsReader *orig_tvr;
730
- thread_key_t thread_tvr;
731
- Array *tvr_bucket;
732
- HshTable *norms;
733
- Store *cfs_store;
734
- uchar *fake_norms;
735
- };
667
+ OutStream *fdt_out;
668
+ OutStream *fdx_out;
669
+ TVField *tv_fields;
670
+ off_t start_ptr;
671
+ } FieldsWriter;
736
672
 
737
- IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner);
738
- IndexReader *sr_open_si(SegmentInfo *si);
673
+ extern FieldsWriter *fw_open(Store *store,
674
+ const char *segment, FieldInfos *fis);
675
+ extern void fw_close(FieldsWriter *fw);
676
+ extern void fw_add_doc(FieldsWriter *fw, Document *doc);
677
+ extern void fw_add_postings(FieldsWriter *fw,
678
+ int field_num,
679
+ PostingList **plists,
680
+ int posting_count,
681
+ Offset *offsets,
682
+ int offset_count);
683
+ extern void fw_write_tv_index(FieldsWriter *fw);
739
684
 
740
685
  /****************************************************************************
741
686
  *
742
- * MultiReader
687
+ * IndexReader
743
688
  *
744
689
  ****************************************************************************/
745
690
 
746
- typedef struct MultiReader {
747
- bool has_deletions : 1;
691
+ #define WRITE_LOCK_NAME "write"
692
+ #define COMMIT_LOCK_NAME "commit"
693
+
694
+ struct IndexReader
695
+ {
696
+ int (*num_docs)(IndexReader *ir);
697
+ int (*max_doc)(IndexReader *ir);
698
+ Document *(*get_doc)(IndexReader *ir, int doc_num);
699
+ LazyDoc *(*get_lazy_doc)(IndexReader *ir, int doc_num);
700
+ uchar *(*get_norms)(IndexReader *ir, int field_num);
701
+ uchar *(*get_norms_into)(IndexReader *ir, int field_num,
702
+ uchar *buf);
703
+ TermEnum *(*terms)(IndexReader *ir, int field_num);
704
+ TermEnum *(*terms_from)(IndexReader *ir, int field_num,
705
+ const char *term);
706
+ int (*doc_freq)(IndexReader *ir, int field_num,
707
+ const char *term);
708
+ TermDocEnum *(*term_docs)(IndexReader *ir);
709
+ TermDocEnum *(*term_positions)(IndexReader *ir);
710
+ TermVector *(*term_vector)(IndexReader *ir, int doc_num,
711
+ const char *field);
712
+ HashTable *(*term_vectors)(IndexReader *ir, int doc_num);
713
+ bool (*is_deleted)(IndexReader *ir, int doc_num);
714
+ bool (*has_deletions)(IndexReader *ir);
715
+ void (*acquire_write_lock)(IndexReader *ir);
716
+ void (*set_norm_i)(IndexReader *ir, int doc_num, int field_num,
717
+ uchar val);
718
+ void (*delete_doc_i)(IndexReader *ir, int doc_num);
719
+ void (*undelete_all_i)(IndexReader *ir);
720
+ void (*commit_i)(IndexReader *ir);
721
+ void (*close_i)(IndexReader *ir);
722
+ int ref_cnt;
723
+ Store *store;
724
+ Lock *write_lock;
725
+ SegmentInfos *sis;
726
+ FieldInfos *fis;
727
+ HashTable *cache;
728
+ HashTable *sort_cache;
729
+ uchar *fake_norms;
730
+ mutex_t mutex;
731
+ bool has_changes : 1;
732
+ bool is_stale : 1;
733
+ bool is_owner : 1;
734
+ };
735
+
736
+ extern IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
737
+ extern IndexReader *ir_open(Store *store);
738
+ extern int ir_get_field_num(IndexReader *ir, const char *field);
739
+ extern bool ir_index_exists(Store *store);
740
+ extern void ir_close(IndexReader *ir);
741
+ extern void ir_commit(IndexReader *ir);
742
+ extern void ir_delete_doc(IndexReader *ir, int doc_num);
743
+ extern void ir_undelete_all(IndexReader *ir);
744
+ extern int ir_doc_freq(IndexReader *ir, const char *field, const char *term);
745
+ extern void ir_set_norm(IndexReader *ir, int doc_num, const char *field,
746
+ uchar val);
747
+ extern uchar *ir_get_norms(IndexReader *ir, const char *field);
748
+ extern uchar *ir_get_norms_into(IndexReader *ir, const char *field, uchar *buf);
749
+ extern void ir_destroy(IndexReader *self);
750
+ extern Document *ir_get_doc_with_term(IndexReader *ir, const char *field,
751
+ const char *term);
752
+ extern TermEnum *ir_terms(IndexReader *ir, const char *field);
753
+ extern TermEnum *ir_terms_from(IndexReader *ir, const char *field,
754
+ const char *t);
755
+ extern TermDocEnum *ir_term_docs_for(IndexReader *ir, const char *field,
756
+ const char *term);
757
+ extern TermDocEnum *ir_term_positions_for(IndexReader *ir, const char *fld,
758
+ const char *t);
759
+ extern void ir_add_cache(IndexReader *ir);
760
+ extern bool ir_is_latest(IndexReader *ir);
761
+
762
+ /****************************************************************************
763
+ * MultiReader
764
+ ****************************************************************************/
765
+
766
+ struct MultiReader {
767
+ IndexReader ir;
748
768
  int max_doc;
749
769
  int num_docs_cache;
750
- int rcnt;
770
+ int r_cnt;
751
771
  int *starts;
752
772
  IndexReader **sub_readers;
753
- HshTable *norms_cache;
754
- } MultiReader;
773
+ HashTable *norms_cache;
774
+ bool has_deletions : 1;
775
+ int **field_num_map;
776
+ };
777
+
778
+ extern int mr_get_field_num(MultiReader *mr, int ir_num, int f_num);
779
+ extern IndexReader *mr_open(IndexReader **sub_readers, const int r_cnt);
755
780
 
756
- IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
757
- int rcnt);
758
781
 
759
782
  /****************************************************************************
760
783
  *
761
- * SegmentMergeInfo
784
+ * Boost
762
785
  *
763
786
  ****************************************************************************/
764
787
 
765
- typedef struct SegmentMergeInfo {
766
- int base;
767
- IndexReader *ir;
768
- TermEnum *te;
769
- TermBuffer *tb;
770
- TermDocEnum *postings;
771
- int *doc_map;
772
- } SegmentMergeInfo;
788
+ typedef struct Boost
789
+ {
790
+ float val;
791
+ int doc_num;
792
+ struct Boost *next;
793
+ } Boost;
773
794
 
774
- SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
775
- void smi_destroy(SegmentMergeInfo *smi);
776
- TermBuffer *smi_next(SegmentMergeInfo *smi);
777
- bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2);
795
+ /****************************************************************************
796
+ *
797
+ * FieldInverter
798
+ *
799
+ ****************************************************************************/
800
+
801
+ typedef struct FieldInverter
802
+ {
803
+ HashTable *plists;
804
+ uchar *norms;
805
+ FieldInfo *fi;
806
+ int length;
807
+ bool is_tokenized : 1;
808
+ bool store_term_vector : 1;
809
+ bool store_offsets : 1;
810
+ bool has_norms : 1;
811
+ } FieldInverter;
778
812
 
779
813
  /****************************************************************************
780
814
  *
781
- * SegmentMerger
815
+ * DocWriter
782
816
  *
783
817
  ****************************************************************************/
784
818
 
785
- typedef struct SegmentMerger {
819
+ #define DW_OFFSET_INIT_CAPA 512
820
+ typedef struct IndexWriter IndexWriter;
821
+
822
+ typedef struct DocWriter
823
+ {
786
824
  Store *store;
787
- char *name;
788
- Array *readers;
825
+ const char *segment;
789
826
  FieldInfos *fis;
790
- OutStream *freq_out;
791
- OutStream *prox_out;
792
- TermInfosWriter *tiw;
793
- Term *terms_buf;
794
- int terms_buf_pointer;
795
- int terms_buf_size;
796
- PriorityQueue *queue;
797
- TermInfo *ti;
798
- int term_index_interval;
799
- OutStream *skip_buffer;
827
+ TermVectorsWriter *tvw;
828
+ FieldsWriter *fw;
829
+ MemoryPool *mp;
830
+ Analyzer *analyzer;
831
+ HashTable *curr_plists;
832
+ HashTable *fields;
833
+ Similarity *similarity;
834
+ Offset *offsets;
835
+ int offsets_size;
836
+ int offsets_capa;
837
+ int doc_num;
838
+ int index_interval;
800
839
  int skip_interval;
801
- int last_skip_doc;
802
- int last_skip_freq_pointer;
803
- int last_skip_prox_pointer;
804
- } SegmentMerger;
805
-
806
- SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
807
- void sm_destroy(SegmentMerger *sm);
808
- void sm_add(SegmentMerger *sm, IndexReader *ir);
809
- int sm_merge(SegmentMerger *sm);
810
- Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
840
+ int max_field_length;
841
+ int max_buffered_docs;
842
+ } DocWriter;
811
843
 
844
+ extern DocWriter *dw_open(IndexWriter *is, const char *segment);
845
+ extern void dw_close(DocWriter *dw);
846
+ extern void dw_add_doc(DocWriter *dw, Document *doc);
847
+ extern void dw_new_segment(DocWriter *dw, char *segment);
812
848
 
813
849
  /****************************************************************************
814
850
  *
@@ -816,35 +852,38 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
816
852
  *
817
853
  ****************************************************************************/
818
854
 
819
- #define WRITE_LOCK_NAME "write"
820
- #define COMMIT_LOCK_NAME "commit"
821
- struct IndexWriter {
855
+ typedef struct DelTerm
856
+ {
857
+ int field_num;
858
+ char *term;
859
+ } DelTerm;
860
+
861
+ struct IndexWriter
862
+ {
863
+ Config config;
822
864
  mutex_t mutex;
823
- HshTable *postings;
824
- FieldInfos *fis;
825
- int merge_factor;
826
- int min_merge_docs;
827
- int max_merge_docs;
828
- int max_field_length;
829
- int term_index_interval;
830
865
  Store *store;
831
866
  Analyzer *analyzer;
832
- Similarity *similarity;
833
867
  SegmentInfos *sis;
834
- Store *ram_store;
868
+ FieldInfos *fis;
869
+ DocWriter *dw;
870
+ Similarity *similarity;
871
+ DelTerm **del_terms;
835
872
  Lock *write_lock;
836
- bool use_compound_file : 1;
837
873
  };
838
874
 
839
- IndexWriter *iw_open(Store *store, Analyzer *analyzer,
840
- bool create);
841
- void iw_flush_ram_segments(IndexWriter *iw);
842
- void iw_close(IndexWriter *iw);
843
- int iw_doc_count(IndexWriter *iw);
844
- void iw_add_doc(IndexWriter *iw, Document *doc);
845
- void iw_optimize(IndexWriter *iw);
846
- void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt);
847
- void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
875
+ extern void index_create(Store *store, FieldInfos *fis);
876
+ extern IndexWriter *iw_open(Store *store, Analyzer *analyzer,
877
+ const Config *config);
878
+ extern void iw_delete_term(IndexWriter *iw, const char *field,
879
+ const char *term);
880
+ extern void iw_close(IndexWriter *iw);
881
+ extern void iw_add_doc(IndexWriter *iw, Document *doc);
882
+ extern int iw_doc_count(IndexWriter *iw);
883
+ extern void iw_commit(IndexWriter *iw);
884
+ extern void iw_optimize(IndexWriter *iw);
885
+ extern void iw_add_readers(IndexWriter *iw, IndexReader **readers,
886
+ const int r_cnt);
848
887
 
849
888
  /****************************************************************************
850
889
  *
@@ -852,16 +891,24 @@ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
852
891
  *
853
892
  ****************************************************************************/
854
893
 
894
+ #define CW_INIT_CAPA 16
895
+ typedef struct CWFileEntry
896
+ {
897
+ char *name;
898
+ off_t dir_offset;
899
+ off_t data_offset;
900
+ } CWFileEntry;
901
+
855
902
  typedef struct CompoundWriter {
856
903
  Store *store;
857
904
  const char *name;
858
905
  HashSet *ids;
859
- Array *file_entries;
860
- bool merged;
906
+ CWFileEntry *file_entries;
861
907
  } CompoundWriter;
862
908
 
863
- CompoundWriter *open_cw(Store *store, char *name);
864
- void cw_add_file(CompoundWriter *cw, char *id);
865
- void cw_close(CompoundWriter *cw);
909
+ extern CompoundWriter *open_cw(Store *store, char *name);
910
+ extern void cw_add_file(CompoundWriter *cw, char *id);
911
+ extern void cw_close(CompoundWriter *cw);
912
+
866
913
 
867
914
  #endif