ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/index.h CHANGED
@@ -1,35 +1,40 @@
1
1
  #ifndef FRT_INDEX_H
2
2
  #define FRT_INDEX_H
3
3
 
4
- #include <limits.h>
5
4
  #include "global.h"
6
- #include "array.h"
7
- #include "bitvector.h"
8
- #include "hashset.h"
9
- #include "priorityqueue.h"
10
- #include "hash.h"
11
- #include "store.h"
12
5
  #include "document.h"
13
6
  #include "analysis.h"
7
+ #include "hash.h"
8
+ #include "hashset.h"
9
+ #include "store.h"
10
+ #include "mem_pool.h"
14
11
  #include "similarity.h"
12
+ #include "bitvector.h"
13
+ #include "priorityqueue.h"
15
14
 
15
+ typedef struct IndexReader IndexReader;
16
+ typedef struct MultiReader MultiReader;
16
17
 
17
- #define SEGMENT_NAME_MAX_LENGTH 100
18
- #define NOT_A_FIELD 0xFFFFFFFF
19
-
20
- typedef struct Config {
21
- int merge_factor;
22
- int min_merge_docs;
23
- int max_merge_docs;
24
- int max_field_length;
25
- int term_index_interval;
26
- } FerretConfig;
18
+ /****************************************************************************
19
+ *
20
+ * Config
21
+ *
22
+ ****************************************************************************/
27
23
 
28
- extern FerretConfig config;
24
+ typedef struct Config
25
+ {
26
+ int chunk_size;
27
+ int max_buffer_memory;
28
+ int index_interval;
29
+ int skip_interval;
30
+ int merge_factor;
31
+ int max_buffered_docs;
32
+ int max_merge_docs;
33
+ int max_field_length;
34
+ bool use_compound_file;
35
+ } Config;
29
36
 
30
- typedef struct IndexReader IndexReader;
31
- typedef struct IndexWriter IndexWriter;
32
- typedef struct SegmentReader SegmentReader;
37
+ extern const Config default_config;
33
38
 
34
39
  /***************************************************************************
35
40
  *
@@ -38,20 +43,18 @@ typedef struct SegmentReader SegmentReader;
38
43
  ***************************************************************************/
39
44
 
40
45
  typedef struct CacheObject {
41
- HshTable *ref_tab1;
42
- HshTable *ref_tab2;
43
- void *ref1;
44
- void *ref2;
45
- void *obj;
46
- void (*destroy)(void *p);
46
+ HashTable *ref_tab1;
47
+ HashTable *ref_tab2;
48
+ void *ref1;
49
+ void *ref2;
50
+ void *obj;
51
+ void (*destroy)(void *p);
47
52
  } CacheObject;
48
53
 
49
- void cache_destroy(CacheObject *co);
50
- CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
51
- void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
52
- unsigned int co_hash(const void *key);
53
- int co_eq(const void *key1, const void *key2);
54
- HshTable *co_hsh_create();
54
+ extern void cache_destroy(CacheObject *co);
55
+ extern CacheObject *co_create(HashTable *ref_tab1, HashTable *ref_tab2,
56
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
57
+ extern HashTable *co_hash_create();
55
58
 
56
59
  /****************************************************************************
57
60
  *
@@ -59,384 +62,310 @@ HshTable *co_hsh_create();
59
62
  *
60
63
  ****************************************************************************/
61
64
 
62
- typedef struct FieldInfo {
63
- char *name;
64
- int number;
65
- bool is_indexed : 1;
66
- bool store_tv : 1;
67
- bool store_offset : 1;
68
- bool store_pos : 1;
69
- bool omit_norms : 1;
70
- } FieldInfo;
71
-
72
- FieldInfo *fi_create(char *name,
73
- int number,
74
- bool is_indexed,
75
- bool store_tv,
76
- bool store_pos,
77
- bool store_offset,
78
- bool omit_norms);
79
- void fi_destroy(FieldInfo *fi);
80
-
81
- /****************************************************************************
82
- *
83
- * FieldInfos
84
- *
85
- ****************************************************************************/
86
-
87
- typedef struct FieldInfos {
88
- HashEntry **by_name;
89
- FieldInfo **by_number;
90
- int fcnt;
91
- } FieldInfos;
65
+ enum StoreValues
66
+ {
67
+ STORE_NO = 0,
68
+ STORE_YES = 1,
69
+ STORE_COMPRESS = 2
70
+ };
92
71
 
93
- FieldInfos *fis_create();
94
- FieldInfos *fis_open(Store *store, char *filename);
95
- void fis_destroy(FieldInfos *fis);
96
- FieldInfo *fis_add(FieldInfos *fis,
97
- char *name,
98
- bool is_indexed,
99
- bool store_tv,
100
- bool store_offset,
101
- bool store_pos,
102
- bool omit_norms);
103
-
104
- void fis_add_fields(FieldInfos *fis,
105
- HashSet *field_names,
106
- bool is_indexed,
107
- bool store_tv,
108
- bool store_offset,
109
- bool store_pos,
110
- bool omit_norms);
111
- bool fis_has_vectors(FieldInfos *fis);
112
- void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
113
- FieldInfos *fis_read(FieldInfos *fis, InStream *is);
114
- FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
115
- ullong fis_get_number(FieldInfos *fis, char *name);
116
- FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
117
- bool fis_reorder_required(FieldInfos *fis, Document *doc);
72
+ enum IndexValues
73
+ {
74
+ INDEX_NO = 0,
75
+ INDEX_YES = 1,
76
+ INDEX_UNTOKENIZED = 3,
77
+ INDEX_YES_OMIT_NORMS = 5,
78
+ INDEX_UNTOKENIZED_OMIT_NORMS = 7
79
+ };
118
80
 
119
- /****************************************************************************
120
- *
121
- * TermBuffer
122
- *
123
- ****************************************************************************/
81
+ enum TermVectorValues
82
+ {
83
+ TERM_VECTOR_NO = 0,
84
+ TERM_VECTOR_YES = 1,
85
+ TERM_VECTOR_WITH_POSITIONS = 3,
86
+ TERM_VECTOR_WITH_OFFSETS = 5,
87
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
88
+ };
124
89
 
125
- typedef struct TermBuffer {
126
- char *field;
127
- char text[MAX_WORD_SIZE];
128
- } TermBuffer;
90
+ #define FI_IS_STORED_BM 0x001
91
+ #define FI_IS_COMPRESSED_BM 0x002
92
+ #define FI_IS_INDEXED_BM 0x004
93
+ #define FI_IS_TOKENIZED_BM 0x008
94
+ #define FI_OMIT_NORMS_BM 0x010
95
+ #define FI_STORE_TERM_VECTOR_BM 0x020
96
+ #define FI_STORE_POSITIONS_BM 0x040
97
+ #define FI_STORE_OFFSETS_BM 0x080
98
+
99
+ typedef struct FieldInfo
100
+ {
101
+ char *name;
102
+ float boost;
103
+ unsigned int bits;
104
+ int number;
105
+ int ref_cnt;
106
+ } FieldInfo;
129
107
 
130
- TermBuffer *tb_create();
131
- void tb_destroy(TermBuffer *tb);
132
- TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
133
- Term *tb_get_term(TermBuffer *tb);
134
- int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
135
- int tb_term_cmp(TermBuffer *tb, Term *t);
136
- TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2);
137
- TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis);
108
+ extern FieldInfo *fi_new(const char *name,
109
+ enum StoreValues store,
110
+ enum IndexValues index,
111
+ enum TermVectorValues term_vector);
112
+ extern char *fi_to_s(FieldInfo *fi);
113
+ extern void fi_deref(FieldInfo *fi);
114
+
115
+ #define fi_is_stored(fi) (((fi)->bits & FI_IS_STORED_BM) != 0)
116
+ #define fi_is_compressed(fi) (((fi)->bits & FI_IS_COMPRESSED_BM) != 0)
117
+ #define fi_is_indexed(fi) (((fi)->bits & FI_IS_INDEXED_BM) != 0)
118
+ #define fi_is_tokenized(fi) (((fi)->bits & FI_IS_TOKENIZED_BM) != 0)
119
+ #define fi_omit_norms(fi) (((fi)->bits & FI_OMIT_NORMS_BM) != 0)
120
+ #define fi_store_term_vector(fi) (((fi)->bits & FI_STORE_TERM_VECTOR_BM) != 0)
121
+ #define fi_store_positions(fi) (((fi)->bits & FI_STORE_POSITIONS_BM) != 0)
122
+ #define fi_store_offsets(fi) (((fi)->bits & FI_STORE_OFFSETS_BM) != 0)
123
+ #define fi_has_norms(fi)\
124
+ (((fi)->bits & (FI_OMIT_NORMS_BM|FI_IS_INDEXED_BM)) == FI_IS_INDEXED_BM)
138
125
 
139
126
  /****************************************************************************
140
127
  *
141
- * TermInfo
128
+ * FieldInfos
142
129
  *
143
130
  ****************************************************************************/
144
131
 
145
- typedef struct TermInfo {
146
- int doc_freq;
147
- int freq_pointer;
148
- int prox_pointer;
149
- int skip_offset;
150
- } TermInfo;
132
+ #define FIELD_INFOS_INIT_CAPA 4
133
+ typedef struct FieldInfos
134
+ {
135
+ int store;
136
+ int index;
137
+ int term_vector;
138
+ int size;
139
+ int capa;
140
+ FieldInfo **fields;
141
+ HashTable *field_dict;
142
+ int ref_cnt;
143
+ } FieldInfos;
151
144
 
152
- TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
153
- TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
154
- void ti_destroy(TermInfo *ti);
155
- TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
156
- TermInfo *ti_clone(TermInfo *other);
157
- int ti_eq(TermInfo *ti1, TermInfo *ti2);
145
+ extern FieldInfos *fis_new(int store, int index, int term_vector);
146
+ extern FieldInfo *fis_add_field(FieldInfos *fis, FieldInfo *fi);
147
+ extern FieldInfo *fis_get_field(FieldInfos *fis, const char *name);
148
+ extern int fis_get_field_num(FieldInfos *fis, const char *name);
149
+ extern FieldInfo *fis_get_or_add_field(FieldInfos *fis, const char *name);
150
+ extern void fis_write(FieldInfos *fis, Store *store);
151
+ extern FieldInfos *fis_read(Store *store);
152
+ extern char *fis_to_s(FieldInfos *fis);
153
+ extern void fis_deref(FieldInfos *fis);
158
154
 
159
155
  /****************************************************************************
160
156
  *
161
- * TermEnum
157
+ * SegmentInfo
162
158
  *
163
159
  ****************************************************************************/
164
160
 
165
- typedef struct TermEnum TermEnum;
166
- struct TermEnum {
167
- void *data;
168
- TermBuffer *(*next)(TermEnum *te);
169
- void (*close)(TermEnum *te);
170
- TermEnum *(*clone)(TermEnum *te);
171
- TermBuffer *tb_curr;
172
- TermBuffer *tb_prev;
173
- TermInfo *ti_curr;
174
- };
175
-
176
- TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
177
-
178
- Term *te_get_term(struct TermEnum *te);
179
- TermInfo *te_get_ti(struct TermEnum *te);
180
-
181
- /* * SegmentTermEnum * */
182
-
183
- typedef struct SegmentTermEnum {
184
- FieldInfos *fis;
185
- int is_index;
186
- InStream *is;
187
- int size;
188
- int pos;
189
- int index_pointer;
190
- int index_interval;
191
- int skip_interval;
192
- int format_m1skip_interval;
193
- int format;
194
- } SegmentTermEnum;
195
-
196
-
197
- TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
198
- TermBuffer *ste_next(struct TermEnum *te);
199
- void ste_close(struct TermEnum *te);
200
-
201
- /* * MultiTermEnum * */
161
+ #define SEGMENT_NAME_MAX_LENGTH 100
202
162
 
203
- typedef struct MultiTermEnum {
204
- int doc_freq;
205
- PriorityQueue *smi_queue;
206
- } MultiTermEnum;
163
+ typedef struct SegmentInfo
164
+ {
165
+ char *name;
166
+ int doc_cnt;
167
+ Store *store;
168
+ } SegmentInfo;
207
169
 
208
- TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *term);
170
+ extern SegmentInfo *si_new(char *name, int doc_cnt, Store *store);
171
+ extern void si_destroy(SegmentInfo *si);
172
+ extern bool si_has_deletions(SegmentInfo *si);
173
+ extern bool si_uses_compound_file(SegmentInfo *si);
174
+ extern bool si_has_separate_norms(SegmentInfo *si);
209
175
 
210
176
  /****************************************************************************
211
177
  *
212
- * TermInfosWriter
178
+ * SegmentInfos
213
179
  *
214
180
  ****************************************************************************/
215
181
 
216
- #define TERM_INFO_FORMAT -2
217
-
218
- typedef struct TermInfosWriter {
219
- int index_interval;
220
- int skip_interval;
221
- int size;
222
- int last_index_pointer;
223
- bool is_index;
224
- OutStream *os;
225
- struct TermInfosWriter *other;
226
- Term *last_term;
227
- TermInfo *last_term_info;
228
- FieldInfos *fis;
229
- char *curr_field;
230
- ullong curr_field_num;
231
- } TermInfosWriter;
182
+ typedef struct SegmentInfos
183
+ {
184
+ f_u64 counter;
185
+ f_u64 version;
186
+ f_u32 format;
187
+ Store *store;
188
+ SegmentInfo **segs;
189
+ int size;
190
+ int capa;
191
+ } SegmentInfos;
232
192
 
233
- TermInfosWriter *tiw_open(Store *store,
234
- char *segment,
235
- FieldInfos *fis,
236
- int interval);
237
- void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti);
238
- void tiw_close(TermInfosWriter *tiw);
193
+ extern SegmentInfos *sis_new();
194
+ extern SegmentInfo *sis_new_segment(SegmentInfos *sis, int dcnt, Store *store);
195
+ extern SegmentInfo *sis_add_si(SegmentInfos *sis, SegmentInfo *si);
196
+ extern void sis_del_at(SegmentInfos *sis, int at);
197
+ extern void sis_del_from_to(SegmentInfos *sis, int from, int to);
198
+ extern void sis_clear(SegmentInfos *sis);
199
+ extern SegmentInfos *sis_read(Store *store);
200
+ extern void sis_write(SegmentInfos *sis, Store *store);
201
+ extern f_u64 sis_read_current_version(Store *store);
202
+ extern void sis_destroy(SegmentInfos *sis);
239
203
 
240
204
  /****************************************************************************
241
205
  *
242
- * TermInfosReader
206
+ * TermInfo
243
207
  *
244
208
  ****************************************************************************/
245
209
 
246
- typedef struct TermInfosReader {
247
- mutex_t mutex;
248
- TermEnum *orig_te;
249
- thread_key_t thread_te;
250
- Array *te_bucket;
251
- TermEnum *index_te;
252
- int size;
253
- int skip_interval;
254
- int index_size;
255
- Term **index_terms;
256
- TermInfo **index_term_infos;
257
- int *index_pointers;
258
- } TermInfosReader;
210
+ typedef struct TermInfo
211
+ {
212
+ int doc_freq;
213
+ off_t frq_ptr;
214
+ off_t prx_ptr;
215
+ off_t skip_offset;
216
+ } TermInfo;
259
217
 
260
- TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis);
261
- void tir_close(TermInfosReader *tir);
262
- Term *tir_get_term(TermInfosReader *tir, int position);
263
- int tir_get_term_pos(TermInfosReader *tir, Term *t);
264
- TermInfo *tir_get_ti(TermInfosReader *tir, Term *t);
218
+ #define ti_set(ti, mdf, mfp, mpp, mso) do {\
219
+ (ti).doc_freq = mdf;\
220
+ (ti).frq_ptr = mfp;\
221
+ (ti).prx_ptr = mpp;\
222
+ (ti).skip_offset = mso;\
223
+ } while (0)
265
224
 
266
225
  /****************************************************************************
267
226
  *
268
- * TVOffsetInfo
227
+ * TermEnum
269
228
  *
270
229
  ****************************************************************************/
271
230
 
272
- typedef struct TVOffsetInfo {
273
- int start;
274
- int end;
275
- } TVOffsetInfo;
276
-
277
- TVOffsetInfo *tvoi_create(int start, int end);
278
- void tvoi_destroy(void *p);
279
-
280
- /****************************************************************************
281
- *
282
- * TVField
283
- *
284
- ****************************************************************************/
231
+ typedef struct TermEnum TermEnum;
285
232
 
286
- typedef struct TVField {
287
- int tvf_pointer;
288
- int number;
289
- unsigned int store_positions : 1;
290
- unsigned int store_offsets : 1;
291
- } TVField;
233
+ struct TermEnum
234
+ {
235
+ char curr_term[MAX_WORD_SIZE];
236
+ char prev_term[MAX_WORD_SIZE];
237
+ TermInfo curr_ti;
238
+ int curr_term_len;
239
+ int field_num;
240
+ TermEnum *(*set_field)(TermEnum *te, int field_num);
241
+ char *(*next)(TermEnum *te);
242
+ char *(*skip_to)(TermEnum *te, const char *term);
243
+ void (*close)(TermEnum *te);
244
+ TermEnum *(*clone)(TermEnum *te);
245
+ };
292
246
 
293
- TVField *tvf_create(int number, int store_positions, int store_offsets);
294
- void tvf_destroy(void *p);
247
+ char *te_get_term(struct TermEnum *te);
248
+ TermInfo *te_get_ti(struct TermEnum *te);
295
249
 
296
250
  /****************************************************************************
297
251
  *
298
- * TVTerm
252
+ * SegmentTermEnum
299
253
  *
300
254
  ****************************************************************************/
301
255
 
302
- typedef struct TVTerm {
303
- int field_num;
304
- char *text;
305
- int freq;
306
- int *positions;
307
- TVOffsetInfo **offsets;
308
- } TVTerm;
256
+ /* * SegmentTermIndex * */
309
257
 
310
- TVTerm *tvt_create(char *text,
311
- int freq,
312
- int *positions,
313
- TVOffsetInfo **offsets);
314
- void tvt_destroy(void *p);
258
+ typedef struct SegmentTermIndex
259
+ {
260
+ off_t index_ptr;
261
+ off_t ptr;
262
+ int index_size;
263
+ int size;
264
+ char **index_terms;
265
+ int *index_term_lens;
266
+ TermInfo *index_term_infos;
267
+ off_t *index_ptrs;
268
+ } SegmentTermIndex;
315
269
 
316
- /****************************************************************************
317
- *
318
- * TermVector
319
- *
320
- ****************************************************************************/
270
+ /* * SegmentFieldIndex * */
321
271
 
322
- typedef struct TermVector {
323
- char *field;
324
- char **terms;
325
- int tcnt;
326
- int *freqs;
327
- int **positions;
328
- TVOffsetInfo ***offsets;
329
- } TermVector;
272
+ typedef struct SegmentTermEnum SegmentTermEnum;
330
273
 
331
- TermVector *tv_create(const char *field,
332
- char **terms,
333
- int tcnt,
334
- int *freqs,
335
- int **positions,
336
- TVOffsetInfo ***offsets);
337
- void tv_destroy(TermVector *tv);
274
+ typedef struct SegmentFieldIndex
275
+ {
276
+ mutex_t mutex;
277
+ int skip_interval;
278
+ int index_interval;
279
+ off_t index_ptr;
280
+ TermEnum *index_te;
281
+ HashTable *field_dict;
282
+ } SegmentFieldIndex;
338
283
 
339
- /****************************************************************************
340
- *
341
- * TermVectorsWriter
342
- *
343
- ****************************************************************************/
284
+ extern SegmentFieldIndex *sfi_open(Store *store, const char *segment);
285
+ extern void sfi_close(SegmentFieldIndex *sfi);
344
286
 
345
- #define STORE_POSITIONS_WITH_TERMVECTOR 0x1
346
- #define STORE_OFFSET_WITH_TERMVECTOR 0x2
347
287
 
348
- #define FORMAT_VERSION 2
349
- #define FORMAT_SIZE 4
288
+ /* * SegmentTermEnum * */
289
+ struct SegmentTermEnum
290
+ {
291
+ TermEnum te;
292
+ InStream *is;
293
+ int size;
294
+ int pos;
295
+ int skip_interval;
296
+ SegmentFieldIndex *sfi;
297
+ };
350
298
 
351
- #define TVX_EXTENSION ".tvx"
352
- #define TVD_EXTENSION ".tvd"
353
- #define TVF_EXTENSION ".tvf"
299
+ extern void ste_close(TermEnum *te);
300
+ extern TermEnum *ste_clone(TermEnum *te);
301
+ extern TermEnum *ste_new(InStream *is, SegmentFieldIndex *sfi);
354
302
 
355
- typedef struct TermVectorsWriter {
356
- TVField *curr_field;
357
- int curr_doc_pointer;
358
- OutStream *tvx;
359
- OutStream *tvd;
360
- OutStream *tvf;
361
- FieldInfos *fis;
362
- TVField **fields;
363
- int fcnt;
364
- int fsize;
365
- TVTerm **terms;
366
- int tcnt;
367
- int tsize;
368
- } TermVectorsWriter;
369
-
370
- TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
371
- void tvw_close(TermVectorsWriter *tvw);
372
- void tvw_open_doc(TermVectorsWriter *tvw);
373
- void tvw_close_doc(TermVectorsWriter *tvw);
374
- void tvw_open_field(TermVectorsWriter *tvw, char *field);
375
- void tvw_close_field(TermVectorsWriter *tvw);
376
- void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
377
- void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
303
+ /* * MultiTermEnum * */
378
304
 
305
+ extern TermEnum *mte_new(MultiReader *mr, int field_num, const char *term);
379
306
 
380
307
  /****************************************************************************
381
308
  *
382
- * TermVectorsReader
309
+ * TermInfosReader
383
310
  *
384
311
  ****************************************************************************/
385
312
 
386
- typedef struct TermVectorsReader {
387
- int size;
388
- InStream *tvx;
389
- InStream *tvd;
390
- InStream *tvf;
391
- FieldInfos *fis;
392
- int tvd_format;
393
- int tvf_format;
394
- } TermVectorsReader;
395
-
396
- TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
397
- TermVectorsReader *tvr_clone(TermVectorsReader *orig);
398
- void tvr_close(TermVectorsReader *tvr);
399
- TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
400
- char *field, int tvf_pointer);
401
- Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
402
- TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
313
+ #define TE_BUCKET_INIT_CAPA 1
403
314
 
404
- /****************************************************************************
405
- *
406
- * FieldsReader
407
- *
408
- ****************************************************************************/
409
-
410
- typedef struct FieldsReader {
411
- int len;
412
- FieldInfos *fis;
413
- InStream *fields_in;
414
- InStream *index_in;
415
- } FieldsReader;
315
+ typedef struct TermInfosReader
316
+ {
317
+ thread_key_t thread_te;
318
+ void **te_bucket;
319
+ TermEnum *orig_te;
320
+ int field_num;
321
+ } TermInfosReader;
416
322
 
417
- FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
418
- void fr_close(FieldsReader *fr);
419
- Document *fr_get_doc(FieldsReader *fr, int doc_num);
323
+ extern TermInfosReader *tir_open(Store *store,
324
+ SegmentFieldIndex *sfi,
325
+ const char *segment);
326
+ extern TermInfosReader *tir_set_field(TermInfosReader *tir, int field_num);
327
+ extern TermInfo *tir_get_ti(TermInfosReader *tir, const char *term);
328
+ extern char *tir_get_term(TermInfosReader *tir, int pos);
329
+ extern void tir_close(TermInfosReader *tir);
420
330
 
421
331
  /****************************************************************************
422
332
  *
423
- * FieldsWriter
333
+ * TermInfosWriter
424
334
  *
425
335
  ****************************************************************************/
426
336
 
427
- #define FIELD_IS_TOKENIZED 0X1
428
- #define FIELD_IS_BINARY 0X2
429
- #define FIELD_IS_COMPRESSED 0X4
337
+ #define INDEX_INTERVAL 128
338
+ #define SKIP_INTERVAL 16
430
339
 
431
- typedef struct FieldsWriter {
432
- FieldInfos *fis;
433
- OutStream *fields_out;
434
- OutStream *index_out;
435
- } FieldsWriter;
340
+ typedef struct TermWriter
341
+ {
342
+ int counter;
343
+ const char *last_term;
344
+ TermInfo last_term_info;
345
+ OutStream *os;
346
+ } TermWriter;
347
+
348
+ typedef struct TermInfosWriter
349
+ {
350
+ int field_count;
351
+ int index_interval;
352
+ int skip_interval;
353
+ off_t last_index_ptr;
354
+ OutStream *tfx_out;
355
+ TermWriter *tix_writer;
356
+ TermWriter *tis_writer;
357
+ } TermInfosWriter;
436
358
 
437
- FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
438
- void fw_close(FieldsWriter *fw);
439
- void fw_add_doc(FieldsWriter *fw, Document *doc);
359
+ extern TermInfosWriter *tiw_open(Store *store,
360
+ const char *segment,
361
+ int index_interval,
362
+ int skip_interval);
363
+ extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
364
+ extern void tiw_add(TermInfosWriter *tiw,
365
+ const char *term,
366
+ int t_len,
367
+ TermInfo *ti);
368
+ extern void tiw_close(TermInfosWriter *tiw);
440
369
 
441
370
  /****************************************************************************
442
371
  *
@@ -445,9 +374,10 @@ void fw_add_doc(FieldsWriter *fw, Document *doc);
445
374
  ****************************************************************************/
446
375
 
447
376
  typedef struct TermDocEnum TermDocEnum;
448
- struct TermDocEnum {
449
- void *data;
450
- void (*seek)(TermDocEnum *tde, Term *term);
377
+ struct TermDocEnum
378
+ {
379
+ void (*seek)(TermDocEnum *tde, int field_num, const char *term);
380
+ void (*seek_te)(TermDocEnum *tde, TermEnum *te);
451
381
  int (*doc_num)(TermDocEnum *tde);
452
382
  int (*freq)(TermDocEnum *tde);
453
383
  bool (*next)(TermDocEnum *tde);
@@ -460,71 +390,72 @@ struct TermDocEnum {
460
390
  /* * SegmentTermDocEnum * */
461
391
 
462
392
  typedef struct SegmentTermDocEnum SegmentTermDocEnum;
463
-
464
- struct SegmentTermDocEnum {
465
- SegmentReader *parent;
466
- InStream *freq_in;
393
+ struct SegmentTermDocEnum
394
+ {
395
+ TermDocEnum tde;
396
+ void (*seek_prox)(SegmentTermDocEnum *stde, int prx_ptr);
397
+ void (*skip_prox)(SegmentTermDocEnum *stde);
398
+ TermInfosReader *tir;
399
+ InStream *frq_in;
400
+ InStream *prx_in;
401
+ InStream *skip_in;
402
+ BitVector *deleted_docs;
467
403
  int count; /* number of docs for this term skipped */
468
404
  int doc_freq; /* number of doc this term appears in */
469
- BitVector *deleted_docs;
470
405
  int doc_num;
471
406
  int freq;
472
- int skip_interval;
473
407
  int num_skips;
408
+ int skip_interval;
474
409
  int skip_count;
475
- InStream *skip_in;
476
410
  int skip_doc;
477
- int freq_pointer;
478
- int prox_pointer;
479
- int skip_pointer;
480
- unsigned int have_skipped : 1;
481
- void (*skip_prox)(SegmentTermDocEnum *stde);
482
- InStream *prox_in;
483
- int prox_cnt;
411
+ int frq_ptr;
412
+ int prx_ptr;
413
+ int skip_ptr;
414
+ int prx_cnt;
484
415
  int position;
485
- void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
416
+ bool have_skipped : 1;
486
417
  };
487
418
 
488
- TermDocEnum *stde_create(IndexReader *ir);
489
- void stde_seek_ti(TermDocEnum *tde, TermInfo *ti);
419
+ extern TermDocEnum *stde_new(TermInfosReader *tir, InStream *frq_in,
420
+ BitVector *deleted_docs, int skip_interval);
490
421
 
491
- /* * SegmentTermPosEnum * */
492
- TermDocEnum *stpe_create(IndexReader *ir);
493
-
494
- /* * MultiTermDocEnum * */
495
- typedef struct MultiTermDocEnum MultiTermDocEnum;
496
- struct MultiTermDocEnum {
497
- IndexReader **irs;
498
- int *starts;
499
- int ir_cnt;
500
- Term *term;
501
- int base;
502
- int pointer;
503
- TermDocEnum **irs_tde;
504
- TermDocEnum *curr_tde;
505
- TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
506
- };
422
+ /* * SegmentTermPosEnum * */
423
+ extern TermDocEnum *stpe_new(TermInfosReader *tir, InStream *frq_in,
424
+ InStream *prx_in, BitVector *deleted_docs,
425
+ int skip_interval);
507
426
 
508
- TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
427
+ /****************************************************************************
428
+ * MultipleTermDocPosEnum
429
+ ****************************************************************************/
509
430
 
510
- /* * MultiTermPosEnum * */
511
- TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
431
+ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
432
+ int t_cnt);
512
433
 
513
434
  /****************************************************************************
514
- * MultipleTermDocPosEnum
435
+ *
436
+ * Offset
437
+ *
515
438
  ****************************************************************************/
516
439
 
517
- #define MTDPE_POS_QUEUE_INIT_CAPA 8
518
- typedef struct {
519
- int doc_num;
520
- int freq;
521
- PriorityQueue *pq;
522
- int *pos_queue;
523
- int pos_queue_index;
524
- int pos_queue_capa;
525
- } MultipleTermDocPosEnum;
440
+ typedef struct Offset
441
+ {
442
+ int start;
443
+ int end;
444
+ } Offset;
526
445
 
527
- TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
446
+ extern Offset *offset_new(int start, int end);
447
+
448
+ /****************************************************************************
449
+ *
450
+ * Occurence
451
+ *
452
+ ****************************************************************************/
453
+
454
+ typedef struct Occurence
455
+ {
456
+ struct Occurence *next;
457
+ int pos;
458
+ } Occurence;
528
459
 
529
460
  /****************************************************************************
530
461
  *
@@ -532,283 +463,388 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
532
463
  *
533
464
  ****************************************************************************/
534
465
 
535
- typedef struct Posting {
536
- Term *term;
466
+ typedef struct Posting
467
+ {
537
468
  int freq;
538
- int size;
539
- int *positions;
540
- TVOffsetInfo **offsets;
469
+ int doc_num;
470
+ Occurence *first_occ;
471
+ struct Posting *next;
541
472
  } Posting;
542
473
 
543
- Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
544
- void p_destroy(Posting *self);
545
- void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset);
474
+ extern inline Posting *p_new(MemoryPool *mp, int doc_num, int pos);
546
475
 
476
+ /****************************************************************************
477
+ *
478
+ * PostingList
479
+ *
480
+ ****************************************************************************/
481
+
482
+ typedef struct PostingList
483
+ {
484
+ const char *term;
485
+ int term_len;
486
+ Posting *first;
487
+ Posting *last;
488
+ Occurence *last_occ;
489
+ } PostingList;
490
+
491
+ extern PostingList *pl_new(MemoryPool *mp, const char *term,
492
+ int term_len, Posting *p);
493
+ extern void pl_add_occ(MemoryPool *mp, PostingList *pl, int pos);
547
494
 
548
495
  /****************************************************************************
549
496
  *
550
- * DocumentWriter
497
+ * TVField
551
498
  *
552
499
  ****************************************************************************/
553
500
 
554
- typedef struct DocumentWriter {
555
- Store *store;
556
- Analyzer *analyzer;
557
- Similarity *similarity;
558
- HshTable *postingtable;
559
- int pcnt;
560
- FieldInfos *fis;
561
- float *field_boosts;
562
- int *field_lengths;
563
- int *field_positions;
564
- int *field_offsets;
565
- int max_field_length;
566
- int term_index_interval;
567
- } DocumentWriter;
501
+ typedef struct TVField
502
+ {
503
+ int field_num;
504
+ int size;
505
+ } TVField;
506
+
507
+ /****************************************************************************
508
+ *
509
+ * TVTerm
510
+ *
511
+ ****************************************************************************/
568
512
 
569
- DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
570
- Similarity *similarity, int max_field_length, int term_index_interval);
571
- void dw_close(DocumentWriter *dw);
572
- void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
513
+ typedef struct TVTerm
514
+ {
515
+ char *text;
516
+ int freq;
517
+ int *positions;
518
+ } TVTerm;
573
519
 
574
520
  /****************************************************************************
575
521
  *
576
- * SegmentInfo
522
+ * TermVector
577
523
  *
578
524
  ****************************************************************************/
579
525
 
580
- typedef struct SegmentInfo {
581
- char *name;
582
- int doc_cnt;
583
- Store *store;
584
- } SegmentInfo;
526
+ typedef struct TermVector
527
+ {
528
+ int field_num;
529
+ char *field;
530
+ int term_cnt;
531
+ TVTerm *terms;
532
+ int offset_cnt;
533
+ Offset *offsets;
534
+ } TermVector;
585
535
 
586
- SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
587
- void si_destroy(SegmentInfo *si);
588
- bool si_has_deletions(SegmentInfo *si);
589
- bool si_uses_compound_file(SegmentInfo *si);
590
- bool si_has_separate_norms(SegmentInfo *si);
536
+ extern void tv_destroy(TermVector *tv);
537
+ extern int tv_get_tv_term_index(TermVector *tv, const char *term);
538
+ extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term);
591
539
 
592
540
  /****************************************************************************
593
541
  *
594
- * SegmentInfos
542
+ * TermVectorsWriter
595
543
  *
596
544
  ****************************************************************************/
597
545
 
598
- typedef struct SegmentInfos {
599
- Store *store;
600
- SegmentInfo **segs;
601
- int scnt;
602
- int size;
603
- int counter;
604
- int version;
605
- int format;
606
- } SegmentInfos;
546
+ #define TV_FIELD_INIT_CAPA 8
547
+
548
+ typedef struct TermVectorsWriter
549
+ {
550
+ OutStream *tvx_out;
551
+ OutStream *tvd_out;
552
+ FieldInfos *fis;
553
+ TVField *fields;
554
+ off_t tvd_ptr;
555
+ } TermVectorsWriter;
607
556
 
608
- SegmentInfos *sis_create();
609
- void sis_destroy(SegmentInfos *sis);
610
- void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
611
- void sis_del_at(SegmentInfos *sis, int at);
612
- void sis_del_from_to(SegmentInfos *sis, int from, int to);
613
- void sis_clear(SegmentInfos *sis);
614
- void sis_read(SegmentInfos *sis, Store *store);
615
- void sis_write(SegmentInfos *sis, Store *store);
616
- int sis_read_current_version(Store *store);
557
+ extern TermVectorsWriter *tvw_open(Store *store,
558
+ const char *segment,
559
+ FieldInfos *fis);
560
+ extern void tvw_open_doc(TermVectorsWriter *tvw);
561
+ extern void tvw_close_doc(TermVectorsWriter *tvw);
562
+ extern void tvw_add_postings(TermVectorsWriter *tvw,
563
+ int field_num,
564
+ PostingList **plists,
565
+ int posting_count,
566
+ Offset *offsets,
567
+ int offset_count);
568
+ extern void tvw_close(TermVectorsWriter *tvw);
617
569
 
618
570
  /****************************************************************************
619
571
  *
620
- * IndexReader
572
+ * TermVectorsReader
621
573
  *
622
574
  ****************************************************************************/
623
575
 
624
- enum FIELD_TYPE {
625
- /* all fields */
626
- IR_ALL,
627
- /* all indexed fields */
628
- IR_INDEXED,
629
- /* all fields which are not indexed */
630
- IR_UNINDEXED,
631
- /* all fields which are indexed with termvectors enabled */
632
- IR_INDEXED_WITH_TERM_VECTOR,
633
- /* all fields which are indexed but don't have termvectors enabled */
634
- IR_INDEXED_NO_TERM_VECTOR,
635
- /* all fields where termvectors are enabled. Please note that only standard */
636
- /* termvector fields are returned */
637
- IR_TERM_VECTOR,
638
- /* all fields with termvectors with positions enabled */
639
- IR_TERM_VECTOR_WITH_POSITION,
640
- /* all fields where termvectors with offset position are set */
641
- IR_TERM_VECTOR_WITH_OFFSET,
642
- /* all fields where termvectors with offset and position values set */
643
- IR_TERM_VECTOR_WITH_POSITION_OFFSET
644
- };
576
+ typedef struct TermVectorsReader
577
+ {
578
+ int size;
579
+ InStream *tvx_in;
580
+ InStream *tvd_in;
581
+ FieldInfos *fis;
582
+ } TermVectorsReader;
645
583
 
646
- struct IndexReader {
647
- mutex_t mutex;
648
- HshTable *cache;
649
- HshTable *sort_cache;
650
- void *data;
651
- Store *store;
652
- Lock *write_lock;
653
- SegmentInfos *sis;
654
- bool has_changes : 1;
655
- bool is_stale : 1;
656
- bool is_owner : 1;
657
- TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
658
- Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
659
- int (*num_docs)(IndexReader *ir);
660
- int (*max_doc)(IndexReader *ir);
661
- Document *(*get_doc)(IndexReader *ir, int doc_num);
662
- uchar *(*get_norms)(IndexReader *ir, char *field);
663
- uchar *(*get_norms_always)(IndexReader *ir, char *field);
664
- void (*do_set_norm)(IndexReader *ir, int doc_num, char *field,
665
- uchar val);
666
- void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf,
667
- int offset);
668
- TermEnum *(*terms)(IndexReader *ir);
669
- TermEnum *(*terms_from)(IndexReader *ir, Term *term);
670
- int (*doc_freq)(IndexReader *ir, Term *t);
671
- TermDocEnum *(*term_docs)(IndexReader *ir);
672
- TermDocEnum *(*term_positions)(IndexReader *ir);
673
- void (*do_delete_doc)(IndexReader *ir, int doc_num);
674
- void (*do_undelete_all)(IndexReader *ir);
675
- bool (*is_deleted)(IndexReader *ir, int doc_num);
676
- bool (*has_deletions)(IndexReader *ir);
677
- bool (*has_norms)(IndexReader *ir, char *field);
678
- HashSet *(*get_field_names)(IndexReader *ir, int field_type);
679
- void (*do_commit)(IndexReader *ir);
680
- void (*do_close)(IndexReader *ir);
681
- void (*acquire_write_lock)(IndexReader *ir);
584
+ extern TermVectorsReader *tvr_open(Store *store,
585
+ const char *segment,
586
+ FieldInfos *fis);
587
+ extern TermVectorsReader *tvr_clone(TermVectorsReader *orig);
588
+ extern void tvr_close(TermVectorsReader *tvr);
589
+ extern HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
590
+ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
591
+ int doc_num,
592
+ int field_num);
593
+
594
+ /****************************************************************************
595
+ *
596
+ * LazyDoc
597
+ *
598
+ ****************************************************************************/
599
+
600
+ /* * * LazyDocField * * */
601
+ typedef struct LazyDocFieldData
602
+ {
603
+ int start;
604
+ int length;
605
+ char *text;
606
+ } LazyDocFieldData;
607
+
608
+ typedef struct LazyDoc LazyDoc;
609
+ typedef struct LazyDocField
610
+ {
611
+ char *name;
612
+ int size; /* number of data elements */
613
+ LazyDocFieldData *data;
614
+ int len; /* length of data elements concatenated */
615
+ LazyDoc *doc;
616
+ } LazyDocField;
617
+
618
+ extern char *lazy_df_get_data(LazyDocField *self, int i);
619
+ extern void lazy_df_get_bytes(LazyDocField *self, char *buf,
620
+ int start, int len);
621
+
622
+ /* * * LazyDoc * * */
623
+ struct LazyDoc
624
+ {
625
+ HashTable *field_dict;
626
+ int size;
627
+ LazyDocField **fields;
628
+ InStream *fields_in;
682
629
  };
683
630
 
684
- IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
685
- IndexReader *ir_open(Store *store);
686
- bool ir_index_exists(Store *store);
687
- void ir_close(IndexReader *ir);
688
- void ir_commit(IndexReader *ir);
689
- void ir_delete_doc(IndexReader *ir, int doc_num);
690
- void ir_undelete_all(IndexReader *ir);
691
- void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
692
- void ir_destroy(IndexReader *self);
693
- Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
694
- TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
695
- TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
696
- void ir_add_cache(IndexReader *ir);
697
- bool ir_is_latest(IndexReader *ir);
631
+ extern void lazy_doc_close(LazyDoc *self);
698
632
 
699
633
  /****************************************************************************
700
634
  *
701
- * Norm
635
+ * FieldsReader
702
636
  *
703
637
  ****************************************************************************/
704
638
 
705
- typedef struct Norm {
706
- bool is_dirty : 1;
707
- int field_num;
708
- InStream *is;
709
- uchar *bytes;
710
- } Norm;
639
+ typedef struct FieldsReader
640
+ {
641
+ int size;
642
+ FieldInfos *fis;
643
+ Store *store;
644
+ InStream *fdx_in;
645
+ InStream *fdt_in;
646
+ } FieldsReader;
647
+
648
+ extern FieldsReader *fr_open(Store *store,
649
+ const char *segment, FieldInfos *fis);
650
+ extern FieldsReader *fr_clone(FieldsReader *orig);
651
+ extern void fr_close(FieldsReader *fr);
652
+ extern Document *fr_get_doc(FieldsReader *fr, int doc_num);
653
+ extern LazyDoc *fr_get_lazy_doc(FieldsReader *fr, int doc_num);
654
+ extern HashTable *fr_get_tv(FieldsReader *fr, int doc_num);
655
+ extern TermVector *fr_get_field_tv(FieldsReader *fr, int doc_num,
656
+ int field_num);
711
657
 
712
658
  /****************************************************************************
713
659
  *
714
- * SegmentReader
660
+ * FieldsWriter
715
661
  *
716
662
  ****************************************************************************/
717
663
 
718
- struct SegmentReader {
664
+ typedef struct FieldsWriter
665
+ {
719
666
  FieldInfos *fis;
720
- FieldsReader *fr;
721
- char *segment;
722
- BitVector *deleted_docs;
723
- bool deleted_docs_dirty : 1;
724
- bool undelete_all : 1;
725
- bool norms_dirty : 1;
726
- InStream *freq_in;
727
- InStream *prox_in;
728
- TermInfosReader *tir;
729
- TermVectorsReader *orig_tvr;
730
- thread_key_t thread_tvr;
731
- Array *tvr_bucket;
732
- HshTable *norms;
733
- Store *cfs_store;
734
- uchar *fake_norms;
735
- };
667
+ OutStream *fdt_out;
668
+ OutStream *fdx_out;
669
+ TVField *tv_fields;
670
+ off_t start_ptr;
671
+ } FieldsWriter;
736
672
 
737
- IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner);
738
- IndexReader *sr_open_si(SegmentInfo *si);
673
+ extern FieldsWriter *fw_open(Store *store,
674
+ const char *segment, FieldInfos *fis);
675
+ extern void fw_close(FieldsWriter *fw);
676
+ extern void fw_add_doc(FieldsWriter *fw, Document *doc);
677
+ extern void fw_add_postings(FieldsWriter *fw,
678
+ int field_num,
679
+ PostingList **plists,
680
+ int posting_count,
681
+ Offset *offsets,
682
+ int offset_count);
683
+ extern void fw_write_tv_index(FieldsWriter *fw);
739
684
 
740
685
  /****************************************************************************
741
686
  *
742
- * MultiReader
687
+ * IndexReader
743
688
  *
744
689
  ****************************************************************************/
745
690
 
746
- typedef struct MultiReader {
747
- bool has_deletions : 1;
691
+ #define WRITE_LOCK_NAME "write"
692
+ #define COMMIT_LOCK_NAME "commit"
693
+
694
+ struct IndexReader
695
+ {
696
+ int (*num_docs)(IndexReader *ir);
697
+ int (*max_doc)(IndexReader *ir);
698
+ Document *(*get_doc)(IndexReader *ir, int doc_num);
699
+ LazyDoc *(*get_lazy_doc)(IndexReader *ir, int doc_num);
700
+ uchar *(*get_norms)(IndexReader *ir, int field_num);
701
+ uchar *(*get_norms_into)(IndexReader *ir, int field_num,
702
+ uchar *buf);
703
+ TermEnum *(*terms)(IndexReader *ir, int field_num);
704
+ TermEnum *(*terms_from)(IndexReader *ir, int field_num,
705
+ const char *term);
706
+ int (*doc_freq)(IndexReader *ir, int field_num,
707
+ const char *term);
708
+ TermDocEnum *(*term_docs)(IndexReader *ir);
709
+ TermDocEnum *(*term_positions)(IndexReader *ir);
710
+ TermVector *(*term_vector)(IndexReader *ir, int doc_num,
711
+ const char *field);
712
+ HashTable *(*term_vectors)(IndexReader *ir, int doc_num);
713
+ bool (*is_deleted)(IndexReader *ir, int doc_num);
714
+ bool (*has_deletions)(IndexReader *ir);
715
+ void (*acquire_write_lock)(IndexReader *ir);
716
+ void (*set_norm_i)(IndexReader *ir, int doc_num, int field_num,
717
+ uchar val);
718
+ void (*delete_doc_i)(IndexReader *ir, int doc_num);
719
+ void (*undelete_all_i)(IndexReader *ir);
720
+ void (*commit_i)(IndexReader *ir);
721
+ void (*close_i)(IndexReader *ir);
722
+ int ref_cnt;
723
+ Store *store;
724
+ Lock *write_lock;
725
+ SegmentInfos *sis;
726
+ FieldInfos *fis;
727
+ HashTable *cache;
728
+ HashTable *sort_cache;
729
+ uchar *fake_norms;
730
+ mutex_t mutex;
731
+ bool has_changes : 1;
732
+ bool is_stale : 1;
733
+ bool is_owner : 1;
734
+ };
735
+
736
+ extern IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
737
+ extern IndexReader *ir_open(Store *store);
738
+ extern int ir_get_field_num(IndexReader *ir, const char *field);
739
+ extern bool ir_index_exists(Store *store);
740
+ extern void ir_close(IndexReader *ir);
741
+ extern void ir_commit(IndexReader *ir);
742
+ extern void ir_delete_doc(IndexReader *ir, int doc_num);
743
+ extern void ir_undelete_all(IndexReader *ir);
744
+ extern int ir_doc_freq(IndexReader *ir, const char *field, const char *term);
745
+ extern void ir_set_norm(IndexReader *ir, int doc_num, const char *field,
746
+ uchar val);
747
+ extern uchar *ir_get_norms(IndexReader *ir, const char *field);
748
+ extern uchar *ir_get_norms_into(IndexReader *ir, const char *field, uchar *buf);
749
+ extern void ir_destroy(IndexReader *self);
750
+ extern Document *ir_get_doc_with_term(IndexReader *ir, const char *field,
751
+ const char *term);
752
+ extern TermEnum *ir_terms(IndexReader *ir, const char *field);
753
+ extern TermEnum *ir_terms_from(IndexReader *ir, const char *field,
754
+ const char *t);
755
+ extern TermDocEnum *ir_term_docs_for(IndexReader *ir, const char *field,
756
+ const char *term);
757
+ extern TermDocEnum *ir_term_positions_for(IndexReader *ir, const char *fld,
758
+ const char *t);
759
+ extern void ir_add_cache(IndexReader *ir);
760
+ extern bool ir_is_latest(IndexReader *ir);
761
+
762
+ /****************************************************************************
763
+ * MultiReader
764
+ ****************************************************************************/
765
+
766
+ struct MultiReader {
767
+ IndexReader ir;
748
768
  int max_doc;
749
769
  int num_docs_cache;
750
- int rcnt;
770
+ int r_cnt;
751
771
  int *starts;
752
772
  IndexReader **sub_readers;
753
- HshTable *norms_cache;
754
- } MultiReader;
773
+ HashTable *norms_cache;
774
+ bool has_deletions : 1;
775
+ int **field_num_map;
776
+ };
777
+
778
+ extern int mr_get_field_num(MultiReader *mr, int ir_num, int f_num);
779
+ extern IndexReader *mr_open(IndexReader **sub_readers, const int r_cnt);
755
780
 
756
- IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
757
- int rcnt);
758
781
 
759
782
  /****************************************************************************
760
783
  *
761
- * SegmentMergeInfo
784
+ * Boost
762
785
  *
763
786
  ****************************************************************************/
764
787
 
765
- typedef struct SegmentMergeInfo {
766
- int base;
767
- IndexReader *ir;
768
- TermEnum *te;
769
- TermBuffer *tb;
770
- TermDocEnum *postings;
771
- int *doc_map;
772
- } SegmentMergeInfo;
788
+ typedef struct Boost
789
+ {
790
+ float val;
791
+ int doc_num;
792
+ struct Boost *next;
793
+ } Boost;
773
794
 
774
- SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
775
- void smi_destroy(SegmentMergeInfo *smi);
776
- TermBuffer *smi_next(SegmentMergeInfo *smi);
777
- bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2);
795
+ /****************************************************************************
796
+ *
797
+ * FieldInverter
798
+ *
799
+ ****************************************************************************/
800
+
801
+ typedef struct FieldInverter
802
+ {
803
+ HashTable *plists;
804
+ uchar *norms;
805
+ FieldInfo *fi;
806
+ int length;
807
+ bool is_tokenized : 1;
808
+ bool store_term_vector : 1;
809
+ bool store_offsets : 1;
810
+ bool has_norms : 1;
811
+ } FieldInverter;
778
812
 
779
813
  /****************************************************************************
780
814
  *
781
- * SegmentMerger
815
+ * DocWriter
782
816
  *
783
817
  ****************************************************************************/
784
818
 
785
- typedef struct SegmentMerger {
819
+ #define DW_OFFSET_INIT_CAPA 512
820
+ typedef struct IndexWriter IndexWriter;
821
+
822
+ typedef struct DocWriter
823
+ {
786
824
  Store *store;
787
- char *name;
788
- Array *readers;
825
+ const char *segment;
789
826
  FieldInfos *fis;
790
- OutStream *freq_out;
791
- OutStream *prox_out;
792
- TermInfosWriter *tiw;
793
- Term *terms_buf;
794
- int terms_buf_pointer;
795
- int terms_buf_size;
796
- PriorityQueue *queue;
797
- TermInfo *ti;
798
- int term_index_interval;
799
- OutStream *skip_buffer;
827
+ TermVectorsWriter *tvw;
828
+ FieldsWriter *fw;
829
+ MemoryPool *mp;
830
+ Analyzer *analyzer;
831
+ HashTable *curr_plists;
832
+ HashTable *fields;
833
+ Similarity *similarity;
834
+ Offset *offsets;
835
+ int offsets_size;
836
+ int offsets_capa;
837
+ int doc_num;
838
+ int index_interval;
800
839
  int skip_interval;
801
- int last_skip_doc;
802
- int last_skip_freq_pointer;
803
- int last_skip_prox_pointer;
804
- } SegmentMerger;
805
-
806
- SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
807
- void sm_destroy(SegmentMerger *sm);
808
- void sm_add(SegmentMerger *sm, IndexReader *ir);
809
- int sm_merge(SegmentMerger *sm);
810
- Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
840
+ int max_field_length;
841
+ int max_buffered_docs;
842
+ } DocWriter;
811
843
 
844
+ extern DocWriter *dw_open(IndexWriter *is, const char *segment);
845
+ extern void dw_close(DocWriter *dw);
846
+ extern void dw_add_doc(DocWriter *dw, Document *doc);
847
+ extern void dw_new_segment(DocWriter *dw, char *segment);
812
848
 
813
849
  /****************************************************************************
814
850
  *
@@ -816,35 +852,38 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
816
852
  *
817
853
  ****************************************************************************/
818
854
 
819
- #define WRITE_LOCK_NAME "write"
820
- #define COMMIT_LOCK_NAME "commit"
821
- struct IndexWriter {
855
+ typedef struct DelTerm
856
+ {
857
+ int field_num;
858
+ char *term;
859
+ } DelTerm;
860
+
861
+ struct IndexWriter
862
+ {
863
+ Config config;
822
864
  mutex_t mutex;
823
- HshTable *postings;
824
- FieldInfos *fis;
825
- int merge_factor;
826
- int min_merge_docs;
827
- int max_merge_docs;
828
- int max_field_length;
829
- int term_index_interval;
830
865
  Store *store;
831
866
  Analyzer *analyzer;
832
- Similarity *similarity;
833
867
  SegmentInfos *sis;
834
- Store *ram_store;
868
+ FieldInfos *fis;
869
+ DocWriter *dw;
870
+ Similarity *similarity;
871
+ DelTerm **del_terms;
835
872
  Lock *write_lock;
836
- bool use_compound_file : 1;
837
873
  };
838
874
 
839
- IndexWriter *iw_open(Store *store, Analyzer *analyzer,
840
- bool create);
841
- void iw_flush_ram_segments(IndexWriter *iw);
842
- void iw_close(IndexWriter *iw);
843
- int iw_doc_count(IndexWriter *iw);
844
- void iw_add_doc(IndexWriter *iw, Document *doc);
845
- void iw_optimize(IndexWriter *iw);
846
- void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt);
847
- void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
875
+ extern void index_create(Store *store, FieldInfos *fis);
876
+ extern IndexWriter *iw_open(Store *store, Analyzer *analyzer,
877
+ const Config *config);
878
+ extern void iw_delete_term(IndexWriter *iw, const char *field,
879
+ const char *term);
880
+ extern void iw_close(IndexWriter *iw);
881
+ extern void iw_add_doc(IndexWriter *iw, Document *doc);
882
+ extern int iw_doc_count(IndexWriter *iw);
883
+ extern void iw_commit(IndexWriter *iw);
884
+ extern void iw_optimize(IndexWriter *iw);
885
+ extern void iw_add_readers(IndexWriter *iw, IndexReader **readers,
886
+ const int r_cnt);
848
887
 
849
888
  /****************************************************************************
850
889
  *
@@ -852,16 +891,24 @@ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
852
891
  *
853
892
  ****************************************************************************/
854
893
 
894
+ #define CW_INIT_CAPA 16
895
+ typedef struct CWFileEntry
896
+ {
897
+ char *name;
898
+ off_t dir_offset;
899
+ off_t data_offset;
900
+ } CWFileEntry;
901
+
855
902
  typedef struct CompoundWriter {
856
903
  Store *store;
857
904
  const char *name;
858
905
  HashSet *ids;
859
- Array *file_entries;
860
- bool merged;
906
+ CWFileEntry *file_entries;
861
907
  } CompoundWriter;
862
908
 
863
- CompoundWriter *open_cw(Store *store, char *name);
864
- void cw_add_file(CompoundWriter *cw, char *id);
865
- void cw_close(CompoundWriter *cw);
909
+ extern CompoundWriter *open_cw(Store *store, char *name);
910
+ extern void cw_add_file(CompoundWriter *cw, char *id);
911
+ extern void cw_close(CompoundWriter *cw);
912
+
866
913
 
867
914
  #endif