ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/vector.c DELETED
@@ -1,637 +0,0 @@
1
- #include "index.h"
2
- #include "helper.h"
3
- #include <string.h>
4
-
5
- static char * const NULL_POS_ERROR_MSG = "Trying to write positions that are null!";
6
- static char * const NULL_OFFSETS_ERROR_MSG = "Trying to write offsets that are null!";
7
- static char * const FIELD_OPEN_ERROR_MSG = "Field is still open while writing document";
8
- static char * const FORMAT_VERSION_ERROR_MSG = "Invalid format version";
9
-
10
- #define TERM_ARR_START_SIZE 16
11
- #define FIELD_ARR_START_SIZE 8
12
-
13
- TVOffsetInfo *tvoi_create(int start, int end)
14
- {
15
- TVOffsetInfo *tvoi = ALLOC(TVOffsetInfo);
16
- tvoi->start = start;
17
- tvoi->end = end;
18
- return tvoi;
19
- }
20
-
21
- void tvoi_destroy(void *p)
22
- {
23
- free(p);
24
- }
25
-
26
- TVField *tvf_create(int number, int store_positions, int store_offsets)
27
- {
28
- TVField *tvf = ALLOC(TVField);
29
- tvf->tvf_pointer = 0;
30
- tvf->number = number;
31
- tvf->store_positions = store_positions;
32
- tvf->store_offsets = store_offsets;
33
- return tvf;
34
- }
35
-
36
- void tvf_destroy(void *p)
37
- {
38
- free(p);
39
- }
40
-
41
- TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
42
- {
43
- TVTerm *tvt = ALLOC(TVTerm);
44
- tvt->text = text;
45
- tvt->freq = freq;
46
- tvt->positions = positions;
47
- tvt->offsets = offsets;
48
- return tvt;
49
- }
50
-
51
- void tvt_destroy(void *p)
52
- {
53
- free(p);
54
- }
55
-
56
-
57
- TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
58
- {
59
- char fname[SEGMENT_NAME_MAX_LENGTH];
60
- size_t segment_len = strlen(segment);
61
- TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
62
- OutStream *os;
63
-
64
- tvw->curr_field = NULL;
65
- tvw->curr_doc_pointer = -1;
66
-
67
- /* Open files for TermVector storage */
68
- strcpy(fname, segment);
69
-
70
- strcpy(fname + segment_len, TVX_EXTENSION);
71
- os = tvw->tvx = store->create_output(store, fname);
72
- os_write_int(os, FORMAT_VERSION);
73
-
74
- strcpy(fname + segment_len, TVD_EXTENSION);
75
- os = tvw->tvd = store->create_output(store, fname);
76
- os_write_int(os, FORMAT_VERSION);
77
-
78
- strcpy(fname + segment_len, TVF_EXTENSION);
79
- os = tvw->tvf = store->create_output(store, fname);
80
- os_write_int(os, FORMAT_VERSION);
81
-
82
- tvw->fis = fis;
83
-
84
- tvw->fields = NULL;
85
- tvw->fcnt = 0;
86
- tvw->fsize = 0;
87
- tvw->terms = NULL;
88
- tvw->tcnt = 0;
89
- tvw->tsize = 0;
90
-
91
- return tvw;
92
- }
93
-
94
- void tvw_write_field(TermVectorsWriter *tvw)
95
- {
96
- int i, j, start, length;
97
- char *last_term_text;
98
- TVOffsetInfo *tmp_offset;
99
- TVTerm **terms = tvw->terms;
100
- TVTerm *term;
101
- OutStream *tvf = tvw->tvf;
102
- int store_positions = tvw->curr_field->store_positions;
103
- int store_offsets = tvw->curr_field->store_offsets;
104
- uchar bits = 0x0;
105
-
106
- /* remember where this field is written */
107
- tvw->curr_field->tvf_pointer = os_pos(tvf);
108
-
109
- /* write the number of terms */
110
- os_write_vint(tvf, tvw->tcnt);
111
-
112
- if (store_positions) {
113
- bits |= STORE_POSITIONS_WITH_TERMVECTOR;
114
- }
115
-
116
- if (store_offsets) {
117
- bits |= STORE_OFFSET_WITH_TERMVECTOR;
118
- }
119
-
120
- os_write_byte(tvf, (uchar)bits);
121
-
122
- last_term_text = (char *)EMPTY_STRING;
123
- for (i = 0; i < tvw->tcnt; i++) {
124
- term = terms[i];
125
- start = hlp_string_diff(last_term_text, term->text);
126
- length = (int)strlen(term->text) - start;
127
- os_write_vint(tvf, start); /* write shared prefix length */
128
- os_write_vint(tvf, length); /* write delta length */
129
- os_write_chars(tvf, term->text, start, length); /* write delta chars */
130
- os_write_vint(tvf, term->freq);
131
- last_term_text = term->text;
132
-
133
- if (store_positions) {
134
- int last_pos = 0;
135
-
136
- if (term->positions == NULL) {
137
- RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
138
- }
139
-
140
- /* use delta encoding for positions */
141
- for (j = 0; j < term->freq; j++) {
142
- os_write_vint(tvf, term->positions[j] - last_pos);
143
- last_pos = term->positions[j];
144
- }
145
- }
146
-
147
- if (store_offsets) {
148
- int last_end = 0;
149
-
150
- if (term->offsets == NULL) {
151
- RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
152
- }
153
-
154
- /* use delta encoding for offsets */
155
- for (j = 0; j < term->freq; j++) {
156
- tmp_offset = term->offsets[j];
157
- os_write_vint(tvf, tmp_offset->start - last_end);
158
-
159
- /* save the diff between the two */
160
- os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
161
- last_end = tmp_offset->end;
162
- }
163
- }
164
- }
165
- }
166
-
167
- void tvw_close_field(TermVectorsWriter *tvw)
168
- {
169
- int i;
170
- if (tvw->curr_field != NULL) {
171
- /* save field and terms */
172
- tvw_write_field(tvw);
173
-
174
- if (tvw->fcnt >= tvw->fsize) {
175
- tvw->fsize *=2;
176
- if (tvw->fsize < FIELD_ARR_START_SIZE) {
177
- tvw->fsize = FIELD_ARR_START_SIZE;
178
- }
179
- REALLOC_N(tvw->fields, TVField *, tvw->fsize);
180
- }
181
- tvw->fields[tvw->fcnt] = tvw->curr_field;
182
- tvw->fcnt++;
183
-
184
- for (i = 0; i < tvw->tcnt; i++) {
185
- tvt_destroy(tvw->terms[i]);
186
- }
187
- tvw->tcnt = 0;
188
-
189
- tvw->curr_field = NULL;
190
- }
191
- }
192
-
193
- void tvw_create_field(TermVectorsWriter *tvw,
194
- int field_number, int store_position, int store_offset)
195
- {
196
- tvw_close_field(tvw);
197
- tvw->curr_field = tvf_create(field_number, store_position, store_offset);
198
- }
199
-
200
- void tvw_open_field(TermVectorsWriter *tvw, char *field)
201
- {
202
- FieldInfo *fi = fis_get_fi(tvw->fis, field);
203
- tvw_create_field(tvw, fi->number, fi->store_pos, fi->store_offset);
204
- }
205
-
206
- void tvw_write_doc(TermVectorsWriter *tvw)
207
- {
208
- OutStream *tvd = tvw->tvd;
209
- int i;
210
- TVField **fields = tvw->fields;
211
- int last_field_pointer = 0;
212
-
213
- if (tvw->curr_field != NULL) {
214
- RAISE(STATE_ERROR, FIELD_OPEN_ERROR_MSG);
215
- }
216
-
217
- //printf("Writing doc pointer: %d\n", tvw->curr_doc_pointer);
218
- /* write document index record */
219
- os_write_long(tvw->tvx, tvw->curr_doc_pointer);
220
-
221
- //printf("Writing field count: %ld, %d, %d -> ", (long long)tvw, tvw->fcnt, os_pos(tvd));
222
- /* write the number of @fields */
223
- os_write_vint(tvd, tvw->fcnt);
224
-
225
- /* write field numbers */
226
- for (i = 0; i < tvw->fcnt; i++) {
227
- os_write_vint(tvd, fields[i]->number);
228
- }
229
-
230
- /* write field pointers */
231
- for (i = 0; i < tvw->fcnt; i++) {
232
- os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
233
- last_field_pointer = fields[i]->tvf_pointer;
234
- }
235
- //printf("%d\n", os_pos(tvw->tvd));
236
- }
237
-
238
- void tvw_close_doc(TermVectorsWriter *tvw)
239
- {
240
- int i;
241
- if (tvw->curr_doc_pointer >= 0) {
242
- tvw_close_field(tvw);
243
- tvw_write_doc(tvw);
244
-
245
- for (i = 0; i < tvw->fcnt; i++) {
246
- tvf_destroy(tvw->fields[i]);
247
- }
248
- tvw->fcnt = 0;
249
- tvw->curr_doc_pointer = -1;
250
- }
251
- }
252
-
253
- void tvw_open_doc(TermVectorsWriter *tvw)
254
- {
255
- tvw_close_doc(tvw);
256
- tvw->curr_doc_pointer = os_pos(tvw->tvd);
257
- }
258
-
259
- void tvw_add_term(TermVectorsWriter *tvw,
260
- char *text, int freq, int *positions, TVOffsetInfo **offsets)
261
- {
262
- if (tvw->tcnt >= tvw->tsize) {
263
- tvw->tsize *= 2;
264
- if (tvw->tsize < TERM_ARR_START_SIZE) {
265
- tvw->tsize = TERM_ARR_START_SIZE;
266
- }
267
-
268
- REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
269
- }
270
- tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
271
- tvw->tcnt++;
272
- }
273
-
274
- void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
275
- {
276
- int i, j, store_positions, store_offsets;
277
- TermVector *tv;
278
-
279
- tvw_open_doc(tvw);
280
-
281
- for (i = 0; i < vectors->size; i++) {
282
- tv = vectors->elems[i];
283
-
284
- store_positions = (tv->tcnt > 0 && tv->positions != NULL);
285
- store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
286
-
287
- tvw_create_field(tvw, (int)fis_get_number(tvw->fis, tv->field),
288
- store_positions, store_offsets);
289
-
290
- if (store_positions && store_offsets) {
291
- for (j = 0; j < tv->tcnt; j++) {
292
- tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
293
- }
294
- } else if (store_positions) {
295
- for (j = 0; j < tv->tcnt; j++) {
296
- tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
297
- }
298
- } else if (store_offsets) {
299
- for (j = 0; j < tv->tcnt; j++) {
300
- tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
301
- }
302
- } else {
303
- for (j = 0; j < tv->tcnt; j++) {
304
- tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
305
- }
306
- }
307
- tvw_close_field(tvw);
308
- }
309
- tvw_close_doc(tvw);
310
- }
311
-
312
- void tvw_close(TermVectorsWriter *tvw)
313
- {
314
- /* put everything in the finally block so that even if they throw an
315
- * exception, everything else will also be closed. */
316
- TRY
317
- XFINALLY
318
- tvw_close_doc(tvw);
319
- os_close(tvw->tvx);
320
- os_close(tvw->tvd);
321
- os_close(tvw->tvf);
322
- XENDTRY
323
- free(tvw->terms);
324
- free(tvw->fields);
325
- free(tvw);
326
- }
327
-
328
- TermVector *tv_create(
329
- const char *field,
330
- char **terms,
331
- int tcnt,
332
- int *freqs,
333
- int **positions,
334
- TVOffsetInfo ***offsets)
335
- {
336
- TermVector *tv =
337
- ALLOC(TermVector);
338
- tv->field = (char *)field;
339
- tv->terms = terms;
340
- tv->tcnt = tcnt;
341
- tv->freqs = freqs;
342
- tv->positions = positions;
343
- tv->offsets = offsets;
344
- return tv;
345
- }
346
-
347
- void tv_destroy(TermVector *tv)
348
- {
349
- int i, j;
350
- for (i = 0; i < tv->tcnt; i++) {
351
- free(tv->terms[i]);
352
- }
353
- free(tv->terms);
354
- if (tv->positions != NULL) {
355
- for (i = 0; i < tv->tcnt; i++) {
356
- free(tv->positions[i]);
357
- }
358
- free(tv->positions);
359
- }
360
- if (tv->offsets != NULL) {
361
- for (i = 0; i < tv->tcnt; i++) {
362
- for (j = 0; j < tv->freqs[i]; j++) {
363
- tvoi_destroy(tv->offsets[i][j]);
364
- }
365
- free(tv->offsets[i]);
366
- }
367
- free(tv->offsets);
368
- }
369
- free(tv->freqs);
370
- free(tv);
371
- }
372
-
373
- void tv_destroy_except_data(TermVector *tv)
374
- {
375
- free(tv->terms);
376
- if (tv->positions != NULL) {
377
- free(tv->positions);
378
- }
379
- if (tv->offsets != NULL) {
380
- free(tv->offsets);
381
- }
382
- free(tv->freqs);
383
- free(tv);
384
- }
385
-
386
- int tvr_check_valid_format(InStream *is)
387
- {
388
- int format = is_read_int(is);
389
- if (format > FORMAT_VERSION)
390
- RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
391
- return format;
392
- }
393
-
394
- TermVectorsReader *tvr_clone(TermVectorsReader *orig)
395
- {
396
- TermVectorsReader *clone = NULL;
397
- clone = ALLOC(TermVectorsReader);
398
- memcpy(clone, orig, sizeof(TermVectorsReader));
399
- if (orig->tvx && orig->tvd && orig->tvf) {
400
- clone->tvx = is_clone(orig->tvx);
401
- clone->tvd = is_clone(orig->tvd);
402
- clone->tvf = is_clone(orig->tvf);
403
- }
404
- return clone;
405
- }
406
-
407
- TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
408
- {
409
- TermVectorsReader *tvr = ALLOC(TermVectorsReader);
410
- // Open files for TermVector storage
411
- char fname[SEGMENT_NAME_MAX_LENGTH];
412
- size_t segment_len = strlen(segment);
413
- InStream *is;
414
-
415
- strcpy(fname, segment);
416
-
417
- strcpy(fname + segment_len, TVX_EXTENSION);
418
- if (!store->exists(store, fname)) {
419
- tvr->tvx = tvr->tvd = tvr->tvf = NULL;
420
- tvr->size = 0;
421
- } else {
422
- is = tvr->tvx = store->open_input(store, fname);
423
- tvr_check_valid_format(is);
424
- tvr->size = is_length(is)/8;
425
-
426
- strcpy(fname + segment_len, TVD_EXTENSION);
427
- is = tvr->tvd = store->open_input(store, fname);
428
- tvr->tvd_format = tvr_check_valid_format(is);
429
-
430
- strcpy(fname + segment_len, TVF_EXTENSION);
431
- is = tvr->tvf = store->open_input(store, fname);
432
- tvr->tvf_format = tvr_check_valid_format(is);
433
-
434
- tvr->fis = fis;
435
- }
436
- return tvr;
437
- }
438
-
439
- void tvr_close(TermVectorsReader *tvr)
440
- {
441
- /* put everything in the finally block so that even if they throw an
442
- * exception, everything else will also be closed. */
443
- TRY
444
- XFINALLY
445
- if (tvr->tvx) {
446
- is_close(tvr->tvx);
447
- }
448
- if (tvr->tvd) {
449
- is_close(tvr->tvd);
450
- }
451
- if (tvr->tvf) {
452
- is_close(tvr->tvf);
453
- }
454
- free(tvr);
455
- XENDTRY
456
- }
457
-
458
- TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
459
- char *field, int tvf_pointer)
460
- {
461
- int i, j, store_positions, store_offsets, bits, num_terms;
462
- char **terms;
463
- int *term_freqs;
464
-
465
- /* we may not need these, but declare them */
466
- int **positions = NULL;
467
- TVOffsetInfo ***offsets = NULL;
468
- int start, delta_length, total_length, freq, prev_pos;
469
- int start_offset, end_offset, prev_offset;
470
- int *pos;
471
- TVOffsetInfo **offs;
472
- char buffer[MAX_WORD_SIZE] = "";
473
-
474
- /* Now read the data from specified position. We don't need to offset
475
- * by the FORMAT here since the pointer already includes the offset */
476
- is_seek(tvr->tvf, tvf_pointer);
477
- num_terms = (int)is_read_vint(tvr->tvf);
478
-
479
- /* If no terms - return a constant empty termvector. However, this should
480
- * never occur! */
481
- if (num_terms == 0) {
482
- return tv_create(field, NULL, 0, NULL, NULL, NULL);
483
- }
484
-
485
- if(tvr->tvf_format == FORMAT_VERSION) {
486
- bits = is_read_byte(tvr->tvf);
487
- store_positions = ((bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0);
488
- store_offsets = ((bits & STORE_OFFSET_WITH_TERMVECTOR) != 0);
489
- } else {
490
- is_read_vint(tvr->tvf);
491
- store_positions = false;
492
- store_offsets = false;
493
- }
494
-
495
- terms = ALLOC_N(char *, num_terms);
496
- term_freqs = ALLOC_N(int, num_terms);
497
-
498
- if (store_positions) {
499
- positions = ALLOC_N(int *, num_terms);
500
- }
501
-
502
- if (store_offsets) {
503
- offsets = ALLOC_N(TVOffsetInfo **, num_terms);
504
- }
505
-
506
-
507
- for (i = 0; i < num_terms; i++) {
508
- start = (int)is_read_vint(tvr->tvf);
509
- delta_length = (int)is_read_vint(tvr->tvf);
510
- total_length = start + delta_length;
511
- is_read_chars(tvr->tvf, buffer, start, delta_length);
512
- buffer[total_length] = '\0';
513
- terms[i] = estrdup(buffer);
514
- freq = (int)is_read_vint(tvr->tvf);
515
- term_freqs[i] = freq;
516
-
517
- if (store_positions) {/* read in the positions */
518
- pos = ALLOC_N(int, freq);
519
- positions[i] = pos;
520
- prev_pos = 0;
521
- for (j = 0; j < freq; j++) {
522
- pos[j] = prev_pos + (int)is_read_vint(tvr->tvf);
523
- prev_pos = pos[j];
524
- }
525
- }
526
-
527
- if (store_offsets) {
528
- offs = ALLOC_N(TVOffsetInfo *, freq);
529
- offsets[i] = offs;
530
- prev_offset = 0;
531
- for (j = 0; j < freq; j++) {
532
- start_offset = prev_offset + (int)is_read_vint(tvr->tvf);
533
- end_offset = start_offset + (int)is_read_vint(tvr->tvf);
534
- offs[j] = tvoi_create(start_offset, end_offset);
535
- prev_offset = end_offset;
536
- }
537
- }
538
- }
539
- return tv_create(field, terms, num_terms, term_freqs, positions, offsets);
540
- }
541
-
542
- Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
543
- {
544
- int i;
545
- Array *tvs = NULL;
546
- /* Check if no term vectors are available for this segment at all */
547
- if (tvr->tvx != NULL) {
548
- int position, field_count;
549
- /* We need to offset by FORMAT_SIZE to skip the format version at the start of tvx */
550
- is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
551
-
552
- position = (int)is_read_long(tvr->tvx);
553
-
554
- is_seek(tvr->tvd, position);
555
- field_count = (int)is_read_vint(tvr->tvd);
556
-
557
- /* No fields are vectorized for this document */
558
- if (field_count > 0) {
559
- int number = 0;
560
- int position = 0;
561
- int *tvf_pointers = ALLOC_N(int, field_count);
562
- char **fields = ALLOC_N(char *, field_count);
563
-
564
- for (i = 0; i < field_count; i++) {
565
- if (tvr->tvd_format == FORMAT_VERSION) {
566
- number = (int)is_read_vint(tvr->tvd);
567
- } else {
568
- number += (int)is_read_vint(tvr->tvd);
569
- }
570
-
571
- fields[i] = tvr->fis->by_number[number]->name;
572
- }
573
-
574
- /* Compute position in the tvf file */
575
- for (i = 0; i < field_count; i++) {
576
- position += (int)is_read_vint(tvr->tvd);
577
- tvf_pointers[i] = position;
578
- }
579
-
580
- tvs = ary_create(field_count, (free_ft)&tv_destroy);
581
- for (i = 0; i < field_count; i++) {
582
- ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
583
- }
584
- free(fields);
585
- free(tvf_pointers);
586
- }
587
- }
588
- return tvs;
589
- }
590
-
591
- TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
592
- {
593
- int i;
594
- /* Check if no term vectors are available for this segment at all */
595
- int field_number = (int)fis_get_number(tvr->fis, field);
596
- TermVector *tv = NULL;
597
-
598
- if (tvr->tvx != NULL) {
599
- int pos, field_count, number = 0, found = -1;
600
- /* We need to account for the FORMAT_SIZE when seeking in the @tvx
601
- * We don't need to do this in other seeks because we already have the
602
- * file pointer that was written in another file */
603
- is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
604
- //printf("TVX Pointer: %d\n", is_pos(tvr->tvx));
605
- pos = (int)is_read_long(tvr->tvx);
606
-
607
- is_seek(tvr->tvd, pos);
608
- field_count = (int)is_read_vint(tvr->tvd);
609
- //printf("Num Fields: %d\n", field_count);
610
- /* There are only a few fields per document. We opt for a full scan
611
- * rather than requiring that they be ordered. We need to read through
612
- * all of the fields anyway to get to the tvf pointers. */
613
- for (i = 0; i < field_count; i++) {
614
- if (tvr->tvd_format == FORMAT_VERSION) {
615
- number = (int)is_read_vint(tvr->tvd);
616
- } else {
617
- number += (int)is_read_vint(tvr->tvd);
618
- }
619
-
620
- if (number == field_number) {
621
- found = i;
622
- }
623
- }
624
-
625
- /* This field, although valid in the segment, was not found in this
626
- * document */
627
- if (found != -1) {
628
- /* Compute pos in the tvf file */
629
- pos = 0;
630
- for (i = 0; i <= found; i++) {
631
- pos += (int)is_read_vint(tvr->tvd);
632
- }
633
- tv = tvr_read_term_vector(tvr, field, pos);
634
- }
635
- }
636
- return tv;
637
- }