ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -0,0 +1,352 @@
1
+ #include <string.h>
2
+ #include "index.h"
3
+ #include "array.h"
4
+ #include "helper.h"
5
+
6
+ /****************************************************************************
7
+ *
8
+ * TermVector
9
+ *
10
+ ****************************************************************************/
11
+
12
+ void tv_destroy(TermVector *tv)
13
+ {
14
+ int i = tv->term_cnt;
15
+ while (i > 0) {
16
+ i--;
17
+ free(tv->terms[i].text);
18
+ free(tv->terms[i].positions);
19
+ }
20
+ free(tv->offsets);
21
+ free(tv->field);
22
+ free(tv->terms);
23
+ free(tv);
24
+ }
25
+
26
+ int tv_get_tv_term_index(TermVector *tv, const char *term)
27
+ {
28
+ int lo = 0; /* search starts array */
29
+ int hi = tv->term_cnt - 1; /* for 1st element < n, return its index */
30
+ int mid;
31
+ int cmp;
32
+ char *mid_term;
33
+
34
+ while (hi >= lo) {
35
+ mid = (lo + hi) >> 1;
36
+ mid_term = tv->terms[mid].text;
37
+ cmp = strcmp(term, mid_term);
38
+ if (cmp < 0) {
39
+ hi = mid - 1;
40
+ }
41
+ else if (cmp > 0) {
42
+ lo = mid + 1;
43
+ }
44
+ else { /* found a match */
45
+ return mid;
46
+ }
47
+ }
48
+ if (strcmp(term, tv->terms[hi].text) == 0) {
49
+ return hi;
50
+ }
51
+ else {
52
+ return -1;
53
+ }
54
+ return hi;
55
+ }
56
+
57
+ extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term)
58
+ {
59
+ int index = tv_get_tv_term_index(tv, term);
60
+ if (index >= 0) {
61
+ return &(tv->terms[index]);
62
+ }
63
+ else {
64
+ return NULL;
65
+ }
66
+ }
67
+
68
+ /****************************************************************************
69
+ *
70
+ * TermVectorsReader
71
+ *
72
+ ****************************************************************************/
73
+
74
+ TermVectorsReader *tvr_open(Store *store,
75
+ const char *segment,
76
+ FieldInfos *fis)
77
+ {
78
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
79
+ char file_name[SEGMENT_NAME_MAX_LENGTH];
80
+
81
+ tvr->fis = fis;
82
+ sprintf(file_name, "%s.tvx", segment);
83
+ tvr->tvx_in = store->open_input(store, file_name);
84
+ tvr->size = is_length(tvr->tvx_in) / 12;
85
+
86
+ sprintf(file_name, "%s.tvd", segment);
87
+ tvr->tvd_in = store->open_input(store, file_name);
88
+ return tvr;
89
+ }
90
+
91
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig)
92
+ {
93
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
94
+
95
+ memcpy(tvr, orig, sizeof(TermVectorsReader));
96
+ tvr->tvx_in = is_clone(orig->tvx_in);
97
+ tvr->tvd_in = is_clone(orig->tvd_in);
98
+
99
+ return tvr;
100
+ }
101
+
102
+ void tvr_close(TermVectorsReader *tvr)
103
+ {
104
+ is_close(tvr->tvx_in);
105
+ is_close(tvr->tvd_in);
106
+ free(tvr);
107
+ }
108
+
109
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr, int field_num)
110
+ {
111
+ TermVector *tv = ALLOC_AND_ZERO(TermVector);
112
+ InStream *tvd_in = tvr->tvd_in;
113
+ FieldInfo *fi = tvr->fis->fields[field_num];
114
+ const int num_terms = is_read_vint(tvd_in);
115
+
116
+ tv->field_num = field_num;
117
+ tv->field = estrdup(fi->name);
118
+
119
+ if (num_terms > 0) {
120
+ int i, j, delta_start, delta_len, total_len, freq;
121
+ int store_positions = fi_store_positions(fi);
122
+ int store_offsets = fi_store_offsets(fi);
123
+ uchar buffer[MAX_WORD_SIZE];
124
+ TVTerm *term;
125
+
126
+ tv->term_cnt = num_terms;
127
+ tv->terms = ALLOC_AND_ZERO_N(TVTerm, num_terms);
128
+
129
+ for (i = 0; i < num_terms; i++) {
130
+ term = &(tv->terms[i]);
131
+ /* read delta encoded term */
132
+ delta_start = is_read_vint(tvd_in);
133
+ delta_len = is_read_vint(tvd_in);
134
+ total_len = delta_start + delta_len;
135
+ is_read_bytes(tvd_in, buffer + delta_start, delta_len);
136
+ buffer[total_len++] = '\0';
137
+ term->text = memcpy(ALLOC_N(char, total_len), buffer, total_len);
138
+
139
+ /* read freq */
140
+ freq = term->freq = is_read_vint(tvd_in);
141
+
142
+ /* read positions if necessary */
143
+ if (store_positions) {
144
+ int *positions = term->positions = ALLOC_N(int, freq);
145
+ int pos = 0;
146
+ for (j = 0; j < freq; j++) {
147
+ positions[j] = pos += is_read_vint(tvd_in);
148
+ }
149
+ }
150
+
151
+ /* read offsets if necessary */
152
+ }
153
+ if (store_offsets) {
154
+ int num_positions = tv->offset_cnt = is_read_vint(tvd_in);
155
+ Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
156
+ int offset = 0;
157
+ for (i = 0; i < num_positions; i++) {
158
+ offsets[i].start = offset += is_read_vint(tvd_in);
159
+ offsets[i].end = offset += is_read_vint(tvd_in);
160
+ }
161
+ }
162
+ }
163
+ return tv;
164
+ }
165
+
166
+ HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
167
+ {
168
+ HashTable *term_vectors = h_new_str((free_ft)NULL, (free_ft)&tv_destroy);
169
+ int i;
170
+ InStream *tvx_in = tvr->tvx_in;
171
+ InStream *tvd_in = tvr->tvd_in;
172
+ off_t data_ptr, field_index_ptr;
173
+ int field_cnt;
174
+ int *field_nums;
175
+
176
+ if (doc_num >= 0 && doc_num < tvr->size) {
177
+ is_seek(tvx_in, 12 * doc_num);
178
+
179
+ data_ptr = (off_t)is_read_u64(tvx_in);
180
+ field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
181
+
182
+ /* scan fields to get position of field_num's term vector */
183
+ is_seek(tvd_in, field_index_ptr);
184
+
185
+ field_cnt = is_read_vint(tvd_in);
186
+ field_nums = ALLOC_N(int, field_cnt);
187
+
188
+ for (i = 0; i < field_cnt; i++) {
189
+ field_nums[i] = is_read_vint(tvd_in);
190
+ is_read_vint(tvd_in); /* skip space, we don't need it */
191
+ }
192
+ is_seek(tvd_in, data_ptr);
193
+
194
+ for (i = 0; i < field_cnt; i++) {
195
+ TermVector *tv = tvr_read_term_vector(tvr, field_nums[i]);
196
+ h_set(term_vectors, tv->field, tv);
197
+ }
198
+ free(field_nums);
199
+ }
200
+ return term_vectors;
201
+ }
202
+
203
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
204
+ int doc_num,
205
+ int field_num)
206
+ {
207
+ int i;
208
+ InStream *tvx_in = tvr->tvx_in;
209
+ InStream *tvd_in = tvr->tvd_in;
210
+ off_t data_ptr, field_index_ptr;
211
+ int field_cnt;
212
+ int offset = 0;
213
+ TermVector *tv = NULL;
214
+
215
+ if (doc_num >= 0 && doc_num < tvr->size) {
216
+ is_seek(tvx_in, 12 * doc_num);
217
+
218
+ data_ptr = (off_t)is_read_u64(tvx_in);
219
+ field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
220
+
221
+ /* scan fields to get position of field_num's term vector */
222
+ is_seek(tvd_in, field_index_ptr);
223
+
224
+ field_cnt = is_read_vint(tvd_in);
225
+ for (i = 0; i < field_cnt; i++) {
226
+ if ((int)is_read_vint(tvd_in) == field_num) {
227
+ break;
228
+ }
229
+ offset += is_read_vint(tvd_in); /* space taken by field */
230
+ }
231
+ if (i < field_cnt) {
232
+ /* field was found */
233
+ is_seek(tvd_in, data_ptr + offset);
234
+ tv = tvr_read_term_vector(tvr, field_num);
235
+ }
236
+ }
237
+ return tv;
238
+ }
239
+
240
+ /****************************************************************************
241
+ *
242
+ * TermVectorsWriter
243
+ *
244
+ ****************************************************************************/
245
+
246
+ TermVectorsWriter *tvw_open(Store *store, const char *segment, FieldInfos *fis)
247
+ {
248
+ TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
249
+ char file_name[SEGMENT_NAME_MAX_LENGTH];
250
+ tvw->fis = fis;
251
+ tvw->fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
252
+
253
+ sprintf(file_name, "%s.tvx", segment);
254
+ tvw->tvx_out = store->new_output(store, file_name);
255
+
256
+ sprintf(file_name, "%s.tvd", segment);
257
+ tvw->tvd_out = store->new_output(store, file_name);
258
+
259
+ return tvw;
260
+ }
261
+
262
+ void tvw_close(TermVectorsWriter *tvw)
263
+ {
264
+ os_close(tvw->tvx_out);
265
+ os_close(tvw->tvd_out);
266
+ ary_free(tvw->fields);
267
+ free(tvw);
268
+ }
269
+
270
+ void tvw_open_doc(TermVectorsWriter *tvw)
271
+ {
272
+ ary_size(tvw->fields) = 0;
273
+ tvw->tvd_ptr = os_pos(tvw->tvd_out);
274
+ os_write_u64(tvw->tvx_out, tvw->tvd_ptr);
275
+ }
276
+
277
+ void tvw_close_doc(TermVectorsWriter *tvw)
278
+ {
279
+ int i;
280
+ OutStream *tvd_out = tvw->tvd_out;
281
+ os_write_u32(tvw->tvx_out, (f_u32)(os_pos(tvw->tvd_out) - tvw->tvd_ptr));
282
+ os_write_vint(tvd_out, ary_size(tvw->fields));
283
+ for (i = 0; i < ary_size(tvw->fields); i++) {
284
+ os_write_vint(tvd_out, tvw->fields[i].field_num);
285
+ os_write_vint(tvd_out, tvw->fields[i].size);
286
+ }
287
+ }
288
+
289
+ void tvw_add_postings(TermVectorsWriter *tvw,
290
+ int field_num,
291
+ PostingList **plists,
292
+ int posting_count,
293
+ Offset *offsets,
294
+ int offset_count)
295
+ {
296
+ int i, delta_start, delta_length;
297
+ const char *last_term = EMPTY_STRING;
298
+ off_t tvd_start_pos = os_pos(tvw->tvd_out);
299
+ OutStream *tvd_out = tvw->tvd_out;
300
+ PostingList *plist;
301
+ Posting *posting;
302
+ Occurence *occ;
303
+ FieldInfo *fi = tvw->fis->fields[field_num];
304
+ int store_positions = fi_store_positions(fi);
305
+
306
+ ary_grow(tvw->fields);
307
+ ary_last(tvw->fields).field_num = field_num;
308
+
309
+ os_write_vint(tvd_out, posting_count);
310
+ for (i = 0; i < posting_count; i++) {
311
+ plist = plists[i];
312
+ posting = plist->last;
313
+ delta_start = hlp_string_diff(last_term, plist->term);
314
+ delta_length = plist->term_len - delta_start;
315
+
316
+ os_write_vint(tvd_out, delta_start); /* write shared prefix length */
317
+ os_write_vint(tvd_out, delta_length); /* write delta length */
318
+ /* write delta chars */
319
+ os_write_bytes(tvd_out,
320
+ (uchar *)(plist->term + delta_start),
321
+ delta_length);
322
+ os_write_vint(tvd_out, posting->freq);
323
+ last_term = plist->term;
324
+
325
+ if (store_positions) {
326
+ /* use delta encoding for positions */
327
+ int last_pos = 0;
328
+ for (occ = posting->first_occ; occ; occ = occ->next) {
329
+ os_write_vint(tvd_out, occ->pos - last_pos);
330
+ last_pos = occ->pos;
331
+ }
332
+ }
333
+
334
+ }
335
+
336
+ if (fi_store_offsets(fi)) {
337
+ /* use delta encoding for offsets */
338
+ int last_end = 0;
339
+ os_write_vint(tvd_out, offset_count); /* write shared prefix length */
340
+ for (i = 0; i < offset_count; i++) {
341
+ int start = offsets[i].start;
342
+ int end = offsets[i].end;
343
+ os_write_vint(tvd_out, start - last_end);
344
+ os_write_vint(tvd_out, end - start);
345
+ last_end = end;
346
+ }
347
+ }
348
+
349
+ ary_last(tvw->fields).size = os_pos(tvd_out) - tvd_start_pos;
350
+ }
351
+
352
+
data/ext/threading.h ADDED
@@ -0,0 +1,31 @@
1
+ #ifndef FRT_THREADING_H
2
+ #define FRT_THREADING_H
3
+
4
+ #include "hash.h"
5
+ #define UNTHREADED 1
6
+
7
+ typedef void * mutex_t;
8
+ typedef struct HashTable *thread_key_t;
9
+ typedef int thread_once_t;
10
+ #define MUTEX_INITIALIZER NULL
11
+ #define MUTEX_RECURSIVE_INITIALIZER NULL
12
+ #define THREAD_ONCE_INIT 1;
13
+ #define mutex_init(a, b)
14
+ #define mutex_lock(a)
15
+ #define mutex_trylock(a)
16
+ #define mutex_unlock(a)
17
+ #define mutex_destroy(a)
18
+ #define thread_key_create(a, b) frt_thread_key_create(a, b)
19
+ #define thread_key_delete(a) frt_thread_key_delete(a)
20
+ #define thread_setspecific(a, b) frt_thread_setspecific(a, b)
21
+ #define thread_getspecific(a) frt_thread_getspecific(a)
22
+ #define thread_exit(a)
23
+ #define thread_once(a, b) frt_thread_once(a, b)
24
+
25
+ void frt_thread_once(int *once_control, void (*init_routine)(void));
26
+ void frt_thread_key_create(thread_key_t *key, void (*destr_function)(void *));
27
+ void frt_thread_key_delete(thread_key_t key);
28
+ void frt_thread_setspecific(thread_key_t key, const void *pointer);
29
+ void *frt_thread_getspecific(thread_key_t key);
30
+
31
+ #endif
data/ext/win32.h ADDED
@@ -0,0 +1,54 @@
1
+ #include "global.h"
2
+
3
+ #ifndef FRT_WIN32_H
4
+ #define FRT_WIN32_H
5
+
6
+ #include <io.h>
7
+
8
+ struct dirent /* minimal directory entry: only the entry name is provided */
9
+ {
10
+ char *d_name; /* entry name; readdir() points this at DIR.find_data.name, NULL before the first read */
11
+ };
12
+
13
+ typedef struct DIR /* directory iteration state built on _findfirst/_findnext */
14
+ {
15
+ struct _finddata_t find_data; /* data of the current entry, filled in by _findfirst/_findnext */
16
+ struct dirent de; /* entry returned by readdir(); its d_name points into find_data */
17
+ long handle; /* search handle returned by _findfirst; NOTE(review): _findfirst is documented to return intptr_t, which a long may truncate on 64-bit Windows -- confirm */
18
+ } DIR;
19
+
20
+ DIR *opendir(const char *dirname)
21
+ {
22
+ DIR *d = ALLOC_AND_ZERO(DIR);
23
+ char dirname_buf[MAX_FILE_PATH];
24
+ long ff_res;
25
+ sprintf(dirname_buf, "%s\\*", dirname);
26
+ ff_res = _findfirst(dirname_buf, &d->find_data);
27
+ if (ff_res < 0) {
28
+ free(d);
29
+ d = NULL;
30
+ } else {
31
+ d->de.d_name = NULL;
32
+ d->handle = ff_res;
33
+ }
34
+ return d;
35
+ }
36
+
37
+ struct dirent *readdir(DIR *d)
38
+ {
39
+ /* _findfirst already returned so do _findnext */
40
+ if (d->de.d_name != NULL) {
41
+ if (_findnext(d->handle, &d->find_data) < 0) {
42
+ return NULL;
43
+ }
44
+ }
45
+ d->de.d_name = d->find_data.name;
46
+ return &d->de;
47
+ }
48
+
49
+ void closedir(DIR *d)
50
+ {
51
+ _findclose(d->handle);
52
+ free(d);
53
+ }
54
+ #endif
data/lib/ferret.rb CHANGED
@@ -21,20 +21,8 @@
21
21
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
- module Ferret
25
- VERSION = '0.9.6'
26
- end
27
-
28
- # try and load the C extension but it isn't necessary.
29
- begin
30
- require 'ferret_ext'
31
- rescue Exception => e
32
- require 'ferret/utils'
33
- require 'ferret/document'
34
- require 'ferret/stemmers'
35
- require 'ferret/analysis'
36
- require 'ferret/store'
37
- require 'ferret/index'
38
- require 'ferret/search'
39
- require 'ferret/query_parser'
40
- end
24
+ $: << File.expand_path(File.join(File.dirname(__FILE__), "../ext"))
25
+ require 'ferret_ext'
26
+ require 'ferret_version'
27
+ require 'ferret/document'
28
+ require 'ferret/index'