ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -0,0 +1,352 @@
1
+ #include <string.h>
2
+ #include "index.h"
3
+ #include "array.h"
4
+ #include "helper.h"
5
+
6
+ /****************************************************************************
7
+ *
8
+ * TermVector
9
+ *
10
+ ****************************************************************************/
11
+
12
+ void tv_destroy(TermVector *tv)
13
+ {
14
+ int i = tv->term_cnt;
15
+ while (i > 0) {
16
+ i--;
17
+ free(tv->terms[i].text);
18
+ free(tv->terms[i].positions);
19
+ }
20
+ free(tv->offsets);
21
+ free(tv->field);
22
+ free(tv->terms);
23
+ free(tv);
24
+ }
25
+
26
+ int tv_get_tv_term_index(TermVector *tv, const char *term)
27
+ {
28
+ int lo = 0; /* search starts array */
29
+ int hi = tv->term_cnt - 1; /* for 1st element < n, return its index */
30
+ int mid;
31
+ int cmp;
32
+ char *mid_term;
33
+
34
+ while (hi >= lo) {
35
+ mid = (lo + hi) >> 1;
36
+ mid_term = tv->terms[mid].text;
37
+ cmp = strcmp(term, mid_term);
38
+ if (cmp < 0) {
39
+ hi = mid - 1;
40
+ }
41
+ else if (cmp > 0) {
42
+ lo = mid + 1;
43
+ }
44
+ else { /* found a match */
45
+ return mid;
46
+ }
47
+ }
48
+ if (strcmp(term, tv->terms[hi].text) == 0) {
49
+ return hi;
50
+ }
51
+ else {
52
+ return -1;
53
+ }
54
+ return hi;
55
+ }
56
+
57
+ extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term)
58
+ {
59
+ int index = tv_get_tv_term_index(tv, term);
60
+ if (index >= 0) {
61
+ return &(tv->terms[index]);
62
+ }
63
+ else {
64
+ return NULL;
65
+ }
66
+ }
67
+
68
+ /****************************************************************************
69
+ *
70
+ * TermVectorsReader
71
+ *
72
+ ****************************************************************************/
73
+
74
+ TermVectorsReader *tvr_open(Store *store,
75
+ const char *segment,
76
+ FieldInfos *fis)
77
+ {
78
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
79
+ char file_name[SEGMENT_NAME_MAX_LENGTH];
80
+
81
+ tvr->fis = fis;
82
+ sprintf(file_name, "%s.tvx", segment);
83
+ tvr->tvx_in = store->open_input(store, file_name);
84
+ tvr->size = is_length(tvr->tvx_in) / 12;
85
+
86
+ sprintf(file_name, "%s.tvd", segment);
87
+ tvr->tvd_in = store->open_input(store, file_name);
88
+ return tvr;
89
+ }
90
+
91
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig)
92
+ {
93
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
94
+
95
+ memcpy(tvr, orig, sizeof(TermVectorsReader));
96
+ tvr->tvx_in = is_clone(orig->tvx_in);
97
+ tvr->tvd_in = is_clone(orig->tvd_in);
98
+
99
+ return tvr;
100
+ }
101
+
102
+ void tvr_close(TermVectorsReader *tvr)
103
+ {
104
+ is_close(tvr->tvx_in);
105
+ is_close(tvr->tvd_in);
106
+ free(tvr);
107
+ }
108
+
109
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr, int field_num)
110
+ {
111
+ TermVector *tv = ALLOC_AND_ZERO(TermVector);
112
+ InStream *tvd_in = tvr->tvd_in;
113
+ FieldInfo *fi = tvr->fis->fields[field_num];
114
+ const int num_terms = is_read_vint(tvd_in);
115
+
116
+ tv->field_num = field_num;
117
+ tv->field = estrdup(fi->name);
118
+
119
+ if (num_terms > 0) {
120
+ int i, j, delta_start, delta_len, total_len, freq;
121
+ int store_positions = fi_store_positions(fi);
122
+ int store_offsets = fi_store_offsets(fi);
123
+ uchar buffer[MAX_WORD_SIZE];
124
+ TVTerm *term;
125
+
126
+ tv->term_cnt = num_terms;
127
+ tv->terms = ALLOC_AND_ZERO_N(TVTerm, num_terms);
128
+
129
+ for (i = 0; i < num_terms; i++) {
130
+ term = &(tv->terms[i]);
131
+ /* read delta encoded term */
132
+ delta_start = is_read_vint(tvd_in);
133
+ delta_len = is_read_vint(tvd_in);
134
+ total_len = delta_start + delta_len;
135
+ is_read_bytes(tvd_in, buffer + delta_start, delta_len);
136
+ buffer[total_len++] = '\0';
137
+ term->text = memcpy(ALLOC_N(char, total_len), buffer, total_len);
138
+
139
+ /* read freq */
140
+ freq = term->freq = is_read_vint(tvd_in);
141
+
142
+ /* read positions if necessary */
143
+ if (store_positions) {
144
+ int *positions = term->positions = ALLOC_N(int, freq);
145
+ int pos = 0;
146
+ for (j = 0; j < freq; j++) {
147
+ positions[j] = pos += is_read_vint(tvd_in);
148
+ }
149
+ }
150
+
151
+ /* read offsets if necessary */
152
+ }
153
+ if (store_offsets) {
154
+ int num_positions = tv->offset_cnt = is_read_vint(tvd_in);
155
+ Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
156
+ int offset = 0;
157
+ for (i = 0; i < num_positions; i++) {
158
+ offsets[i].start = offset += is_read_vint(tvd_in);
159
+ offsets[i].end = offset += is_read_vint(tvd_in);
160
+ }
161
+ }
162
+ }
163
+ return tv;
164
+ }
165
+
166
+ HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
167
+ {
168
+ HashTable *term_vectors = h_new_str((free_ft)NULL, (free_ft)&tv_destroy);
169
+ int i;
170
+ InStream *tvx_in = tvr->tvx_in;
171
+ InStream *tvd_in = tvr->tvd_in;
172
+ off_t data_ptr, field_index_ptr;
173
+ int field_cnt;
174
+ int *field_nums;
175
+
176
+ if (doc_num >= 0 && doc_num < tvr->size) {
177
+ is_seek(tvx_in, 12 * doc_num);
178
+
179
+ data_ptr = (off_t)is_read_u64(tvx_in);
180
+ field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
181
+
182
+ /* scan fields to get position of field_num's term vector */
183
+ is_seek(tvd_in, field_index_ptr);
184
+
185
+ field_cnt = is_read_vint(tvd_in);
186
+ field_nums = ALLOC_N(int, field_cnt);
187
+
188
+ for (i = 0; i < field_cnt; i++) {
189
+ field_nums[i] = is_read_vint(tvd_in);
190
+ is_read_vint(tvd_in); /* skip space, we don't need it */
191
+ }
192
+ is_seek(tvd_in, data_ptr);
193
+
194
+ for (i = 0; i < field_cnt; i++) {
195
+ TermVector *tv = tvr_read_term_vector(tvr, field_nums[i]);
196
+ h_set(term_vectors, tv->field, tv);
197
+ }
198
+ free(field_nums);
199
+ }
200
+ return term_vectors;
201
+ }
202
+
203
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
204
+ int doc_num,
205
+ int field_num)
206
+ {
207
+ int i;
208
+ InStream *tvx_in = tvr->tvx_in;
209
+ InStream *tvd_in = tvr->tvd_in;
210
+ off_t data_ptr, field_index_ptr;
211
+ int field_cnt;
212
+ int offset = 0;
213
+ TermVector *tv = NULL;
214
+
215
+ if (doc_num >= 0 && doc_num < tvr->size) {
216
+ is_seek(tvx_in, 12 * doc_num);
217
+
218
+ data_ptr = (off_t)is_read_u64(tvx_in);
219
+ field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
220
+
221
+ /* scan fields to get position of field_num's term vector */
222
+ is_seek(tvd_in, field_index_ptr);
223
+
224
+ field_cnt = is_read_vint(tvd_in);
225
+ for (i = 0; i < field_cnt; i++) {
226
+ if ((int)is_read_vint(tvd_in) == field_num) {
227
+ break;
228
+ }
229
+ offset += is_read_vint(tvd_in); /* space taken by field */
230
+ }
231
+ if (i < field_cnt) {
232
+ /* field was found */
233
+ is_seek(tvd_in, data_ptr + offset);
234
+ tv = tvr_read_term_vector(tvr, field_num);
235
+ }
236
+ }
237
+ return tv;
238
+ }
239
+
240
+ /****************************************************************************
241
+ *
242
+ * TermVectorsWriter
243
+ *
244
+ ****************************************************************************/
245
+
246
+ TermVectorsWriter *tvw_open(Store *store, const char *segment, FieldInfos *fis)
247
+ {
248
+ TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
249
+ char file_name[SEGMENT_NAME_MAX_LENGTH];
250
+ tvw->fis = fis;
251
+ tvw->fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
252
+
253
+ sprintf(file_name, "%s.tvx", segment);
254
+ tvw->tvx_out = store->new_output(store, file_name);
255
+
256
+ sprintf(file_name, "%s.tvd", segment);
257
+ tvw->tvd_out = store->new_output(store, file_name);
258
+
259
+ return tvw;
260
+ }
261
+
262
+ void tvw_close(TermVectorsWriter *tvw)
263
+ {
264
+ os_close(tvw->tvx_out);
265
+ os_close(tvw->tvd_out);
266
+ ary_free(tvw->fields);
267
+ free(tvw);
268
+ }
269
+
270
+ void tvw_open_doc(TermVectorsWriter *tvw)
271
+ {
272
+ ary_size(tvw->fields) = 0;
273
+ tvw->tvd_ptr = os_pos(tvw->tvd_out);
274
+ os_write_u64(tvw->tvx_out, tvw->tvd_ptr);
275
+ }
276
+
277
+ void tvw_close_doc(TermVectorsWriter *tvw)
278
+ {
279
+ int i;
280
+ OutStream *tvd_out = tvw->tvd_out;
281
+ os_write_u32(tvw->tvx_out, (f_u32)(os_pos(tvw->tvd_out) - tvw->tvd_ptr));
282
+ os_write_vint(tvd_out, ary_size(tvw->fields));
283
+ for (i = 0; i < ary_size(tvw->fields); i++) {
284
+ os_write_vint(tvd_out, tvw->fields[i].field_num);
285
+ os_write_vint(tvd_out, tvw->fields[i].size);
286
+ }
287
+ }
288
+
289
+ void tvw_add_postings(TermVectorsWriter *tvw,
290
+ int field_num,
291
+ PostingList **plists,
292
+ int posting_count,
293
+ Offset *offsets,
294
+ int offset_count)
295
+ {
296
+ int i, delta_start, delta_length;
297
+ const char *last_term = EMPTY_STRING;
298
+ off_t tvd_start_pos = os_pos(tvw->tvd_out);
299
+ OutStream *tvd_out = tvw->tvd_out;
300
+ PostingList *plist;
301
+ Posting *posting;
302
+ Occurence *occ;
303
+ FieldInfo *fi = tvw->fis->fields[field_num];
304
+ int store_positions = fi_store_positions(fi);
305
+
306
+ ary_grow(tvw->fields);
307
+ ary_last(tvw->fields).field_num = field_num;
308
+
309
+ os_write_vint(tvd_out, posting_count);
310
+ for (i = 0; i < posting_count; i++) {
311
+ plist = plists[i];
312
+ posting = plist->last;
313
+ delta_start = hlp_string_diff(last_term, plist->term);
314
+ delta_length = plist->term_len - delta_start;
315
+
316
+ os_write_vint(tvd_out, delta_start); /* write shared prefix length */
317
+ os_write_vint(tvd_out, delta_length); /* write delta length */
318
+ /* write delta chars */
319
+ os_write_bytes(tvd_out,
320
+ (uchar *)(plist->term + delta_start),
321
+ delta_length);
322
+ os_write_vint(tvd_out, posting->freq);
323
+ last_term = plist->term;
324
+
325
+ if (store_positions) {
326
+ /* use delta encoding for positions */
327
+ int last_pos = 0;
328
+ for (occ = posting->first_occ; occ; occ = occ->next) {
329
+ os_write_vint(tvd_out, occ->pos - last_pos);
330
+ last_pos = occ->pos;
331
+ }
332
+ }
333
+
334
+ }
335
+
336
+ if (fi_store_offsets(fi)) {
337
+ /* use delta encoding for offsets */
338
+ int last_end = 0;
339
+ os_write_vint(tvd_out, offset_count); /* write shared prefix length */
340
+ for (i = 0; i < offset_count; i++) {
341
+ int start = offsets[i].start;
342
+ int end = offsets[i].end;
343
+ os_write_vint(tvd_out, start - last_end);
344
+ os_write_vint(tvd_out, end - start);
345
+ last_end = end;
346
+ }
347
+ }
348
+
349
+ ary_last(tvw->fields).size = os_pos(tvd_out) - tvd_start_pos;
350
+ }
351
+
352
+
data/ext/threading.h ADDED
@@ -0,0 +1,31 @@
1
+ #ifndef FRT_THREADING_H
2
+ #define FRT_THREADING_H
3
+
4
+ #include "hash.h"
5
+ #define UNTHREADED 1
6
+
7
+ typedef void * mutex_t;
8
+ typedef struct HashTable *thread_key_t;
9
+ typedef int thread_once_t;
10
+ #define MUTEX_INITIALIZER NULL
11
+ #define MUTEX_RECURSIVE_INITIALIZER NULL
12
+ #define THREAD_ONCE_INIT 1;
13
+ #define mutex_init(a, b)
14
+ #define mutex_lock(a)
15
+ #define mutex_trylock(a)
16
+ #define mutex_unlock(a)
17
+ #define mutex_destroy(a)
18
+ #define thread_key_create(a, b) frt_thread_key_create(a, b)
19
+ #define thread_key_delete(a) frt_thread_key_delete(a)
20
+ #define thread_setspecific(a, b) frt_thread_setspecific(a, b)
21
+ #define thread_getspecific(a) frt_thread_getspecific(a)
22
+ #define thread_exit(a)
23
+ #define thread_once(a, b) frt_thread_once(a, b)
24
+
25
+ void frt_thread_once(int *once_control, void (*init_routine)(void));
26
+ void frt_thread_key_create(thread_key_t *key, void (*destr_function)(void *));
27
+ void frt_thread_key_delete(thread_key_t key);
28
+ void frt_thread_setspecific(thread_key_t key, const void *pointer);
29
+ void *frt_thread_getspecific(thread_key_t key);
30
+
31
+ #endif
data/ext/win32.h ADDED
@@ -0,0 +1,54 @@
1
+ #include "global.h"
2
+
3
+ #ifndef FRT_WIN32_H
4
+ #define FRT_WIN32_H
5
+
6
+ #include <io.h>
7
+
8
/*
 * Minimal directory entry for the Win32 directory functions in this
 * header: it only carries the entry's name.
 */
struct dirent
{
    char *d_name; /* set by readdir() to point at the find data's name */
};
12
+
13
+ typedef struct DIR
14
+ {
15
+ struct _finddata_t find_data;
16
+ struct dirent de;
17
+ long handle;
18
+ } DIR;
19
+
20
+ DIR *opendir(const char *dirname)
21
+ {
22
+ DIR *d = ALLOC_AND_ZERO(DIR);
23
+ char dirname_buf[MAX_FILE_PATH];
24
+ long ff_res;
25
+ sprintf(dirname_buf, "%s\\*", dirname);
26
+ ff_res = _findfirst(dirname_buf, &d->find_data);
27
+ if (ff_res < 0) {
28
+ free(d);
29
+ d = NULL;
30
+ } else {
31
+ d->de.d_name = NULL;
32
+ d->handle = ff_res;
33
+ }
34
+ return d;
35
+ }
36
+
37
+ struct dirent *readdir(DIR *d)
38
+ {
39
+ /* _findfirst already returned so do _findnext */
40
+ if (d->de.d_name != NULL) {
41
+ if (_findnext(d->handle, &d->find_data) < 0) {
42
+ return NULL;
43
+ }
44
+ }
45
+ d->de.d_name = d->find_data.name;
46
+ return &d->de;
47
+ }
48
+
49
+ void closedir(DIR *d)
50
+ {
51
+ _findclose(d->handle);
52
+ free(d);
53
+ }
54
+ #endif
data/lib/ferret.rb CHANGED
@@ -21,20 +21,8 @@
21
21
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
- module Ferret
25
- VERSION = '0.9.6'
26
- end
27
-
28
- # try and load the C extension but it isn't necessary.
29
- begin
30
- require 'ferret_ext'
31
- rescue Exception => e
32
- require 'ferret/utils'
33
- require 'ferret/document'
34
- require 'ferret/stemmers'
35
- require 'ferret/analysis'
36
- require 'ferret/store'
37
- require 'ferret/index'
38
- require 'ferret/search'
39
- require 'ferret/query_parser'
40
- end
24
+ $: << File.expand_path(File.join(File.dirname(__FILE__), "../ext"))
25
+ require 'ferret_ext'
26
+ require 'ferret_version'
27
+ require 'ferret/document'
28
+ require 'ferret/index'