ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/defines.h ADDED
@@ -0,0 +1,49 @@
1
+ #ifndef FRT_DEFINES_H
2
+ #define FRT_DEFINES_H
3
+
4
+ #include <sys/types.h>
5
+ #include "posh.h"
6
+
7
+ #ifndef false
8
+ #define false 0
9
+ #endif
10
+ #ifndef true
11
+ #define true 1
12
+ #endif
13
+
14
+ typedef unsigned int bool;
15
+ typedef unsigned char uchar;
16
+ typedef unsigned int uint;
17
+ typedef unsigned long int ulong;
18
+
19
+ typedef posh_u16_t f_u16;
20
+ typedef posh_i16_t f_i16;
21
+ typedef posh_u32_t f_u32;
22
+ typedef posh_i32_t f_i32;
23
+ typedef posh_u64_t f_u64;
24
+ typedef posh_i64_t f_i64;
25
+
26
+ #if defined(_FILE_OFFSET_BITS) && (_FILE_OFFSET_BITS == 64)
27
+ #define F_OFF_T_PFX "ll"
28
+ #else
29
+ #define F_OFF_T_PFX "l"
30
+ #endif
31
+
32
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
33
+ #define FRT_HAS_INLINE
34
+ #define FRT_IS_C99
35
+ #define FRT_HAS_ISO_VARARGS
36
+ #define FRT_HAS_VARARGS
37
+ #endif
38
+
39
+ #if defined(__GNUC__) && !defined(__STRICT_ANSI__)
40
+ #define FRT_HAS_INLINE
41
+ #define FRT_HAS_GNUC_VARARGS
42
+ #define FRT_HAS_VARARGS
43
+ #endif
44
+
45
+ #ifndef FRT_HAS_INLINE
46
+ # define inline
47
+ #endif
48
+
49
+ #endif
data/ext/document.c CHANGED
@@ -7,192 +7,86 @@
7
7
  *
8
8
  ****************************************************************************/
9
9
 
10
- static char * const INDEX_NO_STORE_NO_MSG = "it doesn't make sense to have a field that is neither indexed nor stored";
11
- static char * const INDEX_NO_TV_YES_MSG = "cannot store term vector information for a field that is not indexed";
12
- static char * const INVALID_STORE_VAL_MSG = "Invalid value for store in document field";
13
- static char * const INVALID_INDEX_VAL_MSG = "Invalid value for index in document field";
14
- static char * const INVALID_TV_VAL_MSG = "Invalid value for term_vector in document field";
15
- static char * const BIN_FIELD_STORE_NO_MSG = "It doesn't make sense not to store binary data";
16
-
17
- /**
18
- * @throws ARG_ERROR
19
- */
20
- inline void df_set(DocField *df, const char *name,
21
- char *data, int store, int index, int tv)
10
+ DocField *df_new(const char *name)
22
11
  {
23
- if ((index == DF_INDEX_NO) && (store == DF_STORE_NO))
24
- RAISE(ARG_ERROR, INDEX_NO_STORE_NO_MSG);
25
- if ((index == DF_INDEX_NO) && (tv != DF_TERM_VECTOR_NO))
26
- RAISE(ARG_ERROR, INDEX_NO_TV_YES_MSG);
27
- df->name = estrdup(name);
28
- df->data = data;
29
- df->blen = (int)strlen(data);
30
- df_set_store(df, store);
31
- df_set_index(df, index);
32
- df_set_term_vector(df, tv);
33
- df->is_binary = false;
34
- df->boost = 1.0;
12
+ DocField *df = ALLOC(DocField);
13
+ df->name = estrdup(name);
14
+ df->size = 0;
15
+ df->capa = DF_INIT_CAPA;
16
+ df->data = ALLOC_N(char *, df->capa);
17
+ df->lengths = ALLOC_N(int, df->capa);
18
+ df->destroy_data = false;
19
+ df->boost = 1.0;
20
+ return df;
35
21
  }
36
22
 
37
- /*
38
- * @throws ARG_ERROR
39
- */
40
- DocField *df_create(const char *name, char *data, int store, int index, int tv)
23
+ DocField *df_add_data_len(DocField *df, char *data, int len)
41
24
  {
42
- DocField *df = ALLOC(DocField);
43
- df_set(df, name, data, store, index, tv);
44
- return df;
25
+ if (df->size >= df->capa) {
26
+ df->capa <<= 2;
27
+ REALLOC_N(df->data, char *, df->capa);
28
+ REALLOC_N(df->lengths, int, df->capa);
29
+ }
30
+ df->data[df->size] = data;
31
+ df->lengths[df->size] = len;
32
+ df->size++;
33
+ return df;
45
34
  }
46
35
 
47
- DocField *df_clone(DocField *self)
36
+ DocField *df_add_data(DocField *df, char *data)
48
37
  {
49
- DocField *clone = ALLOC(DocField);
50
- memcpy(clone, self, sizeof(DocField));
51
- clone->name = estrdup(self->name);
52
- clone->data = estrdup(self->data);
53
- return clone;
38
+ return df_add_data_len(df, data, strlen(data));
54
39
  }
55
40
 
56
41
  void df_destroy(DocField *df)
57
42
  {
58
- free(df->name);
59
- free(df);
60
- }
61
-
62
- void df_destroy_data(DocField *df)
63
- {
64
- free(df->data);
65
- free(df->name);
66
- free(df);
67
- }
68
-
69
- /*
70
- * @throws ARG_ERROR
71
- */
72
- void df_set_store(DocField *df, int store)
73
- {
74
- switch (store) {
75
- case DF_STORE_YES:
76
- df->is_stored = true;
77
- df->is_compressed = false;
78
- break;
79
- case DF_STORE_NO:
80
- df->is_stored = false;
81
- df->is_compressed = false;
82
- break;
83
- case DF_STORE_COMPRESS:
84
- df->is_stored = true;
85
- df->is_compressed = true;
86
- break;
87
- default:
88
- RAISE(ARG_ERROR, INVALID_STORE_VAL_MSG);
89
- }
90
- }
91
-
92
- /*
93
- * @throws ARG_ERROR
94
- */
95
- void df_set_index(DocField *df, int index)
96
- {
97
- df->omit_norms = false;
98
- switch (index) {
99
- case DF_INDEX_NO:
100
- df->is_indexed = false;
101
- df->is_tokenized = false;
102
- break;
103
- case DF_INDEX_TOKENIZED:
104
- df->is_indexed = true;
105
- df->is_tokenized = true;
106
- break;
107
- case DF_INDEX_UNTOKENIZED:
108
- df->is_indexed = true;
109
- df->is_tokenized = false;
110
- break;
111
- case DF_INDEX_NO_NORMS:
112
- df->is_indexed = true;
113
- df->is_tokenized = false;
114
- df->omit_norms = true;
115
- break;
116
- default:
117
- RAISE(ARG_ERROR, INVALID_INDEX_VAL_MSG);
118
- }
119
- }
120
-
121
- /*
122
- * @throws ARG_ERROR
123
- */
124
- void df_set_term_vector(DocField *df, int tv)
125
- {
126
- switch (tv) {
127
- case DF_TERM_VECTOR_NO:
128
- df->store_tv = false;
129
- df->store_offset = false;
130
- df->store_pos = false;
131
- break;
132
- case DF_TERM_VECTOR_YES:
133
- df->store_tv = true;
134
- df->store_offset = false;
135
- df->store_pos = false;
136
- break;
137
- case DF_TERM_VECTOR_WITH_OFFSETS:
138
- df->store_tv = true;
139
- df->store_offset = true;
140
- df->store_pos = false;
141
- break;
142
- case DF_TERM_VECTOR_WITH_POSITIONS:
143
- df->store_tv = true;
144
- df->store_offset = false;
145
- df->store_pos = true;
146
- break;
147
- case DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS:
148
- df->store_tv = true;
149
- df->store_offset = true;
150
- df->store_pos = true;
151
- break;
152
- default:
153
- RAISE(ARG_ERROR, INVALID_TV_VAL_MSG);
154
- }
155
- }
156
-
157
- /*
158
- * @throws ARG_ERROR
159
- */
160
- DocField *df_create_binary(char *name, char *data, int blen, int store)
161
- {
162
- DocField *df;
163
-
164
- if (store == DF_STORE_NO) {
165
- RAISE(ARG_ERROR, BIN_FIELD_STORE_NO_MSG);
166
- }
167
-
168
- df = df_create(name, data, store, DF_INDEX_NO, DF_TERM_VECTOR_NO);
169
- df->is_binary = true;
170
- df->blen = blen;
171
- return df;
43
+ if (df->destroy_data) {
44
+ int i;
45
+ for (i = 0; i < df->size; i++) {
46
+ free(df->data[i]);
47
+ }
48
+ }
49
+ free(df->data);
50
+ free(df->lengths);
51
+ free(df->name);
52
+ free(df);
172
53
  }
173
54
 
174
- char *df_to_s(DocField *self)
55
+ char *df_to_s(DocField *df)
175
56
  {
176
- /* the length of the str is name.len + data.len + 119, add safety 10 */
177
- char *str = ALLOC_N(char, strlen(self->name) + strlen(self->data) + 129);
178
- char *str_ptr = str;
179
-
180
- if (self->is_stored) {
181
- sprintf(str, "stored/%s,", self->is_compressed ? "compressed" : "uncompressed");
182
- str_ptr = str + strlen(str);
183
- }
184
- sprintf(str_ptr, "%s%s%s%s%s%s%s<%s:%s>",
185
- self->is_indexed ? "indexed," : "",
186
- self->is_tokenized ? "tokenized," : "",
187
- self->store_tv ? "store_term_vector," : "",
188
- self->store_offset ? "store_offsets," : "",
189
- self->store_pos ? "store_positions," : "",
190
- self->omit_norms ? "omit_norms," : "",
191
- self->is_binary ? "binary," : "",
192
- self->name,
193
- self->is_binary ? "=bin_data=" : self->data);
194
-
195
- return str;
57
+ int i;
58
+ int len = strlen(df->name) + 10;
59
+ char *str, *s;
60
+ for (i = 0; i < df->size; i++) {
61
+ len += df->lengths[i] + 5;
62
+ }
63
+ s = str = ALLOC_N(char, len);
64
+ sprintf(str, "%s: ", df->name);
65
+ s += strlen(str);
66
+ if (df->size == 1) {
67
+ *(s++) = '"';
68
+ strncpy(s, df->data[0], df->lengths[0]);
69
+ s += df->lengths[0];
70
+ *(s++) = '"';
71
+ *(s++) = '\0';
72
+ }
73
+ else {
74
+ *(s++) = '[';
75
+ *(s++) = '"';
76
+ strncpy(s, df->data[0], df->lengths[0]);
77
+ s += df->lengths[0];
78
+ *(s++) = '"';
79
+ for (i = 1; i < df->size; i++) {
80
+ *(s++) = ',';
81
+ *(s++) = ' ';
82
+ *(s++) = '"';
83
+ strncpy(s, df->data[i], df->lengths[i]);
84
+ s += df->lengths[i];
85
+ *(s++) = '"';
86
+ }
87
+ sprintf(s, "]");
88
+ }
89
+ return str;
196
90
  }
197
91
 
198
92
  /****************************************************************************
@@ -201,166 +95,62 @@ char *df_to_s(DocField *self)
201
95
  *
202
96
  ****************************************************************************/
203
97
 
204
- Document *doc_create()
205
- {
206
- Document *doc = ALLOC(Document);
207
- doc->fields = h_new_str(&free, (free_ft)&ary_destroy);
208
- doc->fcnt = 0;
209
- doc->dfcnt = 0;
210
- doc->field_arr = NULL;
211
- doc->df_arr = NULL;
212
- doc->boost = 1.0;
213
- doc->free_data = (free_ft)&df_destroy_data;
214
- return doc;
215
- }
216
-
217
- Document *doc_create_keep_data()
98
+ Document *doc_new()
218
99
  {
219
- Document *doc = doc_create();
220
- doc->free_data = (free_ft)&df_destroy;
221
- return doc;
100
+ Document *doc = ALLOC(Document);
101
+ doc->field_dict = h_new_str(NULL, (free_ft)&df_destroy);
102
+ doc->size = 0;
103
+ doc->capa = DOC_INIT_CAPA;
104
+ doc->fields = ALLOC_N(DocField *, doc->capa);
105
+ doc->boost = 1.0;
106
+ return doc;
222
107
  }
223
108
 
224
- void doc_destroy(Document *doc)
109
+ DocField *doc_add_field(Document *doc, DocField *df)
225
110
  {
226
- int i;
227
- if (doc->free_data) {
228
- for (i = 0; i < doc->dfcnt; i++) {
229
- doc->free_data(doc->df_arr[i]);
111
+ if (!h_set_safe(doc->field_dict, df->name, df)) {
112
+ RAISE(EXCEPTION, "tried to add %s field which alread existed\n",
113
+ df->name);
230
114
  }
231
- }
232
- free(doc->field_arr);
233
- free(doc->df_arr);
234
- h_destroy(doc->fields);
235
- free(doc);
236
- }
237
-
238
- void doc_add_field(Document *doc, DocField *df)
239
- {
240
- Array *fields = (Array *)h_get(doc->fields, df->name);
241
- if (fields == NULL) {
242
- fields = ary_create(1, NULL);
243
- h_set(doc->fields, estrdup(df->name), fields);
244
- doc->fcnt++;
245
- REALLOC_N(doc->field_arr, Array *, doc->fcnt);
246
- doc->field_arr[doc->fcnt-1] = fields;
247
- }
248
- ary_append(fields, df);
249
- doc->dfcnt++;
250
- REALLOC_N(doc->df_arr, DocField *, doc->dfcnt);
251
- doc->df_arr[doc->dfcnt-1] = df;
252
- }
253
-
254
- DocField *doc_get_field(Document *doc, const char *fname)
255
- {
256
- Array *fields = (Array *)h_get(doc->fields, fname);
257
- if (fields) {
258
- return fields->elems[0];
259
- } else {
260
- return NULL;
261
- }
115
+ if (doc->size >= doc->capa) {
116
+ doc->capa <<= 1;
117
+ REALLOC_N(doc->fields, DocField *, doc->capa);
118
+ }
119
+ doc->fields[doc->size] = df;
120
+ doc->size++;
121
+ return df;
262
122
  }
263
123
 
264
- Array *doc_get_fields(Document *doc, const char *fname)
124
+ DocField *doc_get_field(Document *doc, const char *name)
265
125
  {
266
- return (Array *)h_get(doc->fields, fname);
126
+ return h_get(doc->field_dict, name);
267
127
  }
268
128
 
269
- /**
270
- * TODO:
271
- * This is not exactly elegant or efficient but it works and is not going to
272
- * be a performance problem. Still, it would be nice to make the code a little
273
- * clearer.
274
- */
275
- Array *doc_remove_fields(Document *doc, const char *fname)
129
+ char *doc_to_s(Document *doc)
276
130
  {
277
- Array *fields = (Array *)h_rem(doc->fields, fname, true);
278
- if (fields) {
279
- int i, j;
280
- doc->fcnt--;
281
- for (i = 0; i < doc->fcnt; i++) {
282
- if (fields == doc->field_arr[i]) {
283
- memmove(&doc->field_arr[i],
284
- &doc->field_arr[i+1],
285
- sizeof(void *) * (doc->fcnt - i));
286
- break;
287
- }
288
- }
289
- for (i = 0, j = 0; i < doc->dfcnt && j < fields->size;) {
290
- if (fields->elems[j] == doc->df_arr[i]) {
291
- memmove(&doc->df_arr[i],
292
- &doc->df_arr[i+1],
293
- sizeof(void *) * (doc->dfcnt - i - 1));
294
- j++;
295
- doc->dfcnt--;
296
- } else {
297
- i++;
298
- }
131
+ int i;
132
+ int len = 100;
133
+ char **fields = ALLOC_N(char *, doc->size);
134
+ char *buf, *s;
135
+ for (i = 0; i < doc->size; i++) {
136
+ fields[i] = df_to_s(doc->fields[i]);
137
+ len += strlen(fields[i]) + 10;
299
138
  }
300
- fields->free_elem = doc->free_data;
301
- return fields;
302
- } else {
303
- return NULL;
304
- }
305
- }
306
-
307
- DocField *doc_remove_field(Document *doc, const char *fname)
308
- {
309
- DocField *df = NULL;
310
- Array *dfs = (Array *)h_get(doc->fields, fname);
311
- if (dfs) {
312
- df = ary_remove(dfs, 0);
313
- if (dfs->size == 0) {
314
- Array *fields = doc_remove_fields(doc, fname);
315
- fields->free_elem = doc->free_data;
316
- ary_destroy(fields);
317
- } else {
318
- int i;
319
- for (i = 0; i < doc->dfcnt; i++) {
320
- if (df == doc->df_arr[i]) {
321
- memmove(&doc->df_arr[i],
322
- &doc->df_arr[i+1],
323
- sizeof(void *) * (doc->dfcnt - i - 1));
324
- doc->dfcnt--;
325
- break;
326
- }
327
- }
139
+ s = buf = ALLOC_N(char, len);
140
+ sprintf(buf, "Document [\n");
141
+ s += strlen(buf);
142
+ for (i = 0; i < doc->size; i++) {
143
+ sprintf(s, " =>%s\n", fields[i]);
144
+ free(fields[i]);
145
+ s += strlen(s);
328
146
  }
329
- }
330
- return df;
147
+ return buf;
331
148
  }
332
149
 
333
- bool doc_delete_fields(Document *doc, const char *fname)
150
+ void doc_destroy(Document *doc)
334
151
  {
335
- Array *fields = doc_remove_fields(doc, fname);
336
- if (fields) {
337
- fields->free_elem = doc->free_data;
338
- ary_destroy(fields);
339
- return true;
340
- } else {
341
- return false;
342
- }
152
+ h_destroy(doc->field_dict);
153
+ free(doc->fields);
154
+ free(doc);
343
155
  }
344
156
 
345
- char *doc_to_s(Document *doc)
346
- {
347
- int i, len = 20;
348
- char *str, *str_ptr;
349
- char **df_strs = ALLOC_N(char *, doc->dfcnt);
350
- for (i = 0; i < doc->dfcnt; i++) {
351
- df_strs[i] = df_to_s(doc->df_arr[i]);
352
- len += (int)strlen(df_strs[i]) + 3;
353
- }
354
- str_ptr = str = ALLOC_N(char, len);
355
- sprintf(str_ptr, "Document {\n");
356
- str_ptr += strlen(str_ptr);
357
- for (i = 0; i < doc->dfcnt; i++) {
358
- sprintf(str_ptr, " %s\n", df_strs[i]);
359
- free(df_strs[i]);
360
- str_ptr += strlen(str_ptr);
361
- }
362
- sprintf(str_ptr, "}");
363
- free(df_strs);
364
-
365
- return str;
366
- }