ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/bitvector.c CHANGED
@@ -1,168 +1,526 @@
1
- #include <bitvector.h>
1
+ #include "bitvector.h"
2
2
  #include <string.h>
3
3
 
4
- BitVector *bv_create_size(int size)
4
+ BitVector *bv_new_capa(int capa)
5
5
  {
6
- BitVector *bv = ALLOC(BitVector);
6
+ BitVector *bv = ALLOC(BitVector);
7
7
 
8
- bv->capa = (size >> 3) + 1;
9
- bv->bits = ALLOC_N(uchar, bv->capa);
10
- memset(bv->bits, 0, bv->capa);
8
+ /* The capacity passed by the user is the number of bits allowed; however, we
9
+ * store capacity as the number of words (U32) allocated. */
10
+ bv->capa = (capa >> 5) + 1;
11
+ bv->bits = ALLOC_AND_ZERO_N(f_u32, bv->capa);
11
12
 
12
- bv->size = 0;
13
- bv->count = 0;
14
- bv->curr_bit = -1;
15
- return bv;
13
+ bv->size = 0;
14
+ bv->count = 0;
15
+ bv->curr_bit = -1;
16
+ bv->extends_as_ones = 0;
17
+ return bv;
16
18
  }
17
19
 
18
- BitVector *bv_create()
20
+ BitVector *bv_new()
19
21
  {
20
- return bv_create_size(BV_INIT_CAPA);
22
+ return bv_new_capa(BV_INIT_CAPA);
21
23
  }
22
24
 
23
- void bv_destroy(BitVector *bv)
25
+ void bv_destroy(BitVector * bv)
24
26
  {
25
- free(bv->bits);
26
- free(bv);
27
+ free(bv->bits);
28
+ free(bv);
27
29
  }
28
30
 
29
- void bv_set(BitVector *bv, int bit)
31
+ void bv_set(BitVector * bv, int bit)
30
32
  {
31
- uchar *byte_p;
32
- int byte = bit>>3;
33
- uchar bitmask = 1<<(bit&7);
34
- if (bv->size <= byte) {
35
- bv->size = byte + 1;
36
- if (bv->size >= bv->capa) {
37
- int capa = bv->capa * 2;
38
- while (capa < bv->size) capa *= 2;
39
- REALLOC_N(bv->bits, uchar, capa);
40
- memset(bv->bits + bv->capa, 0, capa - bv->capa);
41
- bv->capa = capa;
33
+ f_u32 *word_p;
34
+ int word = bit >> 5;
35
+ f_u32 bitmask = 1 << (bit & 31);
36
+
37
+ /* Check to see if we need to grow the BitVector */
38
+ if (bit >= bv->size) {
39
+ bv->size = bit + 1; /* size is max range of bits set */
40
+ if (word >= bv->capa) {
41
+ int capa = bv->capa << 1;
42
+ while (capa <= word) {
43
+ capa <<= 1;
44
+ }
45
+ REALLOC_N(bv->bits, f_u32, capa);
46
+ memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
47
+ sizeof(f_u32) * (capa - bv->capa));
48
+ bv->capa = capa;
49
+ }
42
50
  }
43
- }
44
- byte_p = &(bv->bits[byte]);
45
- if ((bitmask & *byte_p) == 0) {
51
+
52
+ /* Set the required bit */
53
+ word_p = &(bv->bits[word]);
54
+ if ((bitmask & *word_p) == 0) {
55
+ bv->count++; /* update count */
56
+ *word_p |= bitmask;
57
+ }
58
+ }
59
+
60
+ /*
61
+ * This method relies on the fact that enough space has been set for the bits
62
+ * to be set. You need to create the BitVector using bv_new_capa(capa) with
63
+ * a capacity larger than any bit being set.
64
+ */
65
+ void bv_set_fast(BitVector * bv, int bit)
66
+ {
46
67
  bv->count++;
47
- *byte_p |= bitmask;
48
- }
68
+ bv->size = bit;
69
+ bv->bits[bit >> 5] |= 1 << (bit & 31);
49
70
  }
50
71
 
51
- int bv_get(BitVector *bv, int bit)
72
+ int bv_get(BitVector * bv, int bit)
52
73
  {
53
- int byte = bit>>3;
54
- if (byte >= bv->size) return 0;
55
- return (bv->bits[byte]>>(bit&7))&1;
74
+ /* out of range, so the bit was never stored; return the implicit extension value (extends_as_ones) */
75
+ if (bit >= bv->size) {
76
+ return bv->extends_as_ones;
77
+ }
78
+ return (bv->bits[bit >> 5] >> (bit & 31)) & 0x01;
56
79
  }
57
80
 
58
- void bv_clear(BitVector *bv)
81
+ void bv_clear(BitVector * bv)
59
82
  {
60
- memset(bv->bits, 0, bv->size);
61
- bv->count = 0;
83
+ memset(bv->bits, 0, bv->capa * sizeof(f_u32));
84
+ bv->extends_as_ones = 0;
85
+ bv->count = 0;
86
+ bv->size = 0;
62
87
  }
63
88
 
64
- void bv_unset(BitVector *bv, int bit)
89
+ /*
90
+ * FIXME: if the top set bit is unset, size is not adjusted. This will not
91
+ * cause any bugs in this code but could cause problems if users are relying
92
+ * on the fact that size is accurate.
93
+ */
94
+ void bv_unset(BitVector * bv, int bit)
95
+ {
96
+ f_u32 *word_p;
97
+ f_u32 bitmask;
98
+ int word = bit >> 5;
99
+
100
+ if (bit >= bv->size) {
101
+ bv->size = bit + 1; /* size is max range of bits set */
102
+ if (word >= bv->capa) {
103
+ int capa = bv->capa << 1;
104
+
105
+ while (capa <= word) {
106
+ capa <<= 1;
107
+ }
108
+ REALLOC_N(bv->bits, f_u32, capa);
109
+ memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
110
+ sizeof(f_u32) * (capa - bv->capa));
111
+ bv->capa = capa;
112
+ }
113
+ }
114
+
115
+ word_p = &(bv->bits[word]);
116
+ bitmask = 1 << (bit & 31);
117
+ if ((bitmask & *word_p) > 0) {
118
+ bv->count--; /* update count */
119
+ *word_p &= ~bitmask;
120
+ }
121
+ }
122
+
123
+ /* Table of bits per char. This table is used by the bv_recount method to
124
+ * optimize the counting of bits */
125
+ static const uchar BYTE_COUNTS[] = {
126
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
127
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
128
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
129
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
130
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
131
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
132
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
133
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
134
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
135
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
136
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
137
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
138
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
139
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
140
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
141
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
142
+ };
143
+
144
+ int bv_recount(BitVector * bv)
65
145
  {
66
- uchar *byte_p;
67
- uchar bitmask;
68
- int byte = bit>>3;
69
-
70
- if (byte >= bv->size) return;
71
-
72
- byte_p = &(bv->bits[byte]);
73
- bitmask = 1<<(bit&7);
74
- if ((bitmask & *byte_p) > 0) {
75
- bv->count--;
76
- *byte_p &= ~bitmask;
77
- }
146
+ /* recompute the cached count from the stored bits, e.g. if the vector has been modified */
147
+ int i, c = 0;
148
+ uchar *bytes = (uchar *)bv->bits; /* count by character */
149
+ const int num_bytes = (((bv->size >> 5) + 1) << 2);
150
+ if (bv->extends_as_ones) {
151
+ for (i = 0; i < num_bytes; i++) {
152
+ c += BYTE_COUNTS[~(bytes[i]) & 0xFF]; /* sum bits per char */
153
+ }
154
+ }
155
+ else {
156
+ for (i = 0; i < num_bytes; i++) {
157
+ c += BYTE_COUNTS[bytes[i]]; /* sum bits per char */
158
+ }
159
+ }
160
+ bv->count = c;
161
+ return c;
78
162
  }
79
163
 
80
- void bv_write(BitVector *bv, Store *store, char *name)
164
+ void bv_scan_reset(BitVector * bv)
81
165
  {
82
- OutStream *os = store->create_output(store, name);
83
- os_write_vint(os, bv->size);
84
- os_write_bytes(os, bv->bits, bv->size);
85
- os_close(os);
166
+ bv->curr_bit = -1;
86
167
  }
87
168
 
88
- const uchar BYTE_COUNTS[] = { // table of bits/char
89
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
90
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
91
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
98
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
99
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
100
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
101
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
104
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
169
+ /* Table showing the number of trailing 0s in a char. This is used to optimize
170
+ * the bv_scan_next method. */
171
+ const int NUM_TRAILING_ZEROS[] = {
172
+ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
173
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
174
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
175
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
176
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
177
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
178
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
179
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
180
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
181
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
182
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
183
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
184
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
185
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
186
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
187
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
105
188
  };
106
189
 
107
- int bv_count(BitVector *bv)
190
+ /*
191
+ * This method is highly optimized, hence the loop unrolling
192
+ */
193
+ static inline int bv_get_1_offset(f_u32 word)
194
+ {
195
+ if (word & 0xff) {
196
+ return NUM_TRAILING_ZEROS[word & 0xff];
197
+ }
198
+ else {
199
+ word >>= 8;
200
+ if (word & 0xff) {
201
+ return NUM_TRAILING_ZEROS[word & 0xff] + 8;
202
+ }
203
+ else {
204
+ word >>= 8;
205
+ if (word & 0xff) {
206
+ return NUM_TRAILING_ZEROS[word & 0xff] + 16;
207
+ }
208
+ else {
209
+ word >>= 8;
210
+ return NUM_TRAILING_ZEROS[word & 0xff] + 24;
211
+ }
212
+ }
213
+ }
214
+ }
215
+ /*
216
+ * second fastest;
217
+ *
218
+ * while ((inc = NUM_TRAILING_ZEROS[word & 0xff]) == 8) {
219
+ * word >>= 8;
220
+ * bit_pos += 8;
221
+ * }
222
+ *
223
+ * third fastest;
224
+ *
225
+ * bit_pos += inc;
226
+ * if ((word & 0xffff) == 0) {
227
+ * bit_pos += 16;
228
+ * word >>= 16;
229
+ * }
230
+ * if ((word & 0xff) == 0) {
231
+ * bit_pos += 8;
232
+ * word >>= 8;
233
+ * }
234
+ * bit_pos += NUM_TRAILING_ZEROS[word & 0xff];
235
+ */
236
+
237
+ int bv_scan_next_from(BitVector * bv, register const int from)
238
+ {
239
+ register const f_u32 *const bits = bv->bits;
240
+ register const int word_size = (bv->size >> 5) + 1;
241
+ register int word_pos = from >> 5;
242
+ register int bit_pos = (from & 31);
243
+ register f_u32 word = bits[word_pos] >> bit_pos;
244
+
245
+ if (from >= bv->size) {
246
+ return -1;
247
+ }
248
+ if (word == 0) {
249
+ bit_pos = 0;
250
+ do {
251
+ word_pos++;
252
+ if (word_pos >= word_size) {
253
+ return -1;
254
+ }
255
+ } while (bits[word_pos] == 0);
256
+ word = bits[word_pos];
257
+ }
258
+
259
+ /* check the word a byte at a time as the NUM_TRAILING_ZEROS table would
260
+ * be too large for a 32-bit integer or even a 16-bit integer */
261
+ bit_pos += bv_get_1_offset(word);
262
+
263
+ return bv->curr_bit = ((word_pos << 5) + bit_pos);
264
+ }
265
+
266
+ int bv_scan_next(BitVector * bv)
267
+ {
268
+ return bv_scan_next_from(bv, bv->curr_bit + 1);
269
+ }
270
+
271
+ int bv_scan_next_unset_from(BitVector * bv, register const int from)
272
+ {
273
+ register const f_u32 *const bits = bv->bits;
274
+ register const int word_size = (bv->size >> 5) + 1;
275
+ register int word_pos = from >> 5;
276
+ register int bit_pos = (from & 31);
277
+ register f_u32 word = ~(~(bits[word_pos]) >> bit_pos);
278
+
279
+ if (from >= bv->size) {
280
+ return -1;
281
+ }
282
+ if (word == 0xFFFFFFFF) {
283
+ bit_pos = 0;
284
+ do {
285
+ word_pos++;
286
+ if (word_pos >= word_size) {
287
+ return -1;
288
+ }
289
+ } while (bits[word_pos] == 0xFFFFFFFF);
290
+ word = bits[word_pos];
291
+ }
292
+
293
+ bit_pos += bv_get_1_offset(~word);
294
+
295
+ return bv->curr_bit = ((word_pos << 5) + bit_pos);
296
+ }
297
+
298
+ int bv_scan_next_unset(BitVector * bv)
299
+ {
300
+ return bv_scan_next_unset_from(bv, bv->curr_bit + 1);
301
+ }
302
+
303
+ int bv_eq(BitVector *bv1, BitVector *bv2)
304
+ {
305
+ if (bv1 == bv2) {
306
+ return true;
307
+ }
308
+ else if (bv1->extends_as_ones != bv2->extends_as_ones) {
309
+ return false;
310
+ }
311
+ else {
312
+ f_u32 *bits = bv1->bits;
313
+ f_u32 *bits2 = bv2->bits;
314
+ int min_size = min2(bv1->size, bv2->size);
315
+ int word_size = (min_size >> 5) + 1;
316
+ int ext_word_size = 0;
317
+
318
+ int i;
319
+
320
+ for (i = 0; i < word_size; i++) {
321
+ if (bits[i] != bits2[i]) {
322
+ return false;
323
+ }
324
+ }
325
+ if (bv1->size > min_size) {
326
+ bits = bv1->bits;
327
+ ext_word_size = (bv1->size >> 5) + 1;
328
+ }
329
+ else if (bv2->size > min_size) {
330
+ bits = bv2->bits;
331
+ ext_word_size = (bv2->size >> 5) + 1;
332
+ }
333
+ if (ext_word_size) {
334
+ const f_u32 expected = (bv1->extends_as_ones ? 0xFFFFFFFF : 0);
335
+ for (i = word_size; i < ext_word_size; i++) {
336
+ if (bits[i] != expected) {
337
+ return false;
338
+ }
339
+ }
340
+ }
341
+ }
342
+ return true;
343
+ }
344
+
345
+ ulong bv_hash(BitVector *bv)
108
346
  {
109
- // if the vector has been modified
110
- int i, c = 0;
111
- uchar *bytes = bv->bits;
112
- for (i = 0; i < bv->size; i++)
113
- c += BYTE_COUNTS[bytes[i]]; // sum bits per char
114
- bv->count = c;
115
- return c;
347
+ ulong hash = 0;
348
+ const f_u32 empty_word = bv->extends_as_ones ? 0xFFFFFFFF : 0;
349
+ int i;
350
+ for (i = (bv->size >> 5); i >= 0; i--) {
351
+ const f_u32 word = bv->bits[i];
352
+ if (word != empty_word) {
353
+ hash = (hash << 1) ^ word;
354
+ }
355
+ }
356
+ hash = (hash << 1) | bv->extends_as_ones;
357
+ return hash;
358
+ }
359
+
360
+ static BitVector *bv_and_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
361
+ {
362
+ int i;
363
+ int min_size = min2(bv1->size, bv2->size);
364
+ int word_size = (min_size >> 5) + 1;
365
+ int capa = 4;
366
+ while (capa < word_size) {
367
+ capa <<= 1;
368
+ }
369
+ REALLOC_N(bv->bits, f_u32, capa);
370
+ bv->capa = capa;
371
+ bv->size = min_size;
372
+
373
+ if (bv1->extends_as_ones && bv2->extends_as_ones) {
374
+ bv->extends_as_ones = true;
375
+ }
376
+ else {
377
+ bv->extends_as_ones = false;
378
+ }
379
+
380
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
381
+ sizeof(f_u32) * (capa - word_size));
382
+
383
+ for (i = 0; i < word_size; i++) {
384
+ bv->bits[i] = bv1->bits[i] & bv2->bits[i];
385
+ }
386
+ bv_recount(bv);
387
+ return bv;
388
+ }
389
+
390
+ BitVector *bv_and(BitVector *bv1, BitVector *bv2)
391
+ {
392
+ return bv_and_i(bv_new(), bv1, bv2);
116
393
  }
117
394
 
118
- BitVector *bv_read(Store *store, char *name)
395
+ BitVector *bv_and_x(BitVector *bv1, BitVector *bv2)
119
396
  {
120
- BitVector *bv = ALLOC(BitVector);
121
- InStream *is = store->open_input(store, name);
122
- bv->capa = bv->size = (int)is_read_vint(is);
123
- bv->bits = ALLOC_N(uchar, bv->capa);
124
- is_read_bytes(is, bv->bits, 0, bv->size);
125
- is_close(is);
126
- bv_count(bv);
127
- return bv;
397
+ return bv_and_i(bv1, bv1, bv2);
128
398
  }
129
399
 
130
- void bv_scan_reset(BitVector *bv)
400
+ static inline void bv_recapa(BitVector *bv, int new_capa)
131
401
  {
132
- bv->curr_bit = -1;
402
+ if (bv->capa < new_capa) {
403
+ REALLOC_N(bv->bits, f_u32, new_capa);
404
+ memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
405
+ sizeof(f_u32) * (new_capa - bv->capa));
406
+ bv->capa = new_capa;
407
+ }
133
408
  }
134
409
 
135
- inline int bv_scan_next_from(BitVector *bv, register const int from)
410
+ static BitVector *bv_or_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
136
411
  {
137
- register const uchar *const bits = bv->bits;
138
- register const int size = bv->size;
139
- register int byte_pos = (from) >> 3;
140
- register int inc = ((from) & 7);
141
- register int bit = 1 << inc;
142
- register int mask = 0xff << inc;
143
- register int byte;
412
+ int i;
413
+ int max_size = max2(bv1->size, bv2->size);
414
+ int word_size = (max_size >> 5) + 1;
415
+ int capa = 4;
416
+ while (capa < word_size) {
417
+ capa <<= 1;
418
+ }
419
+ REALLOC_N(bv->bits, f_u32, capa);
420
+ bv->capa = capa;
421
+ bv->size = max_size;
144
422
 
145
- if (byte_pos >= size) return -1;
146
- if ((bits[byte_pos]&mask) == 0) {
147
- inc = 0;
148
- bit = 1;
149
- do {
150
- byte_pos++;
151
- if (byte_pos >= size) return -1;
152
- } while (bits[byte_pos] == 0);
153
- }
423
+ bv_recapa(bv1, capa);
424
+ bv_recapa(bv2, capa);
154
425
 
155
- byte = bits[byte_pos];
156
- while ((byte & bit) == 0) {
157
- bit <<= 1;
158
- inc++;
159
- }
426
+ if (bv1->extends_as_ones || bv2->extends_as_ones) {
427
+ bv->extends_as_ones = true;
428
+ }
429
+ else {
430
+ bv->extends_as_ones = false;
431
+ }
432
+
433
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
434
+ sizeof(f_u32) * (capa - word_size));
160
435
 
161
- return bv->curr_bit = ((byte_pos << 3) + inc);
436
+ for (i = 0; i < word_size; i++) {
437
+ bv->bits[i] = bv1->bits[i] | bv2->bits[i];
438
+ }
439
+ bv_recount(bv);
440
+ return bv;
162
441
  }
163
442
 
164
- inline int bv_scan_next(BitVector *bv)
443
+ BitVector *bv_or(BitVector *bv1, BitVector *bv2)
165
444
  {
166
- return bv_scan_next_from(bv, bv->curr_bit+1);
445
+ return bv_or_i(bv_new(), bv1, bv2);
167
446
  }
168
447
 
448
+ BitVector *bv_or_x(BitVector *bv1, BitVector *bv2)
449
+ {
450
+ return bv_or_i(bv1, bv1, bv2);
451
+ }
452
+
453
+ static BitVector *bv_xor_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
454
+ {
455
+ int i;
456
+ int max_size = max2(bv1->size, bv2->size);
457
+ int word_size = (max_size >> 5) + 1;
458
+ int capa = 4;
459
+ while (capa < word_size) {
460
+ capa <<= 1;
461
+ }
462
+ REALLOC_N(bv->bits, f_u32, capa);
463
+ bv->capa = capa;
464
+ bv->size = max_size;
465
+
466
+ bv_recapa(bv1, capa);
467
+ bv_recapa(bv2, capa);
468
+
469
+ if (bv1->extends_as_ones != bv2->extends_as_ones) {
470
+ bv->extends_as_ones = true;
471
+ }
472
+ else {
473
+ bv->extends_as_ones = false;
474
+ }
475
+
476
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
477
+ sizeof(f_u32) * (capa - word_size));
478
+
479
+ for (i = 0; i < word_size; i++) {
480
+ bv->bits[i] = bv1->bits[i] ^ bv2->bits[i];
481
+ }
482
+ bv_recount(bv);
483
+ return bv;
484
+ }
485
+
486
+ BitVector *bv_xor(BitVector *bv1, BitVector *bv2)
487
+ {
488
+ return bv_xor_i(bv_new(), bv1, bv2);
489
+ }
490
+
491
+ BitVector *bv_xor_x(BitVector *bv1, BitVector *bv2)
492
+ {
493
+ return bv_xor_i(bv1, bv1, bv2);
494
+ }
495
+
496
+ static BitVector *bv_not_i(BitVector *bv, BitVector *bv1)
497
+ {
498
+ int i;
499
+ int word_size = (bv1->size >> 5) + 1;
500
+ int capa = 4;
501
+ while (capa < word_size) {
502
+ capa <<= 1;
503
+ }
504
+ REALLOC_N(bv->bits, f_u32, capa);
505
+ bv->capa = capa;
506
+ bv->size = bv1->size;
507
+ bv->extends_as_ones = 1 - bv1->extends_as_ones;
508
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
509
+ sizeof(f_u32) * (capa - word_size));
510
+
511
+ for (i = 0; i < word_size; i++) {
512
+ bv->bits[i] = ~(bv1->bits[i]);
513
+ }
514
+ bv_recount(bv);
515
+ return bv;
516
+ }
517
+
518
+ BitVector *bv_not(BitVector *bv1)
519
+ {
520
+ return bv_not_i(bv_new(), bv1);
521
+ }
522
+
523
+ BitVector *bv_not_x(BitVector *bv1)
524
+ {
525
+ return bv_not_i(bv1, bv1);
526
+ }