ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/bitvector.c CHANGED
@@ -1,168 +1,526 @@
1
- #include <bitvector.h>
1
+ #include "bitvector.h"
2
2
  #include <string.h>
3
3
 
4
- BitVector *bv_create_size(int size)
4
+ BitVector *bv_new_capa(int capa)
5
5
  {
6
- BitVector *bv = ALLOC(BitVector);
6
+ BitVector *bv = ALLOC(BitVector);
7
7
 
8
- bv->capa = (size >> 3) + 1;
9
- bv->bits = ALLOC_N(uchar, bv->capa);
10
- memset(bv->bits, 0, bv->capa);
8
+ /* The capacity passed by the user is number of bits allowed, however we
9
+ * store capacity as the number of words (U32) allocated. */
10
+ bv->capa = (capa >> 5) + 1;
11
+ bv->bits = ALLOC_AND_ZERO_N(f_u32, bv->capa);
11
12
 
12
- bv->size = 0;
13
- bv->count = 0;
14
- bv->curr_bit = -1;
15
- return bv;
13
+ bv->size = 0;
14
+ bv->count = 0;
15
+ bv->curr_bit = -1;
16
+ bv->extends_as_ones = 0;
17
+ return bv;
16
18
  }
17
19
 
18
- BitVector *bv_create()
20
+ BitVector *bv_new()
19
21
  {
20
- return bv_create_size(BV_INIT_CAPA);
22
+ return bv_new_capa(BV_INIT_CAPA);
21
23
  }
22
24
 
23
- void bv_destroy(BitVector *bv)
25
+ void bv_destroy(BitVector * bv)
24
26
  {
25
- free(bv->bits);
26
- free(bv);
27
+ free(bv->bits);
28
+ free(bv);
27
29
  }
28
30
 
29
- void bv_set(BitVector *bv, int bit)
31
+ void bv_set(BitVector * bv, int bit)
30
32
  {
31
- uchar *byte_p;
32
- int byte = bit>>3;
33
- uchar bitmask = 1<<(bit&7);
34
- if (bv->size <= byte) {
35
- bv->size = byte + 1;
36
- if (bv->size >= bv->capa) {
37
- int capa = bv->capa * 2;
38
- while (capa < bv->size) capa *= 2;
39
- REALLOC_N(bv->bits, uchar, capa);
40
- memset(bv->bits + bv->capa, 0, capa - bv->capa);
41
- bv->capa = capa;
33
+ f_u32 *word_p;
34
+ int word = bit >> 5;
35
+ f_u32 bitmask = 1 << (bit & 31);
36
+
37
+ /* Check to see if we need to grow the BitVector */
38
+ if (bit >= bv->size) {
39
+ bv->size = bit + 1; /* size is max range of bits set */
40
+ if (word >= bv->capa) {
41
+ int capa = bv->capa << 1;
42
+ while (capa <= word) {
43
+ capa <<= 1;
44
+ }
45
+ REALLOC_N(bv->bits, f_u32, capa);
46
+ memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
47
+ sizeof(f_u32) * (capa - bv->capa));
48
+ bv->capa = capa;
49
+ }
42
50
  }
43
- }
44
- byte_p = &(bv->bits[byte]);
45
- if ((bitmask & *byte_p) == 0) {
51
+
52
+ /* Set the required bit */
53
+ word_p = &(bv->bits[word]);
54
+ if ((bitmask & *word_p) == 0) {
55
+ bv->count++; /* update count */
56
+ *word_p |= bitmask;
57
+ }
58
+ }
59
+
60
+ /*
61
+ * This method relies on the fact that enough space has been set for the bits
62
+ * to be set. You need to create the BitVector using bv_new_capa(capa) with
63
+ * a capacity larger than any bit being set.
64
+ */
65
+ void bv_set_fast(BitVector * bv, int bit)
66
+ {
46
67
  bv->count++;
47
- *byte_p |= bitmask;
48
- }
68
+ bv->size = bit;
69
+ bv->bits[bit >> 5] |= 1 << (bit & 31);
49
70
  }
50
71
 
51
- int bv_get(BitVector *bv, int bit)
72
+ int bv_get(BitVector * bv, int bit)
52
73
  {
53
- int byte = bit>>3;
54
- if (byte >= bv->size) return 0;
55
- return (bv->bits[byte]>>(bit&7))&1;
74
+ /* out of range so return 0 because it can't have been set */
75
+ if (bit >= bv->size) {
76
+ return bv->extends_as_ones;
77
+ }
78
+ return (bv->bits[bit >> 5] >> (bit & 31)) & 0x01;
56
79
  }
57
80
 
58
- void bv_clear(BitVector *bv)
81
+ void bv_clear(BitVector * bv)
59
82
  {
60
- memset(bv->bits, 0, bv->size);
61
- bv->count = 0;
83
+ memset(bv->bits, 0, bv->capa * sizeof(f_u32));
84
+ bv->extends_as_ones = 0;
85
+ bv->count = 0;
86
+ bv->size = 0;
62
87
  }
63
88
 
64
- void bv_unset(BitVector *bv, int bit)
89
+ /*
90
+ * FIXME: if the top set bit is unset, size is not adjusted. This will not
91
+ * cause any bugs in this code but could cause problems if users are relying
92
+ * on the fact that size is accurate.
93
+ */
94
+ void bv_unset(BitVector * bv, int bit)
95
+ {
96
+ f_u32 *word_p;
97
+ f_u32 bitmask;
98
+ int word = bit >> 5;
99
+
100
+ if (bit >= bv->size) {
101
+ bv->size = bit + 1; /* size is max range of bits set */
102
+ if (word >= bv->capa) {
103
+ int capa = bv->capa << 1;
104
+
105
+ while (capa <= word) {
106
+ capa <<= 1;
107
+ }
108
+ REALLOC_N(bv->bits, f_u32, capa);
109
+ memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
110
+ sizeof(f_u32) * (capa - bv->capa));
111
+ bv->capa = capa;
112
+ }
113
+ }
114
+
115
+ word_p = &(bv->bits[word]);
116
+ bitmask = 1 << (bit & 31);
117
+ if ((bitmask & *word_p) > 0) {
118
+ bv->count--; /* update count */
119
+ *word_p &= ~bitmask;
120
+ }
121
+ }
122
+
123
+ /* Table of bits per char. This table is used by the bv_recount method to
124
+ * optimize the counting of bits */
125
+ static const uchar BYTE_COUNTS[] = {
126
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
127
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
128
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
129
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
130
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
131
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
132
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
133
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
134
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
135
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
136
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
137
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
138
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
139
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
140
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
141
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
142
+ };
143
+
144
+ int bv_recount(BitVector * bv)
65
145
  {
66
- uchar *byte_p;
67
- uchar bitmask;
68
- int byte = bit>>3;
69
-
70
- if (byte >= bv->size) return;
71
-
72
- byte_p = &(bv->bits[byte]);
73
- bitmask = 1<<(bit&7);
74
- if ((bitmask & *byte_p) > 0) {
75
- bv->count--;
76
- *byte_p &= ~bitmask;
77
- }
146
+ /* if the vector has been modified */
147
+ int i, c = 0;
148
+ uchar *bytes = (uchar *)bv->bits; /* count by character */
149
+ const int num_bytes = (((bv->size >> 5) + 1) << 2);
150
+ if (bv->extends_as_ones) {
151
+ for (i = 0; i < num_bytes; i++) {
152
+ c += BYTE_COUNTS[~(bytes[i]) & 0xFF]; /* sum bits per char */
153
+ }
154
+ }
155
+ else {
156
+ for (i = 0; i < num_bytes; i++) {
157
+ c += BYTE_COUNTS[bytes[i]]; /* sum bits per char */
158
+ }
159
+ }
160
+ bv->count = c;
161
+ return c;
78
162
  }
79
163
 
80
- void bv_write(BitVector *bv, Store *store, char *name)
164
+ void bv_scan_reset(BitVector * bv)
81
165
  {
82
- OutStream *os = store->create_output(store, name);
83
- os_write_vint(os, bv->size);
84
- os_write_bytes(os, bv->bits, bv->size);
85
- os_close(os);
166
+ bv->curr_bit = -1;
86
167
  }
87
168
 
88
- const uchar BYTE_COUNTS[] = { // table of bits/char
89
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
90
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
91
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
98
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
99
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
100
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
101
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
104
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
169
+ /* Table showing the number of trailing 0s in a char. This is used to optimize
170
+ * the bv_scan_next method. */
171
+ const int NUM_TRAILING_ZEROS[] = {
172
+ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
173
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
174
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
175
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
176
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
177
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
178
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
179
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
180
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
181
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
182
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
183
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
184
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
185
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
186
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
187
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
105
188
  };
106
189
 
107
- int bv_count(BitVector *bv)
190
+ /*
191
+ * This method is highly optimized, hence the loop unrolling
192
+ */
193
+ static inline int bv_get_1_offset(f_u32 word)
194
+ {
195
+ if (word & 0xff) {
196
+ return NUM_TRAILING_ZEROS[word & 0xff];
197
+ }
198
+ else {
199
+ word >>= 8;
200
+ if (word & 0xff) {
201
+ return NUM_TRAILING_ZEROS[word & 0xff] + 8;
202
+ }
203
+ else {
204
+ word >>= 8;
205
+ if (word & 0xff) {
206
+ return NUM_TRAILING_ZEROS[word & 0xff] + 16;
207
+ }
208
+ else {
209
+ word >>= 8;
210
+ return NUM_TRAILING_ZEROS[word & 0xff] + 24;
211
+ }
212
+ }
213
+ }
214
+ }
215
+ /*
216
+ * second fastest;
217
+ *
218
+ * while ((inc = NUM_TRAILING_ZEROS[word & 0xff]) == 8) {
219
+ * word >>= 8;
220
+ * bit_pos += 8;
221
+ * }
222
+ *
223
+ * third fastest;
224
+ *
225
+ * bit_pos += inc;
226
+ * if ((word & 0xffff) == 0) {
227
+ * bit_pos += 16;
228
+ * word >>= 16;
229
+ * }
230
+ * if ((word & 0xff) == 0) {
231
+ * bit_pos += 8;
232
+ * word >>= 8;
233
+ * }
234
+ * bit_pos += NUM_TRAILING_ZEROS[word & 0xff];
235
+ */
236
+
237
+ int bv_scan_next_from(BitVector * bv, register const int from)
238
+ {
239
+ register const f_u32 *const bits = bv->bits;
240
+ register const int word_size = (bv->size >> 5) + 1;
241
+ register int word_pos = from >> 5;
242
+ register int bit_pos = (from & 31);
243
+ register f_u32 word = bits[word_pos] >> bit_pos;
244
+
245
+ if (from >= bv->size) {
246
+ return -1;
247
+ }
248
+ if (word == 0) {
249
+ bit_pos = 0;
250
+ do {
251
+ word_pos++;
252
+ if (word_pos >= word_size) {
253
+ return -1;
254
+ }
255
+ } while (bits[word_pos] == 0);
256
+ word = bits[word_pos];
257
+ }
258
+
259
+ /* check the word a byte at a time as the NUM_TRAILING_ZEROS table would
260
+ * be too large for 32-bit integer or even a 16-bit integer */
261
+ bit_pos += bv_get_1_offset(word);
262
+
263
+ return bv->curr_bit = ((word_pos << 5) + bit_pos);
264
+ }
265
+
266
+ int bv_scan_next(BitVector * bv)
267
+ {
268
+ return bv_scan_next_from(bv, bv->curr_bit + 1);
269
+ }
270
+
271
+ int bv_scan_next_unset_from(BitVector * bv, register const int from)
272
+ {
273
+ register const f_u32 *const bits = bv->bits;
274
+ register const int word_size = (bv->size >> 5) + 1;
275
+ register int word_pos = from >> 5;
276
+ register int bit_pos = (from & 31);
277
+ register f_u32 word = ~(~(bits[word_pos]) >> bit_pos);
278
+
279
+ if (from >= bv->size) {
280
+ return -1;
281
+ }
282
+ if (word == 0xFFFFFFFF) {
283
+ bit_pos = 0;
284
+ do {
285
+ word_pos++;
286
+ if (word_pos >= word_size) {
287
+ return -1;
288
+ }
289
+ } while (bits[word_pos] == 0xFFFFFFFF);
290
+ word = bits[word_pos];
291
+ }
292
+
293
+ bit_pos += bv_get_1_offset(~word);
294
+
295
+ return bv->curr_bit = ((word_pos << 5) + bit_pos);
296
+ }
297
+
298
+ int bv_scan_next_unset(BitVector * bv)
299
+ {
300
+ return bv_scan_next_unset_from(bv, bv->curr_bit + 1);
301
+ }
302
+
303
+ int bv_eq(BitVector *bv1, BitVector *bv2)
304
+ {
305
+ if (bv1 == bv2) {
306
+ return true;
307
+ }
308
+ else if (bv1->extends_as_ones != bv2->extends_as_ones) {
309
+ return false;
310
+ }
311
+ else {
312
+ f_u32 *bits = bv1->bits;
313
+ f_u32 *bits2 = bv2->bits;
314
+ int min_size = min2(bv1->size, bv2->size);
315
+ int word_size = (min_size >> 5) + 1;
316
+ int ext_word_size = 0;
317
+
318
+ int i;
319
+
320
+ for (i = 0; i < word_size; i++) {
321
+ if (bits[i] != bits2[i]) {
322
+ return false;
323
+ }
324
+ }
325
+ if (bv1->size > min_size) {
326
+ bits = bv1->bits;
327
+ ext_word_size = (bv1->size >> 5) + 1;
328
+ }
329
+ else if (bv2->size > min_size) {
330
+ bits = bv2->bits;
331
+ ext_word_size = (bv2->size >> 5) + 1;
332
+ }
333
+ if (ext_word_size) {
334
+ const f_u32 expected = (bv1->extends_as_ones ? 0xFFFFFFFF : 0);
335
+ for (i = word_size; i < ext_word_size; i++) {
336
+ if (bits[i] != expected) {
337
+ return false;
338
+ }
339
+ }
340
+ }
341
+ }
342
+ return true;
343
+ }
344
+
345
+ ulong bv_hash(BitVector *bv)
108
346
  {
109
- // if the vector has been modified
110
- int i, c = 0;
111
- uchar *bytes = bv->bits;
112
- for (i = 0; i < bv->size; i++)
113
- c += BYTE_COUNTS[bytes[i]]; // sum bits per char
114
- bv->count = c;
115
- return c;
347
+ ulong hash = 0;
348
+ const f_u32 empty_word = bv->extends_as_ones ? 0xFFFFFFFF : 0;
349
+ int i;
350
+ for (i = (bv->size >> 5); i >= 0; i--) {
351
+ const f_u32 word = bv->bits[i];
352
+ if (word != empty_word) {
353
+ hash = (hash << 1) ^ word;
354
+ }
355
+ }
356
+ hash = (hash << 1) | bv->extends_as_ones;
357
+ return hash;
358
+ }
359
+
360
+ static BitVector *bv_and_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
361
+ {
362
+ int i;
363
+ int min_size = min2(bv1->size, bv2->size);
364
+ int word_size = (min_size >> 5) + 1;
365
+ int capa = 4;
366
+ while (capa < word_size) {
367
+ capa <<= 1;
368
+ }
369
+ REALLOC_N(bv->bits, f_u32, capa);
370
+ bv->capa = capa;
371
+ bv->size = min_size;
372
+
373
+ if (bv1->extends_as_ones && bv2->extends_as_ones) {
374
+ bv->extends_as_ones = true;
375
+ }
376
+ else {
377
+ bv->extends_as_ones = false;
378
+ }
379
+
380
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
381
+ sizeof(f_u32) * (capa - word_size));
382
+
383
+ for (i = 0; i < word_size; i++) {
384
+ bv->bits[i] = bv1->bits[i] & bv2->bits[i];
385
+ }
386
+ bv_recount(bv);
387
+ return bv;
388
+ }
389
+
390
+ BitVector *bv_and(BitVector *bv1, BitVector *bv2)
391
+ {
392
+ return bv_and_i(bv_new(), bv1, bv2);
116
393
  }
117
394
 
118
- BitVector *bv_read(Store *store, char *name)
395
+ BitVector *bv_and_x(BitVector *bv1, BitVector *bv2)
119
396
  {
120
- BitVector *bv = ALLOC(BitVector);
121
- InStream *is = store->open_input(store, name);
122
- bv->capa = bv->size = (int)is_read_vint(is);
123
- bv->bits = ALLOC_N(uchar, bv->capa);
124
- is_read_bytes(is, bv->bits, 0, bv->size);
125
- is_close(is);
126
- bv_count(bv);
127
- return bv;
397
+ return bv_and_i(bv1, bv1, bv2);
128
398
  }
129
399
 
130
- void bv_scan_reset(BitVector *bv)
400
+ static inline void bv_recapa(BitVector *bv, int new_capa)
131
401
  {
132
- bv->curr_bit = -1;
402
+ if (bv->capa < new_capa) {
403
+ REALLOC_N(bv->bits, f_u32, new_capa);
404
+ memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
405
+ sizeof(f_u32) * (new_capa - bv->capa));
406
+ bv->capa = new_capa;
407
+ }
133
408
  }
134
409
 
135
- inline int bv_scan_next_from(BitVector *bv, register const int from)
410
+ static BitVector *bv_or_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
136
411
  {
137
- register const uchar *const bits = bv->bits;
138
- register const int size = bv->size;
139
- register int byte_pos = (from) >> 3;
140
- register int inc = ((from) & 7);
141
- register int bit = 1 << inc;
142
- register int mask = 0xff << inc;
143
- register int byte;
412
+ int i;
413
+ int max_size = max2(bv1->size, bv2->size);
414
+ int word_size = (max_size >> 5) + 1;
415
+ int capa = 4;
416
+ while (capa < word_size) {
417
+ capa <<= 1;
418
+ }
419
+ REALLOC_N(bv->bits, f_u32, capa);
420
+ bv->capa = capa;
421
+ bv->size = max_size;
144
422
 
145
- if (byte_pos >= size) return -1;
146
- if ((bits[byte_pos]&mask) == 0) {
147
- inc = 0;
148
- bit = 1;
149
- do {
150
- byte_pos++;
151
- if (byte_pos >= size) return -1;
152
- } while (bits[byte_pos] == 0);
153
- }
423
+ bv_recapa(bv1, capa);
424
+ bv_recapa(bv2, capa);
154
425
 
155
- byte = bits[byte_pos];
156
- while ((byte & bit) == 0) {
157
- bit <<= 1;
158
- inc++;
159
- }
426
+ if (bv1->extends_as_ones || bv2->extends_as_ones) {
427
+ bv->extends_as_ones = true;
428
+ }
429
+ else {
430
+ bv->extends_as_ones = false;
431
+ }
432
+
433
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
434
+ sizeof(f_u32) * (capa - word_size));
160
435
 
161
- return bv->curr_bit = ((byte_pos << 3) + inc);
436
+ for (i = 0; i < word_size; i++) {
437
+ bv->bits[i] = bv1->bits[i] | bv2->bits[i];
438
+ }
439
+ bv_recount(bv);
440
+ return bv;
162
441
  }
163
442
 
164
- inline int bv_scan_next(BitVector *bv)
443
+ BitVector *bv_or(BitVector *bv1, BitVector *bv2)
165
444
  {
166
- return bv_scan_next_from(bv, bv->curr_bit+1);
445
+ return bv_or_i(bv_new(), bv1, bv2);
167
446
  }
168
447
 
448
+ BitVector *bv_or_x(BitVector *bv1, BitVector *bv2)
449
+ {
450
+ return bv_or_i(bv1, bv1, bv2);
451
+ }
452
+
453
+ static BitVector *bv_xor_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
454
+ {
455
+ int i;
456
+ int max_size = max2(bv1->size, bv2->size);
457
+ int word_size = (max_size >> 5) + 1;
458
+ int capa = 4;
459
+ while (capa < word_size) {
460
+ capa <<= 1;
461
+ }
462
+ REALLOC_N(bv->bits, f_u32, capa);
463
+ bv->capa = capa;
464
+ bv->size = max_size;
465
+
466
+ bv_recapa(bv1, capa);
467
+ bv_recapa(bv2, capa);
468
+
469
+ if (bv1->extends_as_ones != bv2->extends_as_ones) {
470
+ bv->extends_as_ones = true;
471
+ }
472
+ else {
473
+ bv->extends_as_ones = false;
474
+ }
475
+
476
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
477
+ sizeof(f_u32) * (capa - word_size));
478
+
479
+ for (i = 0; i < word_size; i++) {
480
+ bv->bits[i] = bv1->bits[i] ^ bv2->bits[i];
481
+ }
482
+ bv_recount(bv);
483
+ return bv;
484
+ }
485
+
486
+ BitVector *bv_xor(BitVector *bv1, BitVector *bv2)
487
+ {
488
+ return bv_xor_i(bv_new(), bv1, bv2);
489
+ }
490
+
491
+ BitVector *bv_xor_x(BitVector *bv1, BitVector *bv2)
492
+ {
493
+ return bv_xor_i(bv1, bv1, bv2);
494
+ }
495
+
496
+ static BitVector *bv_not_i(BitVector *bv, BitVector *bv1)
497
+ {
498
+ int i;
499
+ int word_size = (bv1->size >> 5) + 1;
500
+ int capa = 4;
501
+ while (capa < word_size) {
502
+ capa <<= 1;
503
+ }
504
+ REALLOC_N(bv->bits, f_u32, capa);
505
+ bv->capa = capa;
506
+ bv->size = bv1->size;
507
+ bv->extends_as_ones = 1 - bv1->extends_as_ones;
508
+ memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
509
+ sizeof(f_u32) * (capa - word_size));
510
+
511
+ for (i = 0; i < word_size; i++) {
512
+ bv->bits[i] = ~(bv1->bits[i]);
513
+ }
514
+ bv_recount(bv);
515
+ return bv;
516
+ }
517
+
518
+ BitVector *bv_not(BitVector *bv1)
519
+ {
520
+ return bv_not_i(bv_new(), bv1);
521
+ }
522
+
523
+ BitVector *bv_not_x(BitVector *bv1)
524
+ {
525
+ return bv_not_i(bv1, bv1);
526
+ }