ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/bitvector.c
CHANGED
@@ -1,168 +1,526 @@
|
|
1
|
-
#include
|
1
|
+
#include "bitvector.h"
|
2
2
|
#include <string.h>
|
3
3
|
|
4
|
-
BitVector *
|
4
|
+
BitVector *bv_new_capa(int capa)
|
5
5
|
{
|
6
|
-
|
6
|
+
BitVector *bv = ALLOC(BitVector);
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
8
|
+
/* The capacity passed by the user is number of bits allowed, however we
|
9
|
+
* store capacity as the number of words (U32) allocated. */
|
10
|
+
bv->capa = (capa >> 5) + 1;
|
11
|
+
bv->bits = ALLOC_AND_ZERO_N(f_u32, bv->capa);
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
bv->size = 0;
|
14
|
+
bv->count = 0;
|
15
|
+
bv->curr_bit = -1;
|
16
|
+
bv->extends_as_ones = 0;
|
17
|
+
return bv;
|
16
18
|
}
|
17
19
|
|
18
|
-
BitVector *
|
20
|
+
BitVector *bv_new()
|
19
21
|
{
|
20
|
-
|
22
|
+
return bv_new_capa(BV_INIT_CAPA);
|
21
23
|
}
|
22
24
|
|
23
|
-
void bv_destroy(BitVector *bv)
|
25
|
+
void bv_destroy(BitVector * bv)
|
24
26
|
{
|
25
|
-
|
26
|
-
|
27
|
+
free(bv->bits);
|
28
|
+
free(bv);
|
27
29
|
}
|
28
30
|
|
29
|
-
void bv_set(BitVector *bv, int bit)
|
31
|
+
void bv_set(BitVector * bv, int bit)
|
30
32
|
{
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
if (
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
33
|
+
f_u32 *word_p;
|
34
|
+
int word = bit >> 5;
|
35
|
+
f_u32 bitmask = 1 << (bit & 31);
|
36
|
+
|
37
|
+
/* Check to see if we need to grow the BitVector */
|
38
|
+
if (bit >= bv->size) {
|
39
|
+
bv->size = bit + 1; /* size is max range of bits set */
|
40
|
+
if (word >= bv->capa) {
|
41
|
+
int capa = bv->capa << 1;
|
42
|
+
while (capa <= word) {
|
43
|
+
capa <<= 1;
|
44
|
+
}
|
45
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
46
|
+
memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
|
47
|
+
sizeof(f_u32) * (capa - bv->capa));
|
48
|
+
bv->capa = capa;
|
49
|
+
}
|
42
50
|
}
|
43
|
-
|
44
|
-
|
45
|
-
|
51
|
+
|
52
|
+
/* Set the required bit */
|
53
|
+
word_p = &(bv->bits[word]);
|
54
|
+
if ((bitmask & *word_p) == 0) {
|
55
|
+
bv->count++; /* update count */
|
56
|
+
*word_p |= bitmask;
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
/*
|
61
|
+
* This method relies on the fact that enough space has been set for the bits
|
62
|
+
* to be set. You need to create the BitVector using bv_new_capa(capa) with
|
63
|
+
* a capacity larger than any bit being set.
|
64
|
+
*/
|
65
|
+
void bv_set_fast(BitVector * bv, int bit)
|
66
|
+
{
|
46
67
|
bv->count++;
|
47
|
-
|
48
|
-
|
68
|
+
bv->size = bit;
|
69
|
+
bv->bits[bit >> 5] |= 1 << (bit & 31);
|
49
70
|
}
|
50
71
|
|
51
|
-
int bv_get(BitVector *bv, int bit)
|
72
|
+
int bv_get(BitVector * bv, int bit)
|
52
73
|
{
|
53
|
-
|
54
|
-
|
55
|
-
|
74
|
+
/* out of range so return 0 because it can't have been set */
|
75
|
+
if (bit >= bv->size) {
|
76
|
+
return bv->extends_as_ones;
|
77
|
+
}
|
78
|
+
return (bv->bits[bit >> 5] >> (bit & 31)) & 0x01;
|
56
79
|
}
|
57
80
|
|
58
|
-
void bv_clear(BitVector *bv)
|
81
|
+
void bv_clear(BitVector * bv)
|
59
82
|
{
|
60
|
-
|
61
|
-
|
83
|
+
memset(bv->bits, 0, bv->capa * sizeof(f_u32));
|
84
|
+
bv->extends_as_ones = 0;
|
85
|
+
bv->count = 0;
|
86
|
+
bv->size = 0;
|
62
87
|
}
|
63
88
|
|
64
|
-
|
89
|
+
/*
|
90
|
+
* FIXME: if the top set bit is unset, size is not adjusted. This will not
|
91
|
+
* cause any bugs in this code but could cause problems if users are relying
|
92
|
+
* on the fact that size is accurate.
|
93
|
+
*/
|
94
|
+
void bv_unset(BitVector * bv, int bit)
|
95
|
+
{
|
96
|
+
f_u32 *word_p;
|
97
|
+
f_u32 bitmask;
|
98
|
+
int word = bit >> 5;
|
99
|
+
|
100
|
+
if (bit >= bv->size) {
|
101
|
+
bv->size = bit + 1; /* size is max range of bits set */
|
102
|
+
if (word >= bv->capa) {
|
103
|
+
int capa = bv->capa << 1;
|
104
|
+
|
105
|
+
while (capa <= word) {
|
106
|
+
capa <<= 1;
|
107
|
+
}
|
108
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
109
|
+
memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
|
110
|
+
sizeof(f_u32) * (capa - bv->capa));
|
111
|
+
bv->capa = capa;
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
word_p = &(bv->bits[word]);
|
116
|
+
bitmask = 1 << (bit & 31);
|
117
|
+
if ((bitmask & *word_p) > 0) {
|
118
|
+
bv->count--; /* update count */
|
119
|
+
*word_p &= ~bitmask;
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
123
|
+
/* Table of bits per char. This table is used by the bv_recount method to
|
124
|
+
* optimize the counting of bits */
|
125
|
+
static const uchar BYTE_COUNTS[] = {
|
126
|
+
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
127
|
+
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
128
|
+
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
129
|
+
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
130
|
+
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
131
|
+
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
132
|
+
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
133
|
+
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
134
|
+
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
135
|
+
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
136
|
+
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
137
|
+
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
138
|
+
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
139
|
+
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
140
|
+
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
141
|
+
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
142
|
+
};
|
143
|
+
|
144
|
+
int bv_recount(BitVector * bv)
|
65
145
|
{
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
146
|
+
/* if the vector has been modified */
|
147
|
+
int i, c = 0;
|
148
|
+
uchar *bytes = (uchar *)bv->bits; /* count by character */
|
149
|
+
const int num_bytes = (((bv->size >> 5) + 1) << 2);
|
150
|
+
if (bv->extends_as_ones) {
|
151
|
+
for (i = 0; i < num_bytes; i++) {
|
152
|
+
c += BYTE_COUNTS[~(bytes[i]) & 0xFF]; /* sum bits per char */
|
153
|
+
}
|
154
|
+
}
|
155
|
+
else {
|
156
|
+
for (i = 0; i < num_bytes; i++) {
|
157
|
+
c += BYTE_COUNTS[bytes[i]]; /* sum bits per char */
|
158
|
+
}
|
159
|
+
}
|
160
|
+
bv->count = c;
|
161
|
+
return c;
|
78
162
|
}
|
79
163
|
|
80
|
-
void
|
164
|
+
void bv_scan_reset(BitVector * bv)
|
81
165
|
{
|
82
|
-
|
83
|
-
os_write_vint(os, bv->size);
|
84
|
-
os_write_bytes(os, bv->bits, bv->size);
|
85
|
-
os_close(os);
|
166
|
+
bv->curr_bit = -1;
|
86
167
|
}
|
87
168
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
169
|
+
/* Table showing the number of trailing 0s in a char. This is used to optimize
|
170
|
+
* the bv_scan_next method. */
|
171
|
+
const int NUM_TRAILING_ZEROS[] = {
|
172
|
+
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
173
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
174
|
+
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
175
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
176
|
+
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
177
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
178
|
+
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
179
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
180
|
+
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
181
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
182
|
+
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
183
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
184
|
+
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
185
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
186
|
+
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
187
|
+
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
|
105
188
|
};
|
106
189
|
|
107
|
-
|
190
|
+
/*
|
191
|
+
* This method is highly optimized, hence the loop unrolling
|
192
|
+
*/
|
193
|
+
static inline int bv_get_1_offset(f_u32 word)
|
194
|
+
{
|
195
|
+
if (word & 0xff) {
|
196
|
+
return NUM_TRAILING_ZEROS[word & 0xff];
|
197
|
+
}
|
198
|
+
else {
|
199
|
+
word >>= 8;
|
200
|
+
if (word & 0xff) {
|
201
|
+
return NUM_TRAILING_ZEROS[word & 0xff] + 8;
|
202
|
+
}
|
203
|
+
else {
|
204
|
+
word >>= 8;
|
205
|
+
if (word & 0xff) {
|
206
|
+
return NUM_TRAILING_ZEROS[word & 0xff] + 16;
|
207
|
+
}
|
208
|
+
else {
|
209
|
+
word >>= 8;
|
210
|
+
return NUM_TRAILING_ZEROS[word & 0xff] + 24;
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
}
|
215
|
+
/*
|
216
|
+
* second fastest;
|
217
|
+
*
|
218
|
+
* while ((inc = NUM_TRAILING_ZEROS[word & 0xff]) == 8) {
|
219
|
+
* word >>= 8;
|
220
|
+
* bit_pos += 8;
|
221
|
+
* }
|
222
|
+
*
|
223
|
+
* third fastest;
|
224
|
+
*
|
225
|
+
* bit_pos += inc;
|
226
|
+
* if ((word & 0xffff) == 0) {
|
227
|
+
* bit_pos += 16;
|
228
|
+
* word >>= 16;
|
229
|
+
* }
|
230
|
+
* if ((word & 0xff) == 0) {
|
231
|
+
* bit_pos += 8;
|
232
|
+
* word >>= 8;
|
233
|
+
* }
|
234
|
+
* bit_pos += NUM_TRAILING_ZEROS[word & 0xff];
|
235
|
+
*/
|
236
|
+
|
237
|
+
int bv_scan_next_from(BitVector * bv, register const int from)
|
238
|
+
{
|
239
|
+
register const f_u32 *const bits = bv->bits;
|
240
|
+
register const int word_size = (bv->size >> 5) + 1;
|
241
|
+
register int word_pos = from >> 5;
|
242
|
+
register int bit_pos = (from & 31);
|
243
|
+
register f_u32 word = bits[word_pos] >> bit_pos;
|
244
|
+
|
245
|
+
if (from >= bv->size) {
|
246
|
+
return -1;
|
247
|
+
}
|
248
|
+
if (word == 0) {
|
249
|
+
bit_pos = 0;
|
250
|
+
do {
|
251
|
+
word_pos++;
|
252
|
+
if (word_pos >= word_size) {
|
253
|
+
return -1;
|
254
|
+
}
|
255
|
+
} while (bits[word_pos] == 0);
|
256
|
+
word = bits[word_pos];
|
257
|
+
}
|
258
|
+
|
259
|
+
/* check the word a byte at a time as the NUM_TRAILING_ZEROS table would
|
260
|
+
* be too large for 32-bit integer or even a 16-bit integer */
|
261
|
+
bit_pos += bv_get_1_offset(word);
|
262
|
+
|
263
|
+
return bv->curr_bit = ((word_pos << 5) + bit_pos);
|
264
|
+
}
|
265
|
+
|
266
|
+
int bv_scan_next(BitVector * bv)
|
267
|
+
{
|
268
|
+
return bv_scan_next_from(bv, bv->curr_bit + 1);
|
269
|
+
}
|
270
|
+
|
271
|
+
int bv_scan_next_unset_from(BitVector * bv, register const int from)
|
272
|
+
{
|
273
|
+
register const f_u32 *const bits = bv->bits;
|
274
|
+
register const int word_size = (bv->size >> 5) + 1;
|
275
|
+
register int word_pos = from >> 5;
|
276
|
+
register int bit_pos = (from & 31);
|
277
|
+
register f_u32 word = ~(~(bits[word_pos]) >> bit_pos);
|
278
|
+
|
279
|
+
if (from >= bv->size) {
|
280
|
+
return -1;
|
281
|
+
}
|
282
|
+
if (word == 0xFFFFFFFF) {
|
283
|
+
bit_pos = 0;
|
284
|
+
do {
|
285
|
+
word_pos++;
|
286
|
+
if (word_pos >= word_size) {
|
287
|
+
return -1;
|
288
|
+
}
|
289
|
+
} while (bits[word_pos] == 0xFFFFFFFF);
|
290
|
+
word = bits[word_pos];
|
291
|
+
}
|
292
|
+
|
293
|
+
bit_pos += bv_get_1_offset(~word);
|
294
|
+
|
295
|
+
return bv->curr_bit = ((word_pos << 5) + bit_pos);
|
296
|
+
}
|
297
|
+
|
298
|
+
int bv_scan_next_unset(BitVector * bv)
|
299
|
+
{
|
300
|
+
return bv_scan_next_unset_from(bv, bv->curr_bit + 1);
|
301
|
+
}
|
302
|
+
|
303
|
+
int bv_eq(BitVector *bv1, BitVector *bv2)
|
304
|
+
{
|
305
|
+
if (bv1 == bv2) {
|
306
|
+
return true;
|
307
|
+
}
|
308
|
+
else if (bv1->extends_as_ones != bv2->extends_as_ones) {
|
309
|
+
return false;
|
310
|
+
}
|
311
|
+
else {
|
312
|
+
f_u32 *bits = bv1->bits;
|
313
|
+
f_u32 *bits2 = bv2->bits;
|
314
|
+
int min_size = min2(bv1->size, bv2->size);
|
315
|
+
int word_size = (min_size >> 5) + 1;
|
316
|
+
int ext_word_size = 0;
|
317
|
+
|
318
|
+
int i;
|
319
|
+
|
320
|
+
for (i = 0; i < word_size; i++) {
|
321
|
+
if (bits[i] != bits2[i]) {
|
322
|
+
return false;
|
323
|
+
}
|
324
|
+
}
|
325
|
+
if (bv1->size > min_size) {
|
326
|
+
bits = bv1->bits;
|
327
|
+
ext_word_size = (bv1->size >> 5) + 1;
|
328
|
+
}
|
329
|
+
else if (bv2->size > min_size) {
|
330
|
+
bits = bv2->bits;
|
331
|
+
ext_word_size = (bv2->size >> 5) + 1;
|
332
|
+
}
|
333
|
+
if (ext_word_size) {
|
334
|
+
const f_u32 expected = (bv1->extends_as_ones ? 0xFFFFFFFF : 0);
|
335
|
+
for (i = word_size; i < ext_word_size; i++) {
|
336
|
+
if (bits[i] != expected) {
|
337
|
+
return false;
|
338
|
+
}
|
339
|
+
}
|
340
|
+
}
|
341
|
+
}
|
342
|
+
return true;
|
343
|
+
}
|
344
|
+
|
345
|
+
ulong bv_hash(BitVector *bv)
|
108
346
|
{
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
347
|
+
ulong hash = 0;
|
348
|
+
const f_u32 empty_word = bv->extends_as_ones ? 0xFFFFFFFF : 0;
|
349
|
+
int i;
|
350
|
+
for (i = (bv->size >> 5); i >= 0; i--) {
|
351
|
+
const f_u32 word = bv->bits[i];
|
352
|
+
if (word != empty_word) {
|
353
|
+
hash = (hash << 1) ^ word;
|
354
|
+
}
|
355
|
+
}
|
356
|
+
hash = (hash << 1) | bv->extends_as_ones;
|
357
|
+
return hash;
|
358
|
+
}
|
359
|
+
|
360
|
+
static BitVector *bv_and_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
361
|
+
{
|
362
|
+
int i;
|
363
|
+
int min_size = min2(bv1->size, bv2->size);
|
364
|
+
int word_size = (min_size >> 5) + 1;
|
365
|
+
int capa = 4;
|
366
|
+
while (capa < word_size) {
|
367
|
+
capa <<= 1;
|
368
|
+
}
|
369
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
370
|
+
bv->capa = capa;
|
371
|
+
bv->size = min_size;
|
372
|
+
|
373
|
+
if (bv1->extends_as_ones && bv2->extends_as_ones) {
|
374
|
+
bv->extends_as_ones = true;
|
375
|
+
}
|
376
|
+
else {
|
377
|
+
bv->extends_as_ones = false;
|
378
|
+
}
|
379
|
+
|
380
|
+
memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
|
381
|
+
sizeof(f_u32) * (capa - word_size));
|
382
|
+
|
383
|
+
for (i = 0; i < word_size; i++) {
|
384
|
+
bv->bits[i] = bv1->bits[i] & bv2->bits[i];
|
385
|
+
}
|
386
|
+
bv_recount(bv);
|
387
|
+
return bv;
|
388
|
+
}
|
389
|
+
|
390
|
+
BitVector *bv_and(BitVector *bv1, BitVector *bv2)
|
391
|
+
{
|
392
|
+
return bv_and_i(bv_new(), bv1, bv2);
|
116
393
|
}
|
117
394
|
|
118
|
-
BitVector *
|
395
|
+
BitVector *bv_and_x(BitVector *bv1, BitVector *bv2)
|
119
396
|
{
|
120
|
-
|
121
|
-
InStream *is = store->open_input(store, name);
|
122
|
-
bv->capa = bv->size = (int)is_read_vint(is);
|
123
|
-
bv->bits = ALLOC_N(uchar, bv->capa);
|
124
|
-
is_read_bytes(is, bv->bits, 0, bv->size);
|
125
|
-
is_close(is);
|
126
|
-
bv_count(bv);
|
127
|
-
return bv;
|
397
|
+
return bv_and_i(bv1, bv1, bv2);
|
128
398
|
}
|
129
399
|
|
130
|
-
void
|
400
|
+
static inline void bv_recapa(BitVector *bv, int new_capa)
|
131
401
|
{
|
132
|
-
|
402
|
+
if (bv->capa < new_capa) {
|
403
|
+
REALLOC_N(bv->bits, f_u32, new_capa);
|
404
|
+
memset(bv->bits + bv->capa, (bv->extends_as_ones ? 0xFF : 0),
|
405
|
+
sizeof(f_u32) * (new_capa - bv->capa));
|
406
|
+
bv->capa = new_capa;
|
407
|
+
}
|
133
408
|
}
|
134
409
|
|
135
|
-
|
410
|
+
static BitVector *bv_or_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
136
411
|
{
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
412
|
+
int i;
|
413
|
+
int max_size = max2(bv1->size, bv2->size);
|
414
|
+
int word_size = (max_size >> 5) + 1;
|
415
|
+
int capa = 4;
|
416
|
+
while (capa < word_size) {
|
417
|
+
capa <<= 1;
|
418
|
+
}
|
419
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
420
|
+
bv->capa = capa;
|
421
|
+
bv->size = max_size;
|
144
422
|
|
145
|
-
|
146
|
-
|
147
|
-
inc = 0;
|
148
|
-
bit = 1;
|
149
|
-
do {
|
150
|
-
byte_pos++;
|
151
|
-
if (byte_pos >= size) return -1;
|
152
|
-
} while (bits[byte_pos] == 0);
|
153
|
-
}
|
423
|
+
bv_recapa(bv1, capa);
|
424
|
+
bv_recapa(bv2, capa);
|
154
425
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
426
|
+
if (bv1->extends_as_ones || bv2->extends_as_ones) {
|
427
|
+
bv->extends_as_ones = true;
|
428
|
+
}
|
429
|
+
else {
|
430
|
+
bv->extends_as_ones = false;
|
431
|
+
}
|
432
|
+
|
433
|
+
memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
|
434
|
+
sizeof(f_u32) * (capa - word_size));
|
160
435
|
|
161
|
-
|
436
|
+
for (i = 0; i < word_size; i++) {
|
437
|
+
bv->bits[i] = bv1->bits[i] | bv2->bits[i];
|
438
|
+
}
|
439
|
+
bv_recount(bv);
|
440
|
+
return bv;
|
162
441
|
}
|
163
442
|
|
164
|
-
|
443
|
+
BitVector *bv_or(BitVector *bv1, BitVector *bv2)
|
165
444
|
{
|
166
|
-
|
445
|
+
return bv_or_i(bv_new(), bv1, bv2);
|
167
446
|
}
|
168
447
|
|
448
|
+
BitVector *bv_or_x(BitVector *bv1, BitVector *bv2)
|
449
|
+
{
|
450
|
+
return bv_or_i(bv1, bv1, bv2);
|
451
|
+
}
|
452
|
+
|
453
|
+
static BitVector *bv_xor_i(BitVector *bv, BitVector *bv1, BitVector *bv2)
|
454
|
+
{
|
455
|
+
int i;
|
456
|
+
int max_size = max2(bv1->size, bv2->size);
|
457
|
+
int word_size = (max_size >> 5) + 1;
|
458
|
+
int capa = 4;
|
459
|
+
while (capa < word_size) {
|
460
|
+
capa <<= 1;
|
461
|
+
}
|
462
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
463
|
+
bv->capa = capa;
|
464
|
+
bv->size = max_size;
|
465
|
+
|
466
|
+
bv_recapa(bv1, capa);
|
467
|
+
bv_recapa(bv2, capa);
|
468
|
+
|
469
|
+
if (bv1->extends_as_ones != bv2->extends_as_ones) {
|
470
|
+
bv->extends_as_ones = true;
|
471
|
+
}
|
472
|
+
else {
|
473
|
+
bv->extends_as_ones = false;
|
474
|
+
}
|
475
|
+
|
476
|
+
memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
|
477
|
+
sizeof(f_u32) * (capa - word_size));
|
478
|
+
|
479
|
+
for (i = 0; i < word_size; i++) {
|
480
|
+
bv->bits[i] = bv1->bits[i] ^ bv2->bits[i];
|
481
|
+
}
|
482
|
+
bv_recount(bv);
|
483
|
+
return bv;
|
484
|
+
}
|
485
|
+
|
486
|
+
BitVector *bv_xor(BitVector *bv1, BitVector *bv2)
|
487
|
+
{
|
488
|
+
return bv_xor_i(bv_new(), bv1, bv2);
|
489
|
+
}
|
490
|
+
|
491
|
+
BitVector *bv_xor_x(BitVector *bv1, BitVector *bv2)
|
492
|
+
{
|
493
|
+
return bv_xor_i(bv1, bv1, bv2);
|
494
|
+
}
|
495
|
+
|
496
|
+
static BitVector *bv_not_i(BitVector *bv, BitVector *bv1)
|
497
|
+
{
|
498
|
+
int i;
|
499
|
+
int word_size = (bv1->size >> 5) + 1;
|
500
|
+
int capa = 4;
|
501
|
+
while (capa < word_size) {
|
502
|
+
capa <<= 1;
|
503
|
+
}
|
504
|
+
REALLOC_N(bv->bits, f_u32, capa);
|
505
|
+
bv->capa = capa;
|
506
|
+
bv->size = bv1->size;
|
507
|
+
bv->extends_as_ones = 1 - bv1->extends_as_ones;
|
508
|
+
memset(bv->bits + word_size, (bv->extends_as_ones ? 0xFF : 0),
|
509
|
+
sizeof(f_u32) * (capa - word_size));
|
510
|
+
|
511
|
+
for (i = 0; i < word_size; i++) {
|
512
|
+
bv->bits[i] = ~(bv1->bits[i]);
|
513
|
+
}
|
514
|
+
bv_recount(bv);
|
515
|
+
return bv;
|
516
|
+
}
|
517
|
+
|
518
|
+
BitVector *bv_not(BitVector *bv1)
|
519
|
+
{
|
520
|
+
return bv_not_i(bv_new(), bv1);
|
521
|
+
}
|
522
|
+
|
523
|
+
BitVector *bv_not_x(BitVector *bv1)
|
524
|
+
{
|
525
|
+
return bv_not_i(bv1, bv1);
|
526
|
+
}
|