ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/analysis.h
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
#ifndef FRT_ANALYSIS_H
|
2
|
+
#define FRT_ANALYSIS_H
|
3
|
+
|
4
|
+
#include <global.h>
|
5
|
+
#include <hash.h>
|
6
|
+
|
7
|
+
/****************************************************************************
|
8
|
+
*
|
9
|
+
* Token
|
10
|
+
*
|
11
|
+
****************************************************************************/
|
12
|
+
|
13
|
+
/* A single token as passed between token streams. The text is stored inline
 * in a fixed-size buffer of MAX_WORD_SIZE bytes. */
typedef struct Token {
  char text[MAX_WORD_SIZE]; /* token text, held inside the struct itself */
  int start;                /* NOTE(review): presumably start offset in the source text - confirm */
  int end;                  /* NOTE(review): presumably end offset in the source text - confirm */
  int pos_inc;              /* NOTE(review): presumably position increment vs. previous token - confirm */
} Token;
|
19
|
+
|
20
|
+
Token *tk_create();
|
21
|
+
void tk_destroy(void *p);
|
22
|
+
Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc);
|
23
|
+
Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc);
|
24
|
+
int tk_eq(Token *tk1, Token *tk2);
|
25
|
+
int tk_cmp(Token *tk1, Token *tk2);
|
26
|
+
|
27
|
+
/****************************************************************************
|
28
|
+
*
|
29
|
+
* TokenStream
|
30
|
+
*
|
31
|
+
****************************************************************************/
|
32
|
+
|
33
|
+
/* A token source whose behaviour is supplied through function pointers.
 * Use the ts_next() / ts_destroy() macros to dispatch through them. */
typedef struct TokenStream TokenStream;
struct TokenStream {
  void *data;                                 /* implementation-specific state */
  char *text;                                 /* NOTE(review): presumably the text being tokenized - confirm */
  int pos;                                    /* NOTE(review): presumably current offset into text - confirm */
  Token *token;                               /* NOTE(review): presumably the token handed back by next - confirm */
  Token *(*next)(TokenStream *ts);            /* called by ts_next() */
  void (*reset)(TokenStream *ts, char *text); /* takes the stream and a text pointer */
  void (*destroy)(void *p);                   /* called by ts_destroy() */
  TokenStream *sub_ts;    // used by filters
};
|
44
|
+
|
45
|
+
/* Dispatch through a TokenStream's function pointers.
 * The argument is parenthesized so that any pointer-valued expression
 * (e.g. `*pp` or `arr + 1`) can be passed. It is still evaluated twice, so
 * it must not have side effects. */
#define ts_next(mts) ((mts)->next(mts))
#define ts_destroy(mts) ((mts)->destroy(mts))
|
47
|
+
|
48
|
+
TokenStream *whitespace_tokenizer_create();
|
49
|
+
TokenStream *letter_tokenizer_create();
|
50
|
+
TokenStream *standard_tokenizer_create();
|
51
|
+
TokenStream *lowercase_filter_create(TokenStream *ts);
|
52
|
+
TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len);
|
53
|
+
TokenStream *stop_filter_create(TokenStream *ts);
|
54
|
+
|
55
|
+
/****************************************************************************
|
56
|
+
*
|
57
|
+
* Analyzer
|
58
|
+
*
|
59
|
+
****************************************************************************/
|
60
|
+
|
61
|
+
/* An analyzer hands out TokenStreams for a (field, text) pair through its
 * get_ts function pointer. Use a_get_ts() / a_destroy() to dispatch. */
typedef struct Analyzer {
  void *data;                /* implementation-specific state */
  TokenStream *current_ts;   /* NOTE(review): presumably the stream most recently returned by get_ts - confirm */
  TokenStream *(*get_ts)(struct Analyzer *a, char *field, char *text); /* called by a_get_ts() */
  void (*destroy)(void *p);  /* called by a_destroy() */
} Analyzer;
|
67
|
+
|
68
|
+
/* Dispatch through an Analyzer's function pointers.
 * All arguments are parenthesized so arbitrary expressions can be passed;
 * `ma` is evaluated twice, so it must not have side effects. */
#define a_destroy(ma) ((ma)->destroy(ma))
#define a_get_ts(ma, field, text) ((ma)->get_ts((ma), (field), (text)))
|
70
|
+
|
71
|
+
Analyzer *whitespace_analyzer_create();
|
72
|
+
Analyzer *letter_analyzer_create();
|
73
|
+
Analyzer *standard_analyzer_create();
|
74
|
+
Analyzer *standard_analyzer_create_with_words(char **words, int len);
|
75
|
+
|
76
|
+
#endif
|
data/ext/array.c
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#include <global.h>
|
2
|
+
#include <array.h>
|
3
|
+
#include <string.h>
|
4
|
+
|
5
|
+
Array *ary_create(int allocate, void (*free_elem)(void *p))
|
6
|
+
{
|
7
|
+
Array *ary = ALLOC(Array);
|
8
|
+
if (allocate == 0) {
|
9
|
+
ary->elems = NULL;
|
10
|
+
} else {
|
11
|
+
ary->elems = ALLOC_N(void *, allocate);
|
12
|
+
memset(ary->elems, 0, sizeof(void *) * allocate);
|
13
|
+
}
|
14
|
+
ary->size = 0;
|
15
|
+
ary->allocated = allocate;
|
16
|
+
ary->free_elem = free_elem;
|
17
|
+
|
18
|
+
return ary;
|
19
|
+
}
|
20
|
+
|
21
|
+
void ary_destroy(void *p)
|
22
|
+
{
|
23
|
+
Array *ary = (Array *)p;
|
24
|
+
int i;
|
25
|
+
for (i = 0; i < ary->size; i++) {
|
26
|
+
if (ary->free_elem != NULL && ary->elems[i] != NULL)
|
27
|
+
ary->free_elem(ary->elems[i]);
|
28
|
+
}
|
29
|
+
free(ary->elems);
|
30
|
+
free(ary);
|
31
|
+
}
|
32
|
+
|
33
|
+
/*
 * Store `value` at `index`, growing the array as needed.
 *
 * When `index` is beyond the allocated slots the storage is resized to
 * (index + 1) * 2 slots and everything from the current size upward is
 * zeroed. `size` is raised to cover `index`. An existing non-NULL element
 * at `index` is released through free_elem (when set) before being replaced.
 */
void ary_set(Array *ary, int index, void *value)
{
  void *previous;

  if (ary->allocated <= index) {
    ary->allocated = (index + 1)*2;
    REALLOC_N(ary->elems, void *, (ary->allocated));
    memset((&ary->elems[ary->size]), 0,
           sizeof(void *) * (ary->allocated - ary->size));
  }

  if (ary->size <= index) {
    ary->size = index + 1;
  }

  previous = ary->elems[index];
  if (previous != NULL && ary->free_elem != NULL) {
    ary->free_elem(previous);
  }

  ary->elems[index] = value;
}
|
49
|
+
|
50
|
+
/* Append `value` by storing it at index `size` (delegates to ary_set). */
void ary_append(Array *ary, void *value)
{
  const int tail = ary->size;
  ary_set(ary, tail, value);
}
|
54
|
+
|
55
|
+
/* Return the element at `index`, or NULL when `index` is at or past `size`. */
void *ary_get(Array *ary, int index)
{
  return (index < ary->size) ? ary->elems[index] : NULL;
}
|
61
|
+
|
62
|
+
/*
 * Release the element at `index` and leave a NULL hole in its place.
 *
 * Out-of-range indices (negative, or at/after `size`) are ignored. The
 * element is only passed to free_elem when a callback is set and the slot
 * is non-NULL, matching the guards in ary_set() and ary_destroy();
 * ary_create() explicitly allows a NULL callback. `size` shrinks by one
 * only when the last slot was deleted; other elements are not shifted.
 */
void ary_delete(Array *ary, int index)
{
  if (index < 0 || index >= ary->size)
    return;

  if (ary->free_elem != NULL && ary->elems[index] != NULL)
    ary->free_elem(ary->elems[index]);
  ary->elems[index] = NULL;

  if (index == ary->size - 1)
    ary->size--;
}
|
71
|
+
|
72
|
+
/*
 * Remove the element at `index` without freeing it, shift the following
 * elements down by one and return the removed pointer (ownership passes to
 * the caller). Returns NULL for an out-of-range index.
 *
 * After the shift the old last slot is cleared. Without this it would keep
 * a stale copy of the pointer that now lives at size - 1, and the next
 * ary_append()/ary_set() on that slot would hand the still-live element to
 * free_elem.
 */
void *ary_remove(Array *ary, int index)
{
  void *removed;

  if (index < 0 || index >= ary->size)
    return NULL;

  removed = ary->elems[index];
  ary->size--;
  memmove(&ary->elems[index], &ary->elems[index + 1],
          sizeof(void *) * (ary->size - index));
  ary->elems[ary->size] = NULL; /* vacated tail slot */
  return removed;
}
|
data/ext/array.h
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef FRT_ARRAY_H
|
2
|
+
#define FRT_ARRAY_H
|
3
|
+
|
4
|
+
/* Growable array of untyped element pointers. */
typedef struct Array {
  void **elems;               /* slot storage; NULL when nothing is allocated */
  int size;                   /* one past the highest index in use */
  int allocated;              /* number of slots in elems */
  void (*free_elem)(void *p); /* releases an element; may be NULL */
} Array;
|
10
|
+
|
11
|
+
Array *ary_create(int size, void (*free_elem)(void *p));
|
12
|
+
void ary_destroy(void *p);
|
13
|
+
void ary_set(Array *ary, int index, void *value);
|
14
|
+
void ary_append(Array *ary, void *value);
|
15
|
+
void *ary_get(Array *ary, int index);
|
16
|
+
void ary_delete(Array *ary, int index);
|
17
|
+
void *ary_remove(Array *ary, int index);
|
18
|
+
|
19
|
+
#endif
|
data/ext/bitvector.c
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
#include <bitvector.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
BitVector *bv_create_size(int size)
|
5
|
+
{
|
6
|
+
BitVector *bv = ALLOC(BitVector);
|
7
|
+
|
8
|
+
bv->capa = (size >> 3) + 1;
|
9
|
+
bv->bits = ALLOC_N(uchar, bv->capa);
|
10
|
+
memset(bv->bits, 0, bv->capa);
|
11
|
+
|
12
|
+
bv->size = 0;
|
13
|
+
bv->count = 0;
|
14
|
+
bv->curr_bit = -1;
|
15
|
+
return bv;
|
16
|
+
}
|
17
|
+
|
18
|
+
BitVector *bv_create()
|
19
|
+
{
|
20
|
+
return bv_create_size(BV_INIT_CAPA);
|
21
|
+
}
|
22
|
+
|
23
|
+
void bv_destroy(void *p)
|
24
|
+
{
|
25
|
+
BitVector *bv = (BitVector *)p;
|
26
|
+
free(bv->bits);
|
27
|
+
free(bv);
|
28
|
+
}
|
29
|
+
|
30
|
+
/*
 * Set bit number `bit`, growing the vector when necessary.
 *
 * `size` is raised to cover the byte holding the bit. Once size reaches the
 * capacity, the capacity is doubled until it is no smaller than size and
 * the newly added bytes are zeroed. `count` is incremented only when the
 * bit was previously clear.
 */
void bv_set(BitVector *bv, int bit)
{
  const int byte_idx = bit>>3;
  const uchar mask = 1<<(bit&7);
  uchar *target;

  if (byte_idx >= bv->size) {
    bv->size = byte_idx + 1;
    if (bv->capa <= bv->size) {
      int new_capa = bv->capa * 2;
      while (new_capa < bv->size) {
        new_capa *= 2;
      }
      REALLOC_N(bv->bits, uchar, new_capa);
      memset(bv->bits + bv->capa, 0, new_capa - bv->capa);
      bv->capa = new_capa;
    }
  }

  target = &(bv->bits[byte_idx]);
  if ((*target & mask) == 0) {
    bv->count++;
    *target |= mask;
  }
}
|
50
|
+
|
51
|
+
/* Return 1 if bit number `bit` is set, otherwise 0. Bits whose byte lies at
 * or beyond `size` read as 0. */
int bv_get(BitVector *bv, int bit)
{
  const int byte_idx = bit>>3;

  return (byte_idx < bv->size) ? ((bv->bits[byte_idx]>>(bit&7))&1) : 0;
}
|
57
|
+
|
58
|
+
/* Clear every bit in the used part of the vector (`size` bytes) and reset
 * the set-bit count. `size` itself is left unchanged. */
void bv_clear(BitVector *bv)
{
  bv->count = 0;
  memset(bv->bits, 0, bv->size);
}
|
63
|
+
|
64
|
+
/* Clear bit number `bit`. Does nothing when the bit's byte lies at or beyond
 * `size`. `count` is decremented only when the bit was previously set. */
void bv_unset(BitVector *bv, int bit)
{
  const int byte_idx = bit>>3;

  if (byte_idx < bv->size) {
    uchar *target = &(bv->bits[byte_idx]);
    const uchar mask = 1<<(bit&7);

    if ((mask & *target) > 0) {
      bv->count--;
      *target &= ~mask;
    }
  }
}
|
75
|
+
|
76
|
+
/* Write the vector to a new output named `name` in `store`: the used byte
 * count as a vint, followed by that many raw bytes. */
void bv_write(BitVector *bv, Store *store, char *name)
{
  OutStream *out = store->create_output(store, name);

  os_write_vint(out, bv->size);
  os_write_bytes(out, bv->bits, bv->size);
  os_close(out);
}
|
83
|
+
|
84
|
+
/* Lookup table: BYTE_COUNTS[b] is the number of 1-bits in the byte value b
 * (0..255). Laid out as 16 rows of 16 entries; used by bv_count(). */
const uchar BYTE_COUNTS[] = {     // table of bits/char
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
|
102
|
+
|
103
|
+
/* Recount the set bits by summing the per-byte population counts over the
 * used bytes, store the total in `count` and return it. */
int bv_count(BitVector *bv)
{
  const uchar *cursor = bv->bits;
  int remaining = bv->size;
  int total = 0;

  while (remaining-- > 0) {
    total += BYTE_COUNTS[*cursor++];   /* bits set in this byte */
  }
  bv->count = total;
  return total;
}
|
113
|
+
|
114
|
+
/*
 * Read a BitVector previously written by bv_write() from the input named
 * `name` in `store`: a vint byte count followed by that many raw bytes.
 *
 * The capacity is kept one byte larger than the stored size, as in
 * bv_create_size(). With the old `capa == size`, a stored size of 0 left
 * capa at 0 and the capacity-doubling loop in bv_set() could never
 * terminate. The spare byte is zeroed because bv_set() assumes bytes between
 * size and capa are clear. The scan cursor is reset so bv_scan_next() does
 * not read an uninitialized field.
 */
BitVector *bv_read(Store *store, char *name)
{
  BitVector *bv = ALLOC(BitVector);
  InStream *is = store->open_input(store, name);

  bv->size = is_read_vint(is);
  bv->capa = bv->size + 1;
  bv->bits = ALLOC_N(uchar, bv->capa);
  is_read_bytes(is, bv->bits, 0, bv->size);
  memset(bv->bits + bv->size, 0, bv->capa - bv->size);
  is_close(is);

  bv->curr_bit = -1;
  bv_count(bv);
  return bv;
}
|
125
|
+
|
126
|
+
/* Rewind the scan cursor so the next bv_scan_next() starts from bit 0. */
void bv_scan_reset(BitVector *bv)
{
  const int before_first = -1;
  bv->curr_bit = before_first;
}
|
130
|
+
|
131
|
+
/*
 * Find the first set bit at position `from` or later.
 *
 * Returns the bit number and records it in bv->curr_bit, or returns -1
 * (leaving curr_bit untouched) when no set bit remains within `size` bytes.
 * NOTE(review): `from` is not checked for negative values; the only caller
 * visible here, bv_scan_next(), passes curr_bit + 1.
 */
inline int bv_scan_next_from(BitVector *bv, register const int from)
{
  register const uchar *const bits = bv->bits;
  register const int size = bv->size;
  register int byte_pos = (from) >> 3;  /* byte containing `from` */
  register int inc = ((from) & 7);      /* bit offset inside that byte */
  register int bit = 1 << inc;          /* single-bit probe at that offset */
  register int mask = 0xff << inc;      /* selects bits at or above the offset */
  register int byte;

  if (byte_pos >= size) return -1;
  if ((bits[byte_pos]&mask) == 0) {
    /* nothing left in the starting byte: skip ahead to the next non-zero
     * byte and restart the probe at its lowest bit */
    inc = 0;
    bit = 1;
    do {
      byte_pos++;
      if (byte_pos >= size) return -1;
    } while (bits[byte_pos] == 0);
  }

  /* this byte is known to hold a set bit at or above `inc`; walk up to it */
  byte = bits[byte_pos];
  while ((byte & bit) == 0) {
    bit <<= 1;
    inc++;
  }

  return bv->curr_bit = ((byte_pos << 3) + inc);
}
|
159
|
+
|
160
|
+
/* Continue a scan: return the next set bit after the one last reported
 * (the bit following curr_bit), or -1 when none is left. */
inline int bv_scan_next(BitVector *bv)
{
  const int resume_at = bv->curr_bit+1;
  return bv_scan_next_from(bv, resume_at);
}
|
164
|
+
|
data/ext/bitvector.h
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#ifndef FRT_BIT_VECTOR_H
|
2
|
+
#define FRT_BIT_VECTOR_H
|
3
|
+
|
4
|
+
#include <global.h>
|
5
|
+
#include <store.h>
|
6
|
+
|
7
|
+
#define BV_INIT_CAPA 256
|
8
|
+
/* Growable bit set backed by a byte array. */
typedef struct BitVector {
  uchar *bits;   /* bit storage, 8 bits per byte */
  int size;      /* number of bytes in use */
  int capa;      /* number of bytes allocated */
  int count;     /* number of set bits, maintained by bv_set/bv_unset/bv_count */
  int curr_bit;  /* last bit returned by a scan; -1 after bv_scan_reset */
} BitVector;
|
15
|
+
|
16
|
+
BitVector *bv_create();
|
17
|
+
BitVector *bv_create_size(int size);
|
18
|
+
void bv_destroy(void *bv);
|
19
|
+
void bv_set(BitVector *bv, int bit);
|
20
|
+
int bv_get(BitVector *bv, int bit);
|
21
|
+
void bv_clear(BitVector *bv);
|
22
|
+
void bv_unset(BitVector *bv, int bit);
|
23
|
+
void bv_write(BitVector *bv, Store *store, char *name);
|
24
|
+
BitVector *bv_read(Store *store, char *name);
|
25
|
+
void bv_scan_reset(BitVector *bv);
|
26
|
+
int bv_scan_next(BitVector *bv);
|
27
|
+
int bv_scan_next_from(BitVector *bv, int from);
|
28
|
+
|
29
|
+
#endif
|
data/ext/compound_io.c
ADDED
@@ -0,0 +1,335 @@
|
|
1
|
+
#include "index.h"
|
2
|
+
|
3
|
+
/****************************************************************************
|
4
|
+
*
|
5
|
+
* CompoundStore
|
6
|
+
*
|
7
|
+
****************************************************************************/
|
8
|
+
|
9
|
+
/* Directory entry for one sub-file inside an opened compound file. */
typedef struct FileEntry {
  int offset;  /* where the sub-file's data starts in the compound stream */
  int length;  /* data length, derived from the next entry's offset (or the stream length) */
} FileEntry;
|
13
|
+
|
14
|
+
/* Forward a touch request to the store that holds the compound file. */
void cmpd_touch(Store *store, char *filename)
{
  Store *backing = store->dir.cmpd->store;

  backing->touch(backing, filename);
}
|
18
|
+
|
19
|
+
/* Report whether the compound file's directory has an entry for `filename`. */
int cmpd_exists(Store *store, char *filename)
{
  if (h_get(store->dir.cmpd->entries, filename) == NULL) {
    return false;
  }
  return true;
}
|
26
|
+
|
27
|
+
/* Removing a file is not supported on a compound store: reports
 * UNSUPPORTED_ERROR and yields 0. */
int cmpd_remove(Store *store, char *filename)
{
  (void)store;
  (void)filename;
  eprintf(UNSUPPORTED_ERROR, "Unsupported operation");
  return 0;
}
|
32
|
+
|
33
|
+
/* Renaming a file is not supported on a compound store: reports
 * UNSUPPORTED_ERROR and yields 0. */
int cmpd_rename(Store *store, char *from, char *to)
{
  (void)store;
  (void)from;
  (void)to;
  eprintf(UNSUPPORTED_ERROR, "Unsupported operation");
  return 0;
}
|
38
|
+
|
39
|
+
/* Number of entries in the compound file's directory table. */
int cmpd_count(Store *store)
{
  CompoundStore *cmpd = store->dir.cmpd;

  return cmpd->entries->used;
}
|
43
|
+
|
44
|
+
/* Clearing is not supported on a compound store: reports
 * UNSUPPORTED_ERROR. */
void cmpd_clear(Store *store)
{
  (void)store;
  eprintf(UNSUPPORTED_ERROR, "Unsupported operation");
}
|
48
|
+
|
49
|
+
/*
 * Close a compound store: destroy the directory table, close the underlying
 * compound stream, free the CompoundStore and destroy the Store wrapper.
 *
 * The store mutex is held while the compound state is torn down and is
 * released before store_destroy(). Previously it was never unlocked, so the
 * store was destroyed with its own mutex still held.
 * NOTE(review): assumes store_destroy() releases the mutex object - confirm
 * in store.c.
 *
 * If the stream is already closed the mutex is released before the error is
 * reported, as in cmpd_open_input(). The function then returns rather than
 * falling through to is_close(NULL), in case eprintf() returns.
 */
void cmpd_close(Store *store)
{
  CompoundStore *cmpd;

  mutex_lock(&store->mutex);
  cmpd = store->dir.cmpd;
  if (cmpd->stream == NULL) {
    mutex_unlock(&store->mutex);
    eprintf(IO_ERROR, "Already closed");
    return;
  }

  h_destroy(cmpd->entries);

  is_close(cmpd->stream);
  cmpd->stream = NULL;
  free(store->dir.cmpd);
  mutex_unlock(&store->mutex);
  store_destroy(store);
}
|
63
|
+
|
64
|
+
/* Length of the sub-file `filename` as recorded in the directory table, or
 * 0 when there is no such entry. */
int cmpd_length(Store *store, char *filename)
{
  FileEntry *found = (FileEntry *)h_get(store->dir.cmpd->entries, filename);

  return (found == NULL) ? 0 : found->length;
}
|
72
|
+
|
73
|
+
/* Intentionally empty: cmpdi_read_internal() seeks the shared sub-stream
 * itself before every read, so nothing has to happen here. */
void cmpdi_seek_internal(InStream *is, int pos)
{
}
|
74
|
+
/* Release the per-stream CompoundInStream record. The sub-stream is not
 * closed here: cmpd_create_input() stores the caller's stream pointer
 * without cloning it, so this record does not own it. */
void cmpdi_close_internal(InStream *is)
{
  CompoundInStream *record = is->d.cis;

  free(record);
}
|
79
|
+
|
80
|
+
/* Give `new_is` its own CompoundInStream record copied from `is`. The
 * sub-stream pointer is shared between the two, not cloned. */
void cmpdi_clone_internal(InStream *is, InStream *new_is)
{
  CompoundInStream *copy = ALLOC(CompoundInStream);
  CompoundInStream *source = is->d.cis;

  copy->length = source->length;
  copy->offset = source->offset;
  copy->sub = source->sub;
  new_is->d.cis = copy;
}
|
89
|
+
|
90
|
+
/* Length of the sub-file this stream is a window onto. */
int cmpdi_length_internal(InStream *is)
{
  CompoundInStream *record = is->d.cis;

  return record->length;
}
|
94
|
+
|
95
|
+
/*
 * Read `len` bytes into `b` (starting at `offset` in the buffer) from the
 * current position of this sub-file. Reports EOF_ERROR when the read would
 * pass the sub-file's length; otherwise positions the shared sub-stream at
 * the sub-file offset plus the current position and reads from it.
 */
void cmpdi_read_internal(InStream *is, uchar *b, int offset, int len)
{
  CompoundInStream *window = is->d.cis;
  int from = is_pos(is);

  if (window->length < (from + len)) {
    eprintf(EOF_ERROR, "read past EOF");
  }
  is_seek(window->sub, window->offset + from);
  is_read_bytes(window->sub, b, offset, len);
}
|
104
|
+
|
105
|
+
/*
 * Build an InStream that exposes the `length` bytes at `offset` of `sub_is`
 * as a stream of its own. The sub-stream pointer is stored as-is (not
 * cloned) and the cmpdi_* callbacks are installed.
 */
InStream *cmpd_create_input(InStream *sub_is, int offset, int length)
{
  InStream *view = is_create();
  CompoundInStream *window = ALLOC(CompoundInStream);

  window->length = length;
  window->offset = offset;
  window->sub = sub_is;

  view->file = NULL;
  view->d.cis = window;

  view->length_internal = &cmpdi_length_internal;
  view->clone_internal = &cmpdi_clone_internal;
  view->close_internal = &cmpdi_close_internal;
  view->seek_internal = &cmpdi_seek_internal;
  view->read_internal = &cmpdi_read_internal;
  return view;
}
|
123
|
+
|
124
|
+
/*
 * Open the sub-file `filename` of a compound store for reading.
 *
 * Runs under the store mutex. Reports IO_ERROR (after releasing the mutex)
 * when the compound stream has been closed or the directory has no entry
 * for `filename`; otherwise returns a new stream over that entry's
 * offset/length.
 */
InStream *cmpd_open_input(Store *store, const char *filename)
{
  CompoundStore *cmpd = store->dir.cmpd;
  FileEntry *found;
  InStream *result;

  mutex_lock(&store->mutex);
  if (cmpd->stream == NULL) {
    mutex_unlock(&store->mutex);
    eprintf(IO_ERROR, "Stream closed");
  }

  found = (FileEntry *)h_get(cmpd->entries, filename);
  if (found == NULL) {
    mutex_unlock(&store->mutex);
    eprintf(IO_ERROR, "No sub-file with id <%s> found", filename);
  }

  result = cmpd_create_input(cmpd->stream, found->offset, found->length);
  mutex_unlock(&store->mutex);

  return result;
}
|
146
|
+
|
147
|
+
/* Creating outputs is not supported on a compound store: reports
 * UNSUPPORTED_ERROR and yields NULL. */
OutStream *cmpd_create_output(Store *store, const char *filename)
{
  (void)store;
  (void)filename;
  eprintf(UNSUPPORTED_ERROR, "Unsupported operation");
  return NULL;
}
|
152
|
+
|
153
|
+
/* Locks are not supported on a compound store: reports UNSUPPORTED_ERROR
 * and yields NULL. */
Lock *cmpd_open_lock(Store *store, char *lockname)
{
  (void)store;
  (void)lockname;
  eprintf(UNSUPPORTED_ERROR, "Unsupported operation");
  return NULL;
}
|
158
|
+
|
159
|
+
/* Locks are not supported on a compound store: reports UNSUPPORTED_ERROR. */
void cmpd_close_lock(Lock *lock)
{
  (void)lock;
  eprintf(UNSUPPORTED_ERROR, "Unsupported operation");
}
|
163
|
+
|
164
|
+
/*
 * Open the compound file `name` held in `store` and return a Store that
 * exposes its sub-files.
 *
 * The compound file starts with a vint entry count followed by, per entry,
 * a long data offset and a string file name. Each entry's length is the gap
 * to the next entry's offset; the last entry runs to the end of the stream.
 * The directory table is created with efree for both keys and values.
 */
Store *open_cmpd_store(Store *store, const char *name)
{
  CompoundStore *cmpd = ALLOC(CompoundStore);
  Store *new_store = store_create();
  InStream *is;
  FileEntry *prev = NULL;
  int count, i;

  cmpd->store = store;
  cmpd->name = name;
  cmpd->entries = h_new_str(&efree, &efree);
  is = cmpd->stream = store->open_input(store, cmpd->name);

  // read the directory and init files
  count = is_read_vint(is);
  for (i = 0; i < count; i++) {
    int offset = is_read_long(is);
    char *fname = is_read_string(is);
    FileEntry *cur;

    // the previous entry ends where this one starts
    if (prev != NULL) {
      prev->length = offset - prev->offset;
    }

    cur = ALLOC(FileEntry);
    cur->offset = offset;
    h_set(cmpd->entries, fname, cur);
    prev = cur;
  }

  // the final entry extends to the end of the compound stream
  if (prev != NULL) {
    prev->length = is_length(is) - prev->offset;
  }

  new_store->dir.cmpd = cmpd;
  new_store->touch = &cmpd_touch;
  new_store->exists = &cmpd_exists;
  new_store->remove = &cmpd_remove;
  new_store->rename = &cmpd_rename;
  new_store->count = &cmpd_count;
  new_store->close = &cmpd_close;
  new_store->clear = &cmpd_clear;
  new_store->length = &cmpd_length;
  new_store->create_output = &cmpd_create_output;
  new_store->open_input = &cmpd_open_input;
  new_store->open_lock = &cmpd_open_lock;
  new_store->close_lock = &cmpd_close_lock;
  return new_store;
}
|
212
|
+
|
213
|
+
/****************************************************************************
|
214
|
+
*
|
215
|
+
* CompoundWriter
|
216
|
+
*
|
217
|
+
****************************************************************************/
|
218
|
+
|
219
|
+
/* Bookkeeping for one file queued in a CompoundWriter. */
typedef struct WFileEntry {
  char *name;       /* file name as passed to cw_add_file(); not copied */
  int dir_offset;   /* position of this file's directory record in the output */
  int data_offset;  /* position where this file's data starts in the output */
} WFileEntry;
|
224
|
+
|
225
|
+
WFileEntry *wfe_create(char *name)
|
226
|
+
{
|
227
|
+
WFileEntry *wfe = ALLOC(WFileEntry);
|
228
|
+
wfe->name = name;
|
229
|
+
return wfe;
|
230
|
+
}
|
231
|
+
|
232
|
+
void wfe_destroy(void *p)
|
233
|
+
{
|
234
|
+
WFileEntry *wfe = (WFileEntry *)p;
|
235
|
+
efree(wfe);
|
236
|
+
}
|
237
|
+
|
238
|
+
/* Create a CompoundWriter that will write the compound file `name` into
 * `store`. It starts unmerged, with an empty id set and an empty
 * file-entry array whose elements are released by wfe_destroy. */
CompoundWriter *open_cw(Store *store, char *name)
{
  CompoundWriter *writer = ALLOC(CompoundWriter);

  writer->name = name;
  writer->store = store;
  writer->merged = false;
  writer->ids = hs_str_create(NULL);
  writer->file_entries = ary_create(1, &wfe_destroy);
  return writer;
}
|
248
|
+
|
249
|
+
/*
 * Queue the file `id` for inclusion in the compound file.
 *
 * Reports STATE_ERROR when the writer has already been merged, or when `id`
 * was added before. The duplicate case used to reuse the "Already merged"
 * message, and the id was then added to the set a second time; the single
 * hs_add() below both tests for and records the id.
 */
void cw_add_file(CompoundWriter *cw, char *id)
{
  if (cw->merged) eprintf(STATE_ERROR, "Already merged");
  if (hs_add(cw->ids, id) != HASH_KEY_DOES_NOT_EXIST)
    eprintf(STATE_ERROR, "File <%s> already added", id);

  ary_append(cw->file_entries, wfe_create(id));
}
|
258
|
+
|
259
|
+
/*
 * Copy the whole of the file described by `src` from the writer's store
 * into `os`, BUFFER_SIZE bytes at a time.
 *
 * The input stream is closed as soon as copying is done, before either
 * consistency check can report IO_ERROR. The error messages now use
 * conversions that match their int arguments, and the second message prints
 * both values it is given.
 */
void cw_copy_file(CompoundWriter *cw, WFileEntry *src, OutStream *os)
{
  uchar buffer[BUFFER_SIZE];
  int start_ptr = os_pos(os);
  InStream *is = cw->store->open_input(cw->store, src->name);
  int length = is_length(is);
  int remainder = length;
  int len, end_ptr, diff;

  while (remainder > 0) {
    len = MIN(remainder, BUFFER_SIZE);
    is_read_bytes(is, buffer, 0, len);
    os_write_bytes(os, buffer, len);
    remainder -= len;
  }
  is_close(is);

  // Verify that remainder is 0
  if (remainder != 0)
    eprintf(IO_ERROR, "Non-zero remainder length after copying: %d "
            "(id:%s, length: %d, buffer size: %d)\n", remainder,
            src->name, length, (int)BUFFER_SIZE);

  // Verify that the output length diff is equal to original file
  end_ptr = os_pos(os);
  diff = end_ptr - start_ptr;
  if (diff != length)
    eprintf(IO_ERROR, "Difference in the output file offsets %d "
            "does not match the original file length %d", diff, length);
}
|
291
|
+
|
292
|
+
/*
 * Merge all queued files into the compound file and free the writer.
 *
 * Reports STATE_ERROR when already merged or when no files were added.
 * Output layout: a vint entry count; one directory record per file (a long
 * data offset, written as 0 at first, then the file name); then each file's
 * data. Finally the directory records are revisited and the real data
 * offsets written in.
 */
void cw_close(CompoundWriter *cw)
{
  OutStream *out;
  WFileEntry *entry;
  int idx;

  if (cw->merged) eprintf(STATE_ERROR, "Already merged");
  if (cw->ids->size <= 0)
    eprintf(STATE_ERROR, "No Files to merge into the compound file");

  cw->merged = true;

  out = cw->store->create_output(cw->store, cw->name);
  os_write_vint(out, cw->file_entries->size);

  /* Pass 1: directory with placeholder offsets; remember where each record
   * sits so that it can be patched in pass 3 */
  idx = 0;
  while (idx < cw->file_entries->size) {
    entry = (WFileEntry *)cw->file_entries->elems[idx];
    entry->dir_offset = os_pos(out);
    os_write_long(out, 0);  // for now
    os_write_string(out, entry->name);
    idx++;
  }

  /* Pass 2: file contents; remember where each file's data begins */
  idx = 0;
  while (idx < cw->file_entries->size) {
    entry = (WFileEntry *)cw->file_entries->elems[idx];
    entry->data_offset = os_pos(out);
    cw_copy_file(cw, entry, out);
    idx++;
  }

  /* Pass 3: patch the real data offsets into the directory records */
  idx = 0;
  while (idx < cw->file_entries->size) {
    entry = (WFileEntry *)cw->file_entries->elems[idx];
    os_seek(out, entry->dir_offset);
    os_write_long(out, entry->data_offset);
    idx++;
  }

  os_close(out);
  hs_destroy(cw->ids);
  ary_destroy(cw->file_entries);
  free(cw);
}
|
335
|
+
|