ferret 0.11.5 → 0.11.6
This diff shows the changes in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- data/ext/analysis.c +12 -12
- data/ext/analysis.h +4 -4
- data/ext/index.c +11 -9
- data/ext/r_index.c +2 -2
- data/ext/store.c +49 -0
- data/ext/store.h +25 -0
- data/lib/ferret_version.rb +1 -1
- metadata +2 -2
data/ext/analysis.c
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
****************************************************************************/
|
|
14
14
|
|
|
15
15
|
INLINE Token *tk_set(Token *tk,
|
|
16
|
-
char *text, int tlen,
|
|
16
|
+
char *text, int tlen, off_t start, off_t end, int pos_inc)
|
|
17
17
|
{
|
|
18
18
|
if (tlen >= MAX_WORD_SIZE) {
|
|
19
19
|
tlen = MAX_WORD_SIZE - 1;
|
|
@@ -31,16 +31,16 @@ INLINE Token *tk_set_ts(Token *tk,
|
|
|
31
31
|
char *start, char *end, char *text, int pos_inc)
|
|
32
32
|
{
|
|
33
33
|
return tk_set(tk, start, (int)(end - start),
|
|
34
|
-
(
|
|
34
|
+
(off_t)(start - text), (off_t)(end - text), pos_inc);
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
INLINE Token *tk_set_no_len(Token *tk,
|
|
38
|
-
char *text,
|
|
38
|
+
char *text, off_t start, off_t end, int pos_inc)
|
|
39
39
|
{
|
|
40
40
|
return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
|
41
41
|
}
|
|
42
42
|
|
|
43
|
-
INLINE Token *w_tk_set(Token *tk, wchar_t *text,
|
|
43
|
+
INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start, off_t end,
|
|
44
44
|
int pos_inc)
|
|
45
45
|
{
|
|
46
46
|
int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
|
|
@@ -374,8 +374,8 @@ static Token *mb_wst_next_lc(TokenStream *ts)
|
|
|
374
374
|
}
|
|
375
375
|
*w = 0;
|
|
376
376
|
ts->t = t;
|
|
377
|
-
return w_tk_set(&(CTS(ts)->token), wbuf, (
|
|
378
|
-
(
|
|
377
|
+
return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
|
|
378
|
+
(off_t)(t - ts->text), 1);
|
|
379
379
|
}
|
|
380
380
|
|
|
381
381
|
TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
|
|
@@ -512,8 +512,8 @@ Token *mb_lt_next_lc(TokenStream *ts)
|
|
|
512
512
|
}
|
|
513
513
|
*w = 0;
|
|
514
514
|
ts->t = t;
|
|
515
|
-
return w_tk_set(&(CTS(ts)->token), wbuf, (
|
|
516
|
-
(
|
|
515
|
+
return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
|
|
516
|
+
(off_t)(t - ts->text), 1);
|
|
517
517
|
}
|
|
518
518
|
|
|
519
519
|
TokenStream *mb_letter_tokenizer_new(bool lowercase)
|
|
@@ -926,8 +926,8 @@ static Token *std_next(TokenStream *ts)
|
|
|
926
926
|
}
|
|
927
927
|
ts->t = t + len;
|
|
928
928
|
token[len] = 0;
|
|
929
|
-
return tk_set(&(CTS(ts)->token), token, len, (
|
|
930
|
-
(
|
|
929
|
+
return tk_set(&(CTS(ts)->token), token, len, (off_t)(start - ts->text),
|
|
930
|
+
(off_t)(ts->t - ts->text), 1);
|
|
931
931
|
}
|
|
932
932
|
|
|
933
933
|
/* now see how long a url we can find. */
|
|
@@ -974,8 +974,8 @@ static Token *std_next(TokenStream *ts)
|
|
|
974
974
|
}
|
|
975
975
|
}
|
|
976
976
|
tk_set(&(CTS(ts)->token), token, token_i,
|
|
977
|
-
(
|
|
978
|
-
(
|
|
977
|
+
(off_t)(start - ts->text),
|
|
978
|
+
(off_t)(t - ts->text), 1);
|
|
979
979
|
}
|
|
980
980
|
else { /* just return the url as is */
|
|
981
981
|
tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
data/ext/analysis.h
CHANGED
|
@@ -16,16 +16,16 @@ typedef struct Token
|
|
|
16
16
|
{
|
|
17
17
|
char text[MAX_WORD_SIZE];
|
|
18
18
|
int len;
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
off_t start;
|
|
20
|
+
off_t end;
|
|
21
21
|
int pos_inc;
|
|
22
22
|
} Token;
|
|
23
23
|
|
|
24
24
|
extern Token *tk_new();
|
|
25
25
|
extern void tk_destroy(void *p);
|
|
26
|
-
extern Token *tk_set(Token *tk, char *text, int tlen,
|
|
26
|
+
extern Token *tk_set(Token *tk, char *text, int tlen, off_t start, off_t end,
|
|
27
27
|
int pos_inc);
|
|
28
|
-
extern Token *tk_set_no_len(Token *tk, char *text,
|
|
28
|
+
extern Token *tk_set_no_len(Token *tk, char *text, off_t start, off_t end,
|
|
29
29
|
int pos_inc);
|
|
30
30
|
extern int tk_eq(Token *tk1, Token *tk2);
|
|
31
31
|
extern int tk_cmp(Token *tk1, Token *tk2);
|
data/ext/index.c
CHANGED
|
@@ -1450,10 +1450,12 @@ TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
|
|
|
1450
1450
|
if (store_offsets) {
|
|
1451
1451
|
int num_positions = tv->offset_cnt = is_read_vint(fdt_in);
|
|
1452
1452
|
Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
|
|
1453
|
-
|
|
1453
|
+
long long offset = 0;
|
|
1454
1454
|
for (i = 0; i < num_positions; i++) {
|
|
1455
|
-
offsets[i].start =
|
|
1456
|
-
|
|
1455
|
+
offsets[i].start =
|
|
1456
|
+
(off_t)(offset += (long long)is_read_vll(fdt_in));
|
|
1457
|
+
offsets[i].end =
|
|
1458
|
+
(off_t)(offset += (long long)is_read_vll(fdt_in));
|
|
1457
1459
|
}
|
|
1458
1460
|
}
|
|
1459
1461
|
}
|
|
@@ -1681,13 +1683,13 @@ void fw_add_postings(FieldsWriter *fw,
|
|
|
1681
1683
|
|
|
1682
1684
|
if (fi_store_offsets(fi)) {
|
|
1683
1685
|
/* use delta encoding for offsets */
|
|
1684
|
-
|
|
1686
|
+
long long last_end = 0;
|
|
1685
1687
|
os_write_vint(fdt_out, offset_count); /* write shared prefix length */
|
|
1686
1688
|
for (i = 0; i < offset_count; i++) {
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1689
|
+
long long start = (long long)offsets[i].start;
|
|
1690
|
+
long long end = (long long)offsets[i].end;
|
|
1691
|
+
os_write_vll(fdt_out, (unsigned long long)(start - last_end));
|
|
1692
|
+
os_write_vll(fdt_out, (unsigned long long)(end - start));
|
|
1691
1693
|
last_end = end;
|
|
1692
1694
|
}
|
|
1693
1695
|
}
|
|
@@ -5197,7 +5199,7 @@ static void dw_add_posting(MemoryPool *mp,
|
|
|
5197
5199
|
}
|
|
5198
5200
|
}
|
|
5199
5201
|
|
|
5200
|
-
static INLINE void dw_add_offsets(DocWriter *dw, int pos,
|
|
5202
|
+
static INLINE void dw_add_offsets(DocWriter *dw, int pos, off_t start, off_t end)
|
|
5201
5203
|
{
|
|
5202
5204
|
if (pos >= dw->offsets_capa) {
|
|
5203
5205
|
int old_capa = dw->offsets_capa;
|
data/ext/r_index.c
CHANGED
|
@@ -1192,8 +1192,8 @@ static VALUE
|
|
|
1192
1192
|
frt_get_tv_offsets(Offset *offset)
|
|
1193
1193
|
{
|
|
1194
1194
|
return rb_struct_new(cTVOffsets,
|
|
1195
|
-
|
|
1196
|
-
|
|
1195
|
+
ULL2NUM((unsigned long long)offset->start),
|
|
1196
|
+
ULL2NUM((unsigned long long)offset->end),
|
|
1197
1197
|
NULL);
|
|
1198
1198
|
}
|
|
1199
1199
|
|
data/ext/store.c
CHANGED
|
@@ -403,6 +403,36 @@ INLINE off_t is_read_voff_t(InStream *is)
|
|
|
403
403
|
return res;
|
|
404
404
|
}
|
|
405
405
|
|
|
406
|
+
/* optimized to use unchecked read_byte if there is definitely space */
|
|
407
|
+
INLINE unsigned long long is_read_vll(InStream *is)
|
|
408
|
+
{
|
|
409
|
+
register unsigned long long res, b;
|
|
410
|
+
register int shift = 7;
|
|
411
|
+
|
|
412
|
+
if (is->buf.pos > (is->buf.len - VINT_MAX_LEN)) {
|
|
413
|
+
b = is_read_byte(is);
|
|
414
|
+
res = b & 0x7F; /* 0x7F = 0b01111111 */
|
|
415
|
+
|
|
416
|
+
while ((b & 0x80) != 0) { /* 0x80 = 0b10000000 */
|
|
417
|
+
b = is_read_byte(is);
|
|
418
|
+
res |= (b & 0x7F) << shift;
|
|
419
|
+
shift += 7;
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
else { /* unchecked optimization */
|
|
423
|
+
b = read_byte(is);
|
|
424
|
+
res = b & 0x7F; /* 0x7F = 0b01111111 */
|
|
425
|
+
|
|
426
|
+
while ((b & 0x80) != 0) { /* 0x80 = 0b10000000 */
|
|
427
|
+
b = read_byte(is);
|
|
428
|
+
res |= (b & 0x7F) << shift;
|
|
429
|
+
shift += 7;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
return res;
|
|
434
|
+
}
|
|
435
|
+
|
|
406
436
|
INLINE void is_skip_vints(InStream *is, register int cnt)
|
|
407
437
|
{
|
|
408
438
|
for (; cnt > 0; cnt--) {
|
|
@@ -545,6 +575,25 @@ INLINE void os_write_voff_t(OutStream *os, register off_t num)
|
|
|
545
575
|
}
|
|
546
576
|
}
|
|
547
577
|
|
|
578
|
+
/* optimized to use an unchecked write if there is space */
|
|
579
|
+
INLINE void os_write_vll(OutStream *os, register unsigned long long num)
|
|
580
|
+
{
|
|
581
|
+
if (os->buf.pos > VINT_END) {
|
|
582
|
+
while (num > 127) {
|
|
583
|
+
os_write_byte(os, (uchar)((num & 0x7f) | 0x80));
|
|
584
|
+
num >>= 7;
|
|
585
|
+
}
|
|
586
|
+
os_write_byte(os, (uchar)num);
|
|
587
|
+
}
|
|
588
|
+
else {
|
|
589
|
+
while (num > 127) {
|
|
590
|
+
write_byte(os, (uchar)((num & 0x7f) | 0x80));
|
|
591
|
+
num >>= 7;
|
|
592
|
+
}
|
|
593
|
+
write_byte(os, (uchar)num);
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
|
|
548
597
|
void os_write_string(OutStream *os, char *str)
|
|
549
598
|
{
|
|
550
599
|
int len = (int)strlen(str);
|
data/ext/store.h
CHANGED
|
@@ -176,7 +176,11 @@ struct Store
|
|
|
176
176
|
CompoundStore *cmpd; /* for compound_store only */
|
|
177
177
|
} dir;
|
|
178
178
|
|
|
179
|
+
#ifdef POSH_OS_WIN32
|
|
180
|
+
int file_mode;
|
|
181
|
+
#else
|
|
179
182
|
mode_t file_mode;
|
|
183
|
+
#endif
|
|
180
184
|
HashSet *locks;
|
|
181
185
|
|
|
182
186
|
/**
|
|
@@ -554,6 +558,16 @@ extern void os_write_vint(OutStream *os, register unsigned int num);
|
|
|
554
558
|
*/
|
|
555
559
|
extern void os_write_voff_t(OutStream *os, register off_t num);
|
|
556
560
|
|
|
561
|
+
/**
|
|
562
|
+
* Write an unsigned long long to OutStream in compressed VINT format.
|
|
563
|
+
* TODO: describe VINT format
|
|
564
|
+
*
|
|
565
|
+
* @param os OutStream to write to
|
|
566
|
+
* @param num the long long to write
|
|
567
|
+
* @raise IO_ERROR if there is an error writing to the file-system
|
|
568
|
+
*/
|
|
569
|
+
extern void os_write_vll(OutStream *os, register unsigned long long num);
|
|
570
|
+
|
|
557
571
|
/**
|
|
558
572
|
* Write a string to the OutStream. A string is an integer +length+ in VINT
|
|
559
573
|
* format (see os_write_vint) followed by +length+ bytes. The string can then
|
|
@@ -694,6 +708,17 @@ extern INLINE void is_skip_vints(InStream *is, register int cnt);
|
|
|
694
708
|
*/
|
|
695
709
|
extern INLINE off_t is_read_voff_t(InStream *is);
|
|
696
710
|
|
|
711
|
+
/**
|
|
712
|
+
* Read a compressed (VINT) unsigned long long from the InStream.
|
|
713
|
+
* TODO: describe VINT format
|
|
714
|
+
*
|
|
715
|
+
* @param is the InStream to read from
|
|
716
|
+
* @return a long long
|
|
717
|
+
* @raise IO_ERROR if there is a error reading from the file-system
|
|
718
|
+
* @raise EOF_ERROR if there is an attempt to read past the end of the file
|
|
719
|
+
*/
|
|
720
|
+
extern INLINE unsigned long long is_read_vll(InStream *is);
|
|
721
|
+
|
|
697
722
|
/**
|
|
698
723
|
* Read a string from the InStream. A string is an integer +length+ in vint
|
|
699
724
|
* format (see is_read_vint) followed by +length+ bytes. This is the format
|
data/lib/ferret_version.rb
CHANGED
metadata
CHANGED
|
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
|
3
3
|
specification_version: 1
|
|
4
4
|
name: ferret
|
|
5
5
|
version: !ruby/object:Gem::Version
|
|
6
|
-
version: 0.11.5
|
|
7
|
-
date: 2007-11-
|
|
6
|
+
version: 0.11.6
|
|
7
|
+
date: 2007-11-29 00:00:00 +11:00
|
|
8
8
|
summary: Ruby indexing library.
|
|
9
9
|
require_paths:
|
|
10
10
|
- lib
|