ferret 0.11.5 → 0.11.6
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/analysis.c +12 -12
- data/ext/analysis.h +4 -4
- data/ext/index.c +11 -9
- data/ext/r_index.c +2 -2
- data/ext/store.c +49 -0
- data/ext/store.h +25 -0
- data/lib/ferret_version.rb +1 -1
- metadata +2 -2
data/ext/analysis.c
CHANGED
@@ -13,7 +13,7 @@
|
|
13
13
|
****************************************************************************/
|
14
14
|
|
15
15
|
INLINE Token *tk_set(Token *tk,
|
16
|
-
char *text, int tlen,
|
16
|
+
char *text, int tlen, off_t start, off_t end, int pos_inc)
|
17
17
|
{
|
18
18
|
if (tlen >= MAX_WORD_SIZE) {
|
19
19
|
tlen = MAX_WORD_SIZE - 1;
|
@@ -31,16 +31,16 @@ INLINE Token *tk_set_ts(Token *tk,
|
|
31
31
|
char *start, char *end, char *text, int pos_inc)
|
32
32
|
{
|
33
33
|
return tk_set(tk, start, (int)(end - start),
|
34
|
-
(
|
34
|
+
(off_t)(start - text), (off_t)(end - text), pos_inc);
|
35
35
|
}
|
36
36
|
|
37
37
|
INLINE Token *tk_set_no_len(Token *tk,
|
38
|
-
char *text,
|
38
|
+
char *text, off_t start, off_t end, int pos_inc)
|
39
39
|
{
|
40
40
|
return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
41
41
|
}
|
42
42
|
|
43
|
-
INLINE Token *w_tk_set(Token *tk, wchar_t *text,
|
43
|
+
INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start, off_t end,
|
44
44
|
int pos_inc)
|
45
45
|
{
|
46
46
|
int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
|
@@ -374,8 +374,8 @@ static Token *mb_wst_next_lc(TokenStream *ts)
|
|
374
374
|
}
|
375
375
|
*w = 0;
|
376
376
|
ts->t = t;
|
377
|
-
return w_tk_set(&(CTS(ts)->token), wbuf, (
|
378
|
-
(
|
377
|
+
return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
|
378
|
+
(off_t)(t - ts->text), 1);
|
379
379
|
}
|
380
380
|
|
381
381
|
TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
|
@@ -512,8 +512,8 @@ Token *mb_lt_next_lc(TokenStream *ts)
|
|
512
512
|
}
|
513
513
|
*w = 0;
|
514
514
|
ts->t = t;
|
515
|
-
return w_tk_set(&(CTS(ts)->token), wbuf, (
|
516
|
-
(
|
515
|
+
return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
|
516
|
+
(off_t)(t - ts->text), 1);
|
517
517
|
}
|
518
518
|
|
519
519
|
TokenStream *mb_letter_tokenizer_new(bool lowercase)
|
@@ -926,8 +926,8 @@ static Token *std_next(TokenStream *ts)
|
|
926
926
|
}
|
927
927
|
ts->t = t + len;
|
928
928
|
token[len] = 0;
|
929
|
-
return tk_set(&(CTS(ts)->token), token, len, (
|
930
|
-
(
|
929
|
+
return tk_set(&(CTS(ts)->token), token, len, (off_t)(start - ts->text),
|
930
|
+
(off_t)(ts->t - ts->text), 1);
|
931
931
|
}
|
932
932
|
|
933
933
|
/* now see how long a url we can find. */
|
@@ -974,8 +974,8 @@ static Token *std_next(TokenStream *ts)
|
|
974
974
|
}
|
975
975
|
}
|
976
976
|
tk_set(&(CTS(ts)->token), token, token_i,
|
977
|
-
(
|
978
|
-
(
|
977
|
+
(off_t)(start - ts->text),
|
978
|
+
(off_t)(t - ts->text), 1);
|
979
979
|
}
|
980
980
|
else { /* just return the url as is */
|
981
981
|
tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
data/ext/analysis.h
CHANGED
@@ -16,16 +16,16 @@ typedef struct Token
|
|
16
16
|
{
|
17
17
|
char text[MAX_WORD_SIZE];
|
18
18
|
int len;
|
19
|
-
|
20
|
-
|
19
|
+
off_t start;
|
20
|
+
off_t end;
|
21
21
|
int pos_inc;
|
22
22
|
} Token;
|
23
23
|
|
24
24
|
extern Token *tk_new();
|
25
25
|
extern void tk_destroy(void *p);
|
26
|
-
extern Token *tk_set(Token *tk, char *text, int tlen,
|
26
|
+
extern Token *tk_set(Token *tk, char *text, int tlen, off_t start, off_t end,
|
27
27
|
int pos_inc);
|
28
|
-
extern Token *tk_set_no_len(Token *tk, char *text,
|
28
|
+
extern Token *tk_set_no_len(Token *tk, char *text, off_t start, off_t end,
|
29
29
|
int pos_inc);
|
30
30
|
extern int tk_eq(Token *tk1, Token *tk2);
|
31
31
|
extern int tk_cmp(Token *tk1, Token *tk2);
|
data/ext/index.c
CHANGED
@@ -1450,10 +1450,12 @@ TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
|
|
1450
1450
|
if (store_offsets) {
|
1451
1451
|
int num_positions = tv->offset_cnt = is_read_vint(fdt_in);
|
1452
1452
|
Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
|
1453
|
-
|
1453
|
+
long long offset = 0;
|
1454
1454
|
for (i = 0; i < num_positions; i++) {
|
1455
|
-
offsets[i].start =
|
1456
|
-
|
1455
|
+
offsets[i].start =
|
1456
|
+
(off_t)(offset += (long long)is_read_vll(fdt_in));
|
1457
|
+
offsets[i].end =
|
1458
|
+
(off_t)(offset += (long long)is_read_vll(fdt_in));
|
1457
1459
|
}
|
1458
1460
|
}
|
1459
1461
|
}
|
@@ -1681,13 +1683,13 @@ void fw_add_postings(FieldsWriter *fw,
|
|
1681
1683
|
|
1682
1684
|
if (fi_store_offsets(fi)) {
|
1683
1685
|
/* use delta encoding for offsets */
|
1684
|
-
|
1686
|
+
long long last_end = 0;
|
1685
1687
|
os_write_vint(fdt_out, offset_count); /* write the number of offsets */
|
1686
1688
|
for (i = 0; i < offset_count; i++) {
|
1687
|
-
|
1688
|
-
|
1689
|
-
|
1690
|
-
|
1689
|
+
long long start = (long long)offsets[i].start;
|
1690
|
+
long long end = (long long)offsets[i].end;
|
1691
|
+
os_write_vll(fdt_out, (unsigned long long)(start - last_end));
|
1692
|
+
os_write_vll(fdt_out, (unsigned long long)(end - start));
|
1691
1693
|
last_end = end;
|
1692
1694
|
}
|
1693
1695
|
}
|
@@ -5197,7 +5199,7 @@ static void dw_add_posting(MemoryPool *mp,
|
|
5197
5199
|
}
|
5198
5200
|
}
|
5199
5201
|
|
5200
|
-
static INLINE void dw_add_offsets(DocWriter *dw, int pos,
|
5202
|
+
static INLINE void dw_add_offsets(DocWriter *dw, int pos, off_t start, off_t end)
|
5201
5203
|
{
|
5202
5204
|
if (pos >= dw->offsets_capa) {
|
5203
5205
|
int old_capa = dw->offsets_capa;
|
data/ext/r_index.c
CHANGED
@@ -1192,8 +1192,8 @@ static VALUE
|
|
1192
1192
|
frt_get_tv_offsets(Offset *offset)
|
1193
1193
|
{
|
1194
1194
|
return rb_struct_new(cTVOffsets,
|
1195
|
-
|
1196
|
-
|
1195
|
+
ULL2NUM((unsigned long long)offset->start),
|
1196
|
+
ULL2NUM((unsigned long long)offset->end),
|
1197
1197
|
NULL);
|
1198
1198
|
}
|
1199
1199
|
|
data/ext/store.c
CHANGED
@@ -403,6 +403,36 @@ INLINE off_t is_read_voff_t(InStream *is)
|
|
403
403
|
return res;
|
404
404
|
}
|
405
405
|
|
406
|
+
/* optimized to use unchecked read_byte if there is definitely space */
|
407
|
+
INLINE unsigned long long is_read_vll(InStream *is)
|
408
|
+
{
|
409
|
+
register unsigned long long res, b;
|
410
|
+
register int shift = 7;
|
411
|
+
|
412
|
+
if (is->buf.pos > (is->buf.len - VINT_MAX_LEN)) {
|
413
|
+
b = is_read_byte(is);
|
414
|
+
res = b & 0x7F; /* 0x7F = 0b01111111 */
|
415
|
+
|
416
|
+
while ((b & 0x80) != 0) { /* 0x80 = 0b10000000 */
|
417
|
+
b = is_read_byte(is);
|
418
|
+
res |= (b & 0x7F) << shift;
|
419
|
+
shift += 7;
|
420
|
+
}
|
421
|
+
}
|
422
|
+
else { /* unchecked optimization */
|
423
|
+
b = read_byte(is);
|
424
|
+
res = b & 0x7F; /* 0x7F = 0b01111111 */
|
425
|
+
|
426
|
+
while ((b & 0x80) != 0) { /* 0x80 = 0b10000000 */
|
427
|
+
b = read_byte(is);
|
428
|
+
res |= (b & 0x7F) << shift;
|
429
|
+
shift += 7;
|
430
|
+
}
|
431
|
+
}
|
432
|
+
|
433
|
+
return res;
|
434
|
+
}
|
435
|
+
|
406
436
|
INLINE void is_skip_vints(InStream *is, register int cnt)
|
407
437
|
{
|
408
438
|
for (; cnt > 0; cnt--) {
|
@@ -545,6 +575,25 @@ INLINE void os_write_voff_t(OutStream *os, register off_t num)
|
|
545
575
|
}
|
546
576
|
}
|
547
577
|
|
578
|
+
/* optimized to use an unchecked write if there is space */
|
579
|
+
INLINE void os_write_vll(OutStream *os, register unsigned long long num)
|
580
|
+
{
|
581
|
+
if (os->buf.pos > VINT_END) {
|
582
|
+
while (num > 127) {
|
583
|
+
os_write_byte(os, (uchar)((num & 0x7f) | 0x80));
|
584
|
+
num >>= 7;
|
585
|
+
}
|
586
|
+
os_write_byte(os, (uchar)num);
|
587
|
+
}
|
588
|
+
else {
|
589
|
+
while (num > 127) {
|
590
|
+
write_byte(os, (uchar)((num & 0x7f) | 0x80));
|
591
|
+
num >>= 7;
|
592
|
+
}
|
593
|
+
write_byte(os, (uchar)num);
|
594
|
+
}
|
595
|
+
}
|
596
|
+
|
548
597
|
void os_write_string(OutStream *os, char *str)
|
549
598
|
{
|
550
599
|
int len = (int)strlen(str);
|
data/ext/store.h
CHANGED
@@ -176,7 +176,11 @@ struct Store
|
|
176
176
|
CompoundStore *cmpd; /* for compound_store only */
|
177
177
|
} dir;
|
178
178
|
|
179
|
+
#ifdef POSH_OS_WIN32
|
180
|
+
int file_mode;
|
181
|
+
#else
|
179
182
|
mode_t file_mode;
|
183
|
+
#endif
|
180
184
|
HashSet *locks;
|
181
185
|
|
182
186
|
/**
|
@@ -554,6 +558,16 @@ extern void os_write_vint(OutStream *os, register unsigned int num);
|
|
554
558
|
*/
|
555
559
|
extern void os_write_voff_t(OutStream *os, register off_t num);
|
556
560
|
|
561
|
+
/**
|
562
|
+
* Write an unsigned long long to OutStream in compressed VINT format.
|
563
|
+
* VINT format: 7 payload bits per byte, least-significant group first; bit 0x80 is set on every byte except the last.
|
564
|
+
*
|
565
|
+
* @param os OutStream to write to
|
566
|
+
* @param num the unsigned long long to write
|
567
|
+
* @raise IO_ERROR if there is an error writing to the file-system
|
568
|
+
*/
|
569
|
+
extern void os_write_vll(OutStream *os, register unsigned long long num);
|
570
|
+
|
557
571
|
/**
|
558
572
|
* Write a string to the OutStream. A string is an integer +length+ in VINT
|
559
573
|
* format (see os_write_vint) followed by +length+ bytes. The string can then
|
@@ -694,6 +708,17 @@ extern INLINE void is_skip_vints(InStream *is, register int cnt);
|
|
694
708
|
*/
|
695
709
|
extern INLINE off_t is_read_voff_t(InStream *is);
|
696
710
|
|
711
|
+
/**
|
712
|
+
* Read a compressed (VINT) unsigned long long from the InStream.
|
713
|
+
* VINT format: 7 payload bits per byte, least-significant group first; bit 0x80 is set on every byte except the last.
|
714
|
+
*
|
715
|
+
* @param is the InStream to read from
|
716
|
+
* @return the decoded unsigned long long
|
717
|
+
* @raise IO_ERROR if there is an error reading from the file-system
|
718
|
+
* @raise EOF_ERROR if there is an attempt to read past the end of the file
|
719
|
+
*/
|
720
|
+
extern INLINE unsigned long long is_read_vll(InStream *is);
|
721
|
+
|
697
722
|
/**
|
698
723
|
* Read a string from the InStream. A string is an integer +length+ in vint
|
699
724
|
* format (see is_read_vint) followed by +length+ bytes. This is the format
|
data/lib/ferret_version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ferret
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.11.5
|
7
|
-
date: 2007-11-
|
6
|
+
version: 0.11.6
|
7
|
+
date: 2007-11-29 00:00:00 +11:00
|
8
8
|
summary: Ruby indexing library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|