isomorfeus-ferret 0.13.0 → 0.13.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +37 -17
- data/ext/isomorfeus_ferret_ext/{bzip_blocksort.c → bzlib_blocksort.c} +0 -0
- data/ext/isomorfeus_ferret_ext/{bzip_huffman.c → bzlib_huffman.c} +0 -0
- data/ext/isomorfeus_ferret_ext/frb_index.c +15 -36
- data/ext/isomorfeus_ferret_ext/frt_global.c +0 -151
- data/ext/isomorfeus_ferret_ext/frt_global.h +0 -15
- data/ext/isomorfeus_ferret_ext/frt_index.c +23 -11
- data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1 -0
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +1 -3
- data/ext/isomorfeus_ferret_ext/test.c +0 -16
- data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
- data/ext/isomorfeus_ferret_ext/test_search.c +0 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 958c051fa7f771e25ee1fb29c47aad9ab1cd7a570750f73450ae582474ef6fb5
|
4
|
+
data.tar.gz: beba6b22ba7493f324be38997fee4f4a56efc896281fcbb2f3521ebef6795029
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98bef5c79e3d4e8854d6a4cb68e2cb796c6f27bbaf1c7b7669ab895e8656d12d033651f8d8b29ac8d7504b46dba7567aa52f2ac297285ff2c6c62e52c48d0019
|
7
|
+
data.tar.gz: 165497f356ab32cdc6250b3eada43d9ec36d61ee992114bffa9c38372c7894455caeb95aa789cf025c2538c7f9e1eddafe87e66b58536424a4c8e116a123af88
|
data/README.md
CHANGED
@@ -26,7 +26,7 @@ It should work on *nixes, *nuxes, *BSDs and also works on Windows.
|
|
26
26
|
- The :store option no longer accepts :compress, compression must now be specified by the separate :compress options (see below).
|
27
27
|
- The ASCII-specific Tokenizers and Analyzers have been removed
|
28
28
|
|
29
|
-
###
|
29
|
+
### String Encoding support
|
30
30
|
|
31
31
|
#### Input strings and stored fields
|
32
32
|
|
@@ -37,12 +37,13 @@ All Ruby string encodings are supported.
|
|
37
37
|
When fields are stored, they are now stored with the encoding, so that when they are retrieved again, they
|
38
38
|
retain the original encoding with positions matching the string in its original encoding.
|
39
39
|
|
40
|
-
#### Tokens and
|
40
|
+
#### Tokens, Terms, Filters and Queries
|
41
41
|
|
42
42
|
Tokens are internally converted to UTF-8, which may change their length compared to their original encoding,
|
43
|
-
yet they retain position information according to the source in its original encoding.
|
44
|
-
|
45
|
-
|
43
|
+
yet they retain position information according to the source in its original encoding. Terms are likewise stored in UTF-8 encoding.
|
44
|
+
Queries are converted to UTF-8 encoding too.
|
45
|
+
The benefit is that Filters, Stemmers or anything else working with Tokens and Terms only needs to support UTF-8 encoding,
|
46
|
+
greatly simplifying things and ensuring consistent query results, independent of source encoding.
|
46
47
|
|
47
48
|
### Compression
|
48
49
|
|
@@ -50,6 +51,7 @@ Compression semantics have changed, now Brotli, BZip2 and LZ4 compression codecs
|
|
50
51
|
- BZip2: slow compression, slow decompression, high compression ratio
|
51
52
|
- Brotli: slow compression, fast decompression, high compression ratio, recommended for general purpose.
|
52
53
|
- LZ4: fast compression, fast decompression, low compression ratio
|
54
|
+
|
53
55
|
To see performance and compression ratios `rake ferret_compression_bench` can be run from the cloned repo.
|
54
56
|
It uses data and code within the misc/ferret_vs_lucene directory.
|
55
57
|
|
@@ -96,6 +98,7 @@ Ensure your locale is set to C.UTF-8, because the internal c tests don't know ho
|
|
96
98
|
|
97
99
|
## Benchmarks
|
98
100
|
|
101
|
+
### Indexing and Searching
|
99
102
|
- clone repo
|
100
103
|
- bundle install
|
101
104
|
- rake ferret_vs_lucene
|
@@ -104,20 +107,37 @@ A recent Java JDK must be installed to compile and run lucene benchmarks.
|
|
104
107
|
|
105
108
|
Results on Linux:
|
106
109
|
```
|
107
|
-
Ferret:
|
108
|
-
Indexing
|
109
|
-
Searching took: 0.
|
110
|
-
thats
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
110
|
+
Ferret 0.13.0:
|
111
|
+
Indexing: 9.35 secs, Docs: 19043, 2035 docs/s
|
112
|
+
Searching took: 0.3133133s for 8000 queries
|
113
|
+
thats 25533 q/s
|
114
|
+
Total found: 42000
|
115
|
+
Index size: 28Mb
|
116
|
+
|
117
|
+
Lucene 9.1.0:
|
118
|
+
Indexing: 4.20 secs, Docs: 19043, 4538 docs/s
|
119
|
+
Searching took: 1.64s for 8000 queries
|
120
|
+
thats 4875 q/s
|
121
|
+
Total found: 41000
|
122
|
+
index size: 35Mb
|
123
|
+
|
124
|
+
JVM 11.0.14.1 (Ubuntu)
|
119
125
|
```
|
120
126
|
|
127
|
+
### Storing Fields with Compression, Indexing and Retrieval
|
128
|
+
- clone repo
|
129
|
+
- bundle install
|
130
|
+
- rake ferret_compression_benchmark
|
131
|
+
|
132
|
+
Results on Linux, 0.13.0:
|
133
|
+
|
134
|
+
| Compression | Index & Store | Retrieve | Index size |
|
135
|
+
|-------------|---------------|---------------|------------|
|
136
|
+
| none | 2008 docs/s | 153853 docs/s | 43 MB |
|
137
|
+
| brotli | 1726 docs/s | 58315 docs/s | 36 MB |
|
138
|
+
| bzip2 | 1438 docs/s | 15382 docs/s | 38 MB |
|
139
|
+
| lz4 | 1932 docs/s | 127100 docs/s | 41 MB |
|
140
|
+
|
121
141
|
## Future
|
122
142
|
|
123
143
|
Lots of things to do:
|
File without changes
|
File without changes
|
@@ -64,6 +64,7 @@ static ID id_fld_num_map;
|
|
64
64
|
static ID id_field_num;
|
65
65
|
static ID id_boost;
|
66
66
|
|
67
|
+
extern rb_encoding *utf8_encoding;
|
67
68
|
extern void frb_set_term(VALUE rterm, FrtTerm *t);
|
68
69
|
extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
|
69
70
|
extern VALUE frb_get_analyzer(FrtAnalyzer *a);
|
@@ -181,8 +182,9 @@ static VALUE frb_get_field_info(FrtFieldInfo *fi) {
|
|
181
182
|
fi->rfi = TypedData_Wrap_Struct(cFieldInfo, &frb_field_info_t, fi);
|
182
183
|
FRT_REF(fi);
|
183
184
|
}
|
185
|
+
return fi->rfi;
|
184
186
|
}
|
185
|
-
return
|
187
|
+
return Qnil;
|
186
188
|
}
|
187
189
|
|
188
190
|
/*
|
@@ -411,8 +413,9 @@ static VALUE frb_get_field_infos(FrtFieldInfos *fis) {
|
|
411
413
|
fis->rfis = TypedData_Wrap_Struct(cFieldInfos, &frb_field_infos_t, fis);
|
412
414
|
FRT_REF(fis);
|
413
415
|
}
|
416
|
+
return fis->rfis;
|
414
417
|
}
|
415
|
-
return
|
418
|
+
return Qnil;
|
416
419
|
}
|
417
420
|
|
418
421
|
/*
|
@@ -496,11 +499,6 @@ static VALUE frb_fis_get(VALUE self, VALUE ridx) {
|
|
496
499
|
case T_STRING:
|
497
500
|
rfi = frb_get_field_info(frt_fis_get_field(fis, frb_field(ridx)));
|
498
501
|
break;
|
499
|
-
/*
|
500
|
-
case T_STRING:
|
501
|
-
rfi = frb_get_field_info(frt_fis_get_field(fis, StringValuePtr(ridx)));
|
502
|
-
break;
|
503
|
-
*/
|
504
502
|
default:
|
505
503
|
rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
|
506
504
|
rs2s(rb_obj_as_string(ridx)));
|
@@ -1219,6 +1217,7 @@ static VALUE frb_get_tv_term(FrtTVTerm *tv_term) {
|
|
1219
1217
|
VALUE rtext;
|
1220
1218
|
VALUE rpositions = Qnil;
|
1221
1219
|
rtext = rb_str_new2(tv_term->text);
|
1220
|
+
rb_enc_associate(rtext, utf8_encoding);
|
1222
1221
|
if (tv_term->positions) {
|
1223
1222
|
int *positions = tv_term->positions;
|
1224
1223
|
rpositions = rb_ary_new2(freq);
|
@@ -1697,10 +1696,9 @@ frb_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
|
|
1697
1696
|
* Get the FieldInfos object for this FrtIndexWriter. This is useful if you need
|
1698
1697
|
* to dynamically add new fields to the index with specific properties.
|
1699
1698
|
*/
|
1700
|
-
static VALUE
|
1701
|
-
|
1702
|
-
|
1703
|
-
FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
|
1699
|
+
static VALUE frb_iw_field_infos(VALUE self) {
|
1700
|
+
FrtIndexWriter *iw;
|
1701
|
+
TypedData_Get_Struct(self, FrtIndexWriter, &frb_index_writer_t, iw);
|
1704
1702
|
return frb_get_field_infos(iw->fis);
|
1705
1703
|
}
|
1706
1704
|
|
@@ -2715,10 +2713,9 @@ frb_ir_fields(VALUE self)
|
|
2715
2713
|
*
|
2716
2714
|
* Get the FieldInfos object for this IndexReader.
|
2717
2715
|
*/
|
2718
|
-
static VALUE
|
2719
|
-
|
2720
|
-
|
2721
|
-
FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
|
2716
|
+
static VALUE frb_ir_field_infos(VALUE self) {
|
2717
|
+
FrtIndexReader *ir;
|
2718
|
+
TypedData_Get_Struct(self, FrtIndexReader, &frb_index_reader_t, ir);
|
2722
2719
|
return frb_get_field_infos(ir->fis);
|
2723
2720
|
}
|
2724
2721
|
|
@@ -3085,10 +3082,6 @@ static void Init_TermDocEnum(void) {
|
|
3085
3082
|
rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
|
3086
3083
|
}
|
3087
3084
|
|
3088
|
-
/* rdochack
|
3089
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3090
|
-
*/
|
3091
|
-
|
3092
3085
|
/*
|
3093
3086
|
* Document-class: Ferret::Index::TermVector::TVOffsets
|
3094
3087
|
*
|
@@ -3107,9 +3100,6 @@ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
|
3107
3100
|
*/
|
3108
3101
|
static void Init_TVOffsets(void) {
|
3109
3102
|
const char *tv_offsets_class = "TVOffsets";
|
3110
|
-
/* rdochack
|
3111
|
-
cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
|
3112
|
-
*/
|
3113
3103
|
cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
|
3114
3104
|
rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
|
3115
3105
|
rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
|
@@ -3130,13 +3120,8 @@ static void Init_TVOffsets(void) {
|
|
3130
3120
|
* tv_term = tv.find {|tvt| tvt.term = "fox"}
|
3131
3121
|
* offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
|
3132
3122
|
*/
|
3133
|
-
static void
|
3134
|
-
Init_TVTerm(void)
|
3135
|
-
{
|
3123
|
+
static void Init_TVTerm(void) {
|
3136
3124
|
const char *tv_term_class = "TVTerm";
|
3137
|
-
/* rdochack
|
3138
|
-
cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
|
3139
|
-
*/
|
3140
3125
|
cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
|
3141
3126
|
rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
|
3142
3127
|
rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
|
@@ -3172,15 +3157,9 @@ Init_TVTerm(void)
|
|
3172
3157
|
* particular that you need to store both positions and offsets if you want
|
3173
3158
|
* to associate offsets with particular terms.
|
3174
3159
|
*/
|
3175
|
-
static void
|
3176
|
-
Init_TermVector(void)
|
3177
|
-
{
|
3160
|
+
static void Init_TermVector(void) {
|
3178
3161
|
const char *tv_class = "TermVector";
|
3179
|
-
|
3180
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3181
|
-
*/
|
3182
|
-
cTermVector = rb_struct_define(tv_class,
|
3183
|
-
"field", "terms", "offsets", NULL);
|
3162
|
+
cTermVector = rb_struct_define(tv_class, "field", "terms", "offsets", NULL);
|
3184
3163
|
rb_set_class_path(cTermVector, mIndex, tv_class);
|
3185
3164
|
rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
|
3186
3165
|
|
@@ -232,71 +232,6 @@ void frt_dummy_free(void *p) {
|
|
232
232
|
(void)p; /* suppress unused argument warning */
|
233
233
|
}
|
234
234
|
|
235
|
-
#ifdef HAVE_GDB
|
236
|
-
#define CMD_BUF_SIZE (128 + FILENAME_MAX)
|
237
|
-
/* need to declare this as it is masked by default in linux */
|
238
|
-
|
239
|
-
static char *build_shell_command(void) {
|
240
|
-
int pid = getpid();
|
241
|
-
char *buf = FRT_ALLOC_N(char, CMD_BUF_SIZE);
|
242
|
-
char *command =
|
243
|
-
"gdb -quiet -ex='bt' -ex='quit' %s %d 2>/dev/null | grep '^[ #]'";
|
244
|
-
|
245
|
-
snprintf(buf, CMD_BUF_SIZE, command, frt_progname(), pid);
|
246
|
-
return buf;
|
247
|
-
}
|
248
|
-
|
249
|
-
#endif
|
250
|
-
|
251
|
-
/**
|
252
|
-
* Call out to gdb to get our stacktrace.
|
253
|
-
*/
|
254
|
-
char *frt_get_stacktrace(void) {
|
255
|
-
#ifdef HAVE_GDB
|
256
|
-
FILE *stream;
|
257
|
-
char *gdb_filename = NULL, *buf = NULL, *stack = NULL;
|
258
|
-
int offset = -FRT_BUFFER_SIZE;
|
259
|
-
|
260
|
-
if ( !(buf = build_shell_command()) ) {
|
261
|
-
fprintf(EXCEPTION_STREAM,
|
262
|
-
"Unable to build stacktrace shell command\n");
|
263
|
-
goto cleanup;
|
264
|
-
}
|
265
|
-
|
266
|
-
if ( !(stream = popen(buf, "r")) ) {
|
267
|
-
fprintf(EXCEPTION_STREAM,
|
268
|
-
"Unable to exec stacktrace shell command: '%s'\n", buf);
|
269
|
-
goto cleanup;
|
270
|
-
}
|
271
|
-
|
272
|
-
do {
|
273
|
-
offset += FRT_BUFFER_SIZE;
|
274
|
-
FRT_REALLOC_N(stack, char, offset + FRT_BUFFER_SIZE);
|
275
|
-
FRT_ZEROSET_N(stack + offset, char, FRT_BUFFER_SIZE);
|
276
|
-
} while(fread(stack + offset, 1, FRT_BUFFER_SIZE, stream) == FRT_BUFFER_SIZE);
|
277
|
-
|
278
|
-
pclose(stream);
|
279
|
-
|
280
|
-
cleanup:
|
281
|
-
if (gdb_filename) free(gdb_filename);
|
282
|
-
if (buf) free(buf);
|
283
|
-
return stack;
|
284
|
-
#else
|
285
|
-
return NULL;
|
286
|
-
#endif
|
287
|
-
}
|
288
|
-
|
289
|
-
void frt_print_stacktrace(void) {
|
290
|
-
char *stack = frt_get_stacktrace();
|
291
|
-
|
292
|
-
if (stack) {
|
293
|
-
fprintf(EXCEPTION_STREAM, "Stack trace:\n%s", stack);
|
294
|
-
free(stack);
|
295
|
-
} else {
|
296
|
-
fprintf(EXCEPTION_STREAM, "Stack trace not available\n");
|
297
|
-
}
|
298
|
-
}
|
299
|
-
|
300
235
|
typedef struct FreeMe {
|
301
236
|
void *p;
|
302
237
|
frt_free_ft free_func;
|
@@ -321,55 +256,7 @@ void frt_register_for_cleanup(void *p, frt_free_ft free_func) {
|
|
321
256
|
free_me->free_func = free_func;
|
322
257
|
}
|
323
258
|
|
324
|
-
#define MAX_PROG_NAME 200
|
325
|
-
static char name[MAX_PROG_NAME]; /* program name for error msgs */
|
326
|
-
|
327
|
-
/* frt_setprogname: set stored name of program */
|
328
|
-
void frt_setprogname(const char *str) {
|
329
|
-
strncpy(name, str, sizeof(name) - 1);
|
330
|
-
}
|
331
|
-
|
332
|
-
const char *frt_progname(void) {
|
333
|
-
return name;
|
334
|
-
}
|
335
|
-
|
336
|
-
static const char *signal_to_string(int signum) {
|
337
|
-
switch (signum)
|
338
|
-
{
|
339
|
-
case SIGILL: return "SIGILL";
|
340
|
-
case SIGABRT: return "SIGABRT";
|
341
|
-
case SIGFPE: return "SIGFPE";
|
342
|
-
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
|
343
|
-
case SIGBUS: return "SIGBUS";
|
344
|
-
#endif
|
345
|
-
case SIGSEGV: return "SIGSEGV";
|
346
|
-
}
|
347
|
-
|
348
|
-
return "Unknown Signal";
|
349
|
-
}
|
350
|
-
|
351
|
-
static void sighandler_crash(int signum) {
|
352
|
-
frt_print_stacktrace();
|
353
|
-
FRT_XEXIT("Signal", "Exiting on signal %s (%d)", signal_to_string(signum), signum);
|
354
|
-
}
|
355
|
-
|
356
|
-
#define SETSIG_IF_UNSET(sig, handler) do { \
|
357
|
-
signal(sig, handler); \
|
358
|
-
} while(0)
|
359
|
-
|
360
259
|
void frt_init(int argc, const char *const argv[]) {
|
361
|
-
if (argc > 0) {
|
362
|
-
frt_setprogname(argv[0]);
|
363
|
-
}
|
364
|
-
|
365
|
-
SETSIG_IF_UNSET(SIGILL , sighandler_crash);
|
366
|
-
SETSIG_IF_UNSET(SIGABRT, sighandler_crash);
|
367
|
-
SETSIG_IF_UNSET(SIGFPE , sighandler_crash);
|
368
|
-
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
|
369
|
-
SETSIG_IF_UNSET(SIGBUS , sighandler_crash);
|
370
|
-
#endif
|
371
|
-
SETSIG_IF_UNSET(SIGSEGV, sighandler_crash);
|
372
|
-
|
373
260
|
atexit(&frt_hash_finalize);
|
374
261
|
|
375
262
|
utf8_encoding = rb_enc_find("UTF-8");
|
@@ -429,41 +316,3 @@ void frt_init(int argc, const char *const argv[]) {
|
|
429
316
|
FRT_SORT_FIELD_DOC_REV->compare = frt_sort_field_doc_compare; /* compare */
|
430
317
|
FRT_SORT_FIELD_DOC_REV->get_val = frt_sort_field_doc_get_val; /* get_val */
|
431
318
|
}
|
432
|
-
|
433
|
-
/**
|
434
|
-
* For general use when testing
|
435
|
-
*
|
436
|
-
* TODO wrap in #ifdef
|
437
|
-
*/
|
438
|
-
|
439
|
-
static bool p_switch = false;
|
440
|
-
static bool p_switch_tmp = false;
|
441
|
-
|
442
|
-
void p(const char *format, ...) {
|
443
|
-
va_list args;
|
444
|
-
|
445
|
-
if (!p_switch) return;
|
446
|
-
|
447
|
-
va_start(args, format);
|
448
|
-
vfprintf(stderr, format, args);
|
449
|
-
va_end(args);
|
450
|
-
}
|
451
|
-
|
452
|
-
void p_on(void) {
|
453
|
-
fprintf(stderr, "> > > > > STARTING PRINT\n");
|
454
|
-
p_switch = true;
|
455
|
-
}
|
456
|
-
|
457
|
-
void p_off(void) {
|
458
|
-
fprintf(stderr, "< < < < < STOPPING PRINT\n");
|
459
|
-
p_switch = false;
|
460
|
-
}
|
461
|
-
|
462
|
-
void frt_p_pause(void) {
|
463
|
-
p_switch_tmp = p_switch;
|
464
|
-
p_switch = false;
|
465
|
-
}
|
466
|
-
|
467
|
-
void frt_p_resume(void) {
|
468
|
-
p_switch = p_switch_tmp;
|
469
|
-
}
|
@@ -105,9 +105,6 @@ extern char *frt_dbl_to_s(char *buf, double num);
|
|
105
105
|
extern char *frt_strfmt(const char *fmt, ...);
|
106
106
|
extern char *frt_vstrfmt(const char *fmt, va_list args);
|
107
107
|
|
108
|
-
extern char *frt_get_stacktrace();
|
109
|
-
extern void frt_print_stacktrace();
|
110
|
-
|
111
108
|
extern void frt_register_for_cleanup(void *p, frt_free_ft free_func);
|
112
109
|
|
113
110
|
/**
|
@@ -277,18 +274,6 @@ extern bool frt_x_do_logging;
|
|
277
274
|
#endif
|
278
275
|
|
279
276
|
extern void frt_init(int arc, const char *const argv[]);
|
280
|
-
extern void frt_setprogname(const char *str);
|
281
|
-
extern const char *frt_progname();
|
282
277
|
extern void frt_micro_sleep(const int micro_seconds);
|
283
278
|
|
284
|
-
/**
|
285
|
-
* For general use during testing. Switch this on and off for print statements
|
286
|
-
* to only print when p_on is called and not after p_off is called
|
287
|
-
*/
|
288
|
-
extern void p(const char *format, ...);
|
289
|
-
extern void p_on();
|
290
|
-
extern void p_off();
|
291
|
-
extern void frt_p_pause();
|
292
|
-
extern void frt_p_resume();
|
293
|
-
|
294
279
|
#endif
|
@@ -14,6 +14,7 @@
|
|
14
14
|
#undef close
|
15
15
|
#undef read
|
16
16
|
|
17
|
+
extern rb_encoding *utf8_encoding;
|
17
18
|
extern void frt_micro_sleep(const int micro_seconds);
|
18
19
|
|
19
20
|
#define GET_LOCK(lock, name, store, err_msg) do {\
|
@@ -1710,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
|
|
1710
1711
|
total_len = delta_start + delta_len;
|
1711
1712
|
frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
|
1712
1713
|
buffer[total_len++] = '\0';
|
1713
|
-
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
|
1714
|
-
buffer, total_len);
|
1714
|
+
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
|
1715
1715
|
|
1716
1716
|
/* read freq */
|
1717
1717
|
freq = term->freq = frt_is_read_vint(fdt_in);
|
@@ -1822,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
|
|
1822
1822
|
*
|
1823
1823
|
****************************************************************************/
|
1824
1824
|
|
1825
|
-
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
|
1826
|
-
{
|
1825
|
+
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
|
1827
1826
|
FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
|
1828
1827
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
1829
1828
|
size_t segment_len = strlen(segment);
|
@@ -1844,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
|
|
1844
1843
|
return fw;
|
1845
1844
|
}
|
1846
1845
|
|
1847
|
-
void frt_fw_close(FrtFieldsWriter *fw)
|
1848
|
-
{
|
1846
|
+
void frt_fw_close(FrtFieldsWriter *fw) {
|
1849
1847
|
frt_os_close(fw->fdt_out);
|
1850
1848
|
frt_os_close(fw->fdx_out);
|
1851
1849
|
frt_ram_destroy_buffer(fw->buffer);
|
@@ -2046,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
|
|
2046
2044
|
frt_ramo_write_to(fw->buffer, fdt_out);
|
2047
2045
|
}
|
2048
2046
|
|
2049
|
-
void frt_fw_write_tv_index(FrtFieldsWriter *fw)
|
2050
|
-
{
|
2047
|
+
void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
|
2051
2048
|
int i;
|
2052
2049
|
const int tv_cnt = frt_ary_size(fw->tv_fields);
|
2053
2050
|
FrtOutStream *fdt_out = fw->fdt_out;
|
@@ -5548,9 +5545,24 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
|
|
5548
5545
|
for (i = 0; i < df_size; i++) {
|
5549
5546
|
int len = df->lengths[i];
|
5550
5547
|
char *data_ptr = df->data[i];
|
5551
|
-
if (
|
5552
|
-
len
|
5553
|
-
|
5548
|
+
if (df->encodings[i] == utf8_encoding) {
|
5549
|
+
if (len >= FRT_MAX_WORD_SIZE) {
|
5550
|
+
len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
|
5551
|
+
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5552
|
+
buf[len] = '\0';
|
5553
|
+
}
|
5554
|
+
} else if (df->encodings[i] != utf8_encoding) {
|
5555
|
+
if (len >= FRT_MAX_WORD_SIZE)
|
5556
|
+
len = FRT_MAX_WORD_SIZE - 1;
|
5557
|
+
const unsigned char *sp = (unsigned char *)df->data[i];
|
5558
|
+
unsigned char *dp = (unsigned char *)&buf;
|
5559
|
+
rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
|
5560
|
+
assert(ec != NULL);
|
5561
|
+
rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
|
5562
|
+
rb_econv_close(ec);
|
5563
|
+
len = dp - (unsigned char *)&buf;
|
5564
|
+
buf[len] = '\0';
|
5565
|
+
data_ptr = buf;
|
5554
5566
|
}
|
5555
5567
|
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
|
5556
5568
|
if (store_offsets) {
|
@@ -61,7 +61,6 @@ void FRT_VEXIT(const char *err_type, const char *fmt, va_list args)
|
|
61
61
|
# endif
|
62
62
|
{
|
63
63
|
fflush(stdout);
|
64
|
-
fprintf(EXCEPTION_STREAM, "\n%s: ", frt_progname());
|
65
64
|
|
66
65
|
# ifdef FRT_HAS_VARARGS
|
67
66
|
fprintf(EXCEPTION_STREAM, "%s occurred at <%s>:%d in %s\n",
|
@@ -76,7 +75,6 @@ void FRT_VEXIT(const char *err_type, const char *fmt, va_list args)
|
|
76
75
|
}
|
77
76
|
|
78
77
|
fprintf(EXCEPTION_STREAM, "\n");
|
79
|
-
frt_print_stacktrace();
|
80
78
|
if (frt_x_abort_on_exception) {
|
81
79
|
exit(2); /* conventional value for failed execution */
|
82
80
|
}
|
@@ -1286,6 +1286,7 @@ FrtBooleanClause *frt_bc_alloc(void) {
|
|
1286
1286
|
FrtBooleanClause *frt_bc_init(FrtBooleanClause *self, FrtQuery *query, FrtBCType occur) {
|
1287
1287
|
self->ref_cnt = 1;
|
1288
1288
|
self->query = query;
|
1289
|
+
self->rbc = Qnil;
|
1289
1290
|
frt_bc_set_occur(self, occur);
|
1290
1291
|
return self;
|
1291
1292
|
}
|
@@ -279,22 +279,10 @@ static void append_to_msg_buf(const char *fmt, ...)
|
|
279
279
|
va_end(args);
|
280
280
|
}
|
281
281
|
|
282
|
-
|
283
|
-
static void Tstack(void) {
|
284
|
-
if (show_stack) {
|
285
|
-
char *stack = frt_get_stacktrace();
|
286
|
-
if (stack) {
|
287
|
-
append_to_msg_buf("\n\nStack trace:\n%s\n", stack);
|
288
|
-
free(stack);
|
289
|
-
}
|
290
|
-
}
|
291
|
-
}
|
292
|
-
|
293
282
|
static void vTmsg_nf(const char *fmt, va_list args)
|
294
283
|
{
|
295
284
|
if (verbose) {
|
296
285
|
vappend_to_msg_buf(fmt, args);
|
297
|
-
Tstack();
|
298
286
|
}
|
299
287
|
}
|
300
288
|
|
@@ -305,8 +293,6 @@ void vTmsg(const char *fmt, va_list args)
|
|
305
293
|
vappend_to_msg_buf(fmt, args);
|
306
294
|
va_end(args);
|
307
295
|
append_to_msg_buf("\n");
|
308
|
-
|
309
|
-
Tstack();
|
310
296
|
}
|
311
297
|
}
|
312
298
|
|
@@ -348,8 +334,6 @@ void tst_msg(const char *func, const char *fname, int line_num, const char *fmt,
|
|
348
334
|
va_start(args, fmt);
|
349
335
|
vappend_to_msg_buf(fmt, args);
|
350
336
|
va_end(args);
|
351
|
-
|
352
|
-
Tstack();
|
353
337
|
}
|
354
338
|
}
|
355
339
|
|
@@ -145,50 +145,6 @@ static void test_dbl_to_s(TestCase *tc, void *data)
|
|
145
145
|
Asequal("NaN", frt_dbl_to_s(buf, NAN));
|
146
146
|
}
|
147
147
|
|
148
|
-
|
149
|
-
/**
|
150
|
-
* Generate a stacktrace, make sure it does something
|
151
|
-
*/
|
152
|
-
static void test_stacktrace(TestCase *tc, void *data)
|
153
|
-
{
|
154
|
-
FILE *old_stream = frt_x_exception_stream;
|
155
|
-
(void)data; /* suppress warning */
|
156
|
-
int tfd = fio_tmpfile();
|
157
|
-
frt_x_exception_stream = fdopen(tfd, "w+");
|
158
|
-
Atrue(frt_x_exception_stream != NULL);
|
159
|
-
if (frt_x_exception_stream) {
|
160
|
-
frt_print_stacktrace();
|
161
|
-
long int f = ftell(frt_x_exception_stream);
|
162
|
-
Assert(f, "Stream position should not be 0");
|
163
|
-
fclose(frt_x_exception_stream);
|
164
|
-
}
|
165
|
-
frt_x_exception_stream = old_stream;
|
166
|
-
}
|
167
|
-
|
168
|
-
/**
|
169
|
-
* Generate a normally fatal signal, which gets caught
|
170
|
-
*/
|
171
|
-
/*
|
172
|
-
static void test_sighandler(TestCase *tc, void *data)
|
173
|
-
{
|
174
|
-
bool old_abort = frt_x_abort_on_exception;
|
175
|
-
FILE *old_stream = frt_x_exception_stream;
|
176
|
-
(void)data;
|
177
|
-
(void)tc;
|
178
|
-
|
179
|
-
frt_x_exception_stream = false;
|
180
|
-
frt_x_exception_stream = tmpfile();
|
181
|
-
|
182
|
-
raise(SIGSEGV);
|
183
|
-
|
184
|
-
Assert(ftell(frt_x_exception_stream), "Stream position should not be 0");
|
185
|
-
fclose(frt_x_exception_stream);
|
186
|
-
|
187
|
-
frt_x_exception_stream = old_stream;
|
188
|
-
frt_x_abort_on_exception = old_abort;
|
189
|
-
}
|
190
|
-
*/
|
191
|
-
|
192
148
|
static void test_count_leading_zeros(TestCase *tc, void *data)
|
193
149
|
{
|
194
150
|
(void)data;
|
@@ -284,8 +240,6 @@ TestSuite *ts_global(TestSuite *suite)
|
|
284
240
|
tst_run_test(suite, test_alloc, NULL);
|
285
241
|
tst_run_test(suite, test_strfmt, NULL);
|
286
242
|
tst_run_test(suite, test_dbl_to_s, NULL);
|
287
|
-
tst_run_test(suite, test_stacktrace, NULL);
|
288
|
-
// tst_run_test(suite, test_sighandler, NULL);
|
289
243
|
tst_run_test(suite, test_count_leading_zeros, NULL);
|
290
244
|
tst_run_test(suite, test_count_leading_ones, NULL);
|
291
245
|
tst_run_test(suite, test_count_trailing_zeros, NULL);
|
@@ -266,7 +266,6 @@ void tst_check_hits(TestCase *tc, FrtSearcher *searcher, FrtQuery *query, const
|
|
266
266
|
int i, count;
|
267
267
|
int total_hits = s2l(expected_hits, num_array);
|
268
268
|
FrtTopDocs *top_docs = frt_searcher_search(searcher, query, 0, total_hits + 1, NULL, NULL, NULL);
|
269
|
-
frt_p_pause();
|
270
269
|
if (!tc->failed && !Aiequal(total_hits, top_docs->total_hits)) {
|
271
270
|
int i;
|
272
271
|
Tmsg_nf("\texpected docs:\n\t ");
|
@@ -314,7 +313,6 @@ void tst_check_hits(TestCase *tc, FrtSearcher *searcher, FrtQuery *query, const
|
|
314
313
|
count = frt_searcher_search_unscored(searcher, query, num_array2, ARRAY_SIZE, num_array2[3]);
|
315
314
|
Aaiequal(num_array + 3, num_array2, count);
|
316
315
|
}
|
317
|
-
frt_p_resume();
|
318
316
|
}
|
319
317
|
|
320
318
|
void check_match_vector(TestCase *tc, FrtSearcher *searcher, FrtQuery *query,
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isomorfeus-ferret
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.
|
4
|
+
version: 0.13.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Biedermann
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-04-
|
11
|
+
date: 2022-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -154,13 +154,13 @@ files:
|
|
154
154
|
- ext/isomorfeus_ferret_ext/brotli_encode.h
|
155
155
|
- ext/isomorfeus_ferret_ext/brotli_port.h
|
156
156
|
- ext/isomorfeus_ferret_ext/brotli_types.h
|
157
|
-
- ext/isomorfeus_ferret_ext/bzip_blocksort.c
|
158
|
-
- ext/isomorfeus_ferret_ext/bzip_huffman.c
|
159
157
|
- ext/isomorfeus_ferret_ext/bzlib.c
|
160
158
|
- ext/isomorfeus_ferret_ext/bzlib.h
|
159
|
+
- ext/isomorfeus_ferret_ext/bzlib_blocksort.c
|
161
160
|
- ext/isomorfeus_ferret_ext/bzlib_compress.c
|
162
161
|
- ext/isomorfeus_ferret_ext/bzlib_crctable.c
|
163
162
|
- ext/isomorfeus_ferret_ext/bzlib_decompress.c
|
163
|
+
- ext/isomorfeus_ferret_ext/bzlib_huffman.c
|
164
164
|
- ext/isomorfeus_ferret_ext/bzlib_private.h
|
165
165
|
- ext/isomorfeus_ferret_ext/bzlib_randtable.c
|
166
166
|
- ext/isomorfeus_ferret_ext/extconf.rb
|