isomorfeus-ferret 0.13.0 → 0.13.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +37 -17
- data/ext/isomorfeus_ferret_ext/{bzip_blocksort.c → bzlib_blocksort.c} +0 -0
- data/ext/isomorfeus_ferret_ext/{bzip_huffman.c → bzlib_huffman.c} +0 -0
- data/ext/isomorfeus_ferret_ext/frb_index.c +15 -36
- data/ext/isomorfeus_ferret_ext/frt_global.c +0 -151
- data/ext/isomorfeus_ferret_ext/frt_global.h +0 -15
- data/ext/isomorfeus_ferret_ext/frt_index.c +23 -11
- data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1 -0
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +1 -3
- data/ext/isomorfeus_ferret_ext/test.c +0 -16
- data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
- data/ext/isomorfeus_ferret_ext/test_search.c +0 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 958c051fa7f771e25ee1fb29c47aad9ab1cd7a570750f73450ae582474ef6fb5
|
4
|
+
data.tar.gz: beba6b22ba7493f324be38997fee4f4a56efc896281fcbb2f3521ebef6795029
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98bef5c79e3d4e8854d6a4cb68e2cb796c6f27bbaf1c7b7669ab895e8656d12d033651f8d8b29ac8d7504b46dba7567aa52f2ac297285ff2c6c62e52c48d0019
|
7
|
+
data.tar.gz: 165497f356ab32cdc6250b3eada43d9ec36d61ee992114bffa9c38372c7894455caeb95aa789cf025c2538c7f9e1eddafe87e66b58536424a4c8e116a123af88
|
data/README.md
CHANGED
@@ -26,7 +26,7 @@ It should work on *nixes, *nuxes, *BSDs and also works on Windows.
|
|
26
26
|
- The :store option no longer accepts :compress, compression must now be specified by the separate :compress options (see below).
|
27
27
|
- The ASCII-specific Tokenizers and Analyzers have been removed
|
28
28
|
|
29
|
-
###
|
29
|
+
### String Encoding support
|
30
30
|
|
31
31
|
#### Input strings and stored fields
|
32
32
|
|
@@ -37,12 +37,13 @@ All Ruby string encodings are supported.
|
|
37
37
|
When fields are stored, they are now stored with the encoding, so that when they are retrieved again, they
|
38
38
|
retain the original encoding with positions matching the string in its original encoding.
|
39
39
|
|
40
|
-
#### Tokens and
|
40
|
+
#### Tokens, Terms, Filters and Queries
|
41
41
|
|
42
42
|
Tokens are internally converted to UTF-8, which may change their length compared to their original encoding,
|
43
|
-
yet they retain position information according to the source in its original encoding.
|
44
|
-
|
45
|
-
|
43
|
+
yet they retain position information according to the source in its original encoding. Terms are likewise stored in UTF-8 encoding.
|
44
|
+
Queries are converted to UTF-8 encoding too.
|
45
|
+
The benefit is that Filters, Stemmers, or anything else working with Tokens and Terms only need to support UTF-8 encoding,
|
46
|
+
greatly simplifying things and ensuring consistent query results, independent of source encoding.
|
46
47
|
|
47
48
|
### Compression
|
48
49
|
|
@@ -50,6 +51,7 @@ Compression semantics have changed, now Brotli, BZip2 and LZ4 compression codecs
|
|
50
51
|
- BZip2: slow compression, slow decompression, high compression ratio
|
51
52
|
- Brotli: slow compression, fast decompression, high compression ratio, recommended for general purpose.
|
52
53
|
- LZ4: fast compression, fast decompression, low compression ratio
|
54
|
+
|
53
55
|
To see performance and compression ratios `rake ferret_compression_bench` can be run from the cloned repo.
|
54
56
|
It uses data and code within the misc/ferret_vs_lucene directory.
|
55
57
|
|
@@ -96,6 +98,7 @@ Ensure your locale is set to C.UTF-8, because the internal c tests don't know ho
|
|
96
98
|
|
97
99
|
## Benchmarks
|
98
100
|
|
101
|
+
### Indexing and Searching
|
99
102
|
- clone repo
|
100
103
|
- bundle install
|
101
104
|
- rake ferret_vs_lucene
|
@@ -104,20 +107,37 @@ A recent Java JDK must be installed to compile and run lucene benchmarks.
|
|
104
107
|
|
105
108
|
Results on Linux:
|
106
109
|
```
|
107
|
-
Ferret:
|
108
|
-
Indexing
|
109
|
-
Searching took: 0.
|
110
|
-
thats
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
110
|
+
Ferret 0.13.0:
|
111
|
+
Indexing: 9.35 secs, Docs: 19043, 2035 docs/s
|
112
|
+
Searching took: 0.3133133s for 8000 queries
|
113
|
+
thats 25533 q/s
|
114
|
+
Total found: 42000
|
115
|
+
Index size: 28Mb
|
116
|
+
|
117
|
+
Lucene 9.1.0:
|
118
|
+
Indexing: 4.20 secs, Docs: 19043, 4538 docs/s
|
119
|
+
Searching took: 1.64s for 8000 queries
|
120
|
+
thats 4875 q/s
|
121
|
+
Total found: 41000
|
122
|
+
index size: 35Mb
|
123
|
+
|
124
|
+
JVM 11.0.14.1 (Ubuntu)
|
119
125
|
```
|
120
126
|
|
127
|
+
### Storing Fields with Compression, Indexing and Retrieval
|
128
|
+
- clone repo
|
129
|
+
- bundle install
|
130
|
+
- rake ferret_compression_benchmark
|
131
|
+
|
132
|
+
Results on Linux, 0.13.0:
|
133
|
+
|
134
|
+
| Compression | Index & Store | Retrieve | Index size |
|
135
|
+
|-------------|---------------|---------------|------------|
|
136
|
+
| none | 2008 docs/s | 153853 docs/s | 43 MB |
|
137
|
+
| brotli | 1726 docs/s | 58315 docs/s | 36 MB |
|
138
|
+
| bzip2 | 1438 docs/s | 15382 docs/s | 38 MB |
|
139
|
+
| lz4 | 1932 docs/s | 127100 docs/s | 41 MB |
|
140
|
+
|
121
141
|
## Future
|
122
142
|
|
123
143
|
Lots of things to do:
|
File without changes
|
File without changes
|
@@ -64,6 +64,7 @@ static ID id_fld_num_map;
|
|
64
64
|
static ID id_field_num;
|
65
65
|
static ID id_boost;
|
66
66
|
|
67
|
+
extern rb_encoding *utf8_encoding;
|
67
68
|
extern void frb_set_term(VALUE rterm, FrtTerm *t);
|
68
69
|
extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
|
69
70
|
extern VALUE frb_get_analyzer(FrtAnalyzer *a);
|
@@ -181,8 +182,9 @@ static VALUE frb_get_field_info(FrtFieldInfo *fi) {
|
|
181
182
|
fi->rfi = TypedData_Wrap_Struct(cFieldInfo, &frb_field_info_t, fi);
|
182
183
|
FRT_REF(fi);
|
183
184
|
}
|
185
|
+
return fi->rfi;
|
184
186
|
}
|
185
|
-
return
|
187
|
+
return Qnil;
|
186
188
|
}
|
187
189
|
|
188
190
|
/*
|
@@ -411,8 +413,9 @@ static VALUE frb_get_field_infos(FrtFieldInfos *fis) {
|
|
411
413
|
fis->rfis = TypedData_Wrap_Struct(cFieldInfos, &frb_field_infos_t, fis);
|
412
414
|
FRT_REF(fis);
|
413
415
|
}
|
416
|
+
return fis->rfis;
|
414
417
|
}
|
415
|
-
return
|
418
|
+
return Qnil;
|
416
419
|
}
|
417
420
|
|
418
421
|
/*
|
@@ -496,11 +499,6 @@ static VALUE frb_fis_get(VALUE self, VALUE ridx) {
|
|
496
499
|
case T_STRING:
|
497
500
|
rfi = frb_get_field_info(frt_fis_get_field(fis, frb_field(ridx)));
|
498
501
|
break;
|
499
|
-
/*
|
500
|
-
case T_STRING:
|
501
|
-
rfi = frb_get_field_info(frt_fis_get_field(fis, StringValuePtr(ridx)));
|
502
|
-
break;
|
503
|
-
*/
|
504
502
|
default:
|
505
503
|
rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
|
506
504
|
rs2s(rb_obj_as_string(ridx)));
|
@@ -1219,6 +1217,7 @@ static VALUE frb_get_tv_term(FrtTVTerm *tv_term) {
|
|
1219
1217
|
VALUE rtext;
|
1220
1218
|
VALUE rpositions = Qnil;
|
1221
1219
|
rtext = rb_str_new2(tv_term->text);
|
1220
|
+
rb_enc_associate(rtext, utf8_encoding);
|
1222
1221
|
if (tv_term->positions) {
|
1223
1222
|
int *positions = tv_term->positions;
|
1224
1223
|
rpositions = rb_ary_new2(freq);
|
@@ -1697,10 +1696,9 @@ frb_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
|
|
1697
1696
|
* Get the FieldInfos object for this FrtIndexWriter. This is useful if you need
|
1698
1697
|
* to dynamically add new fields to the index with specific properties.
|
1699
1698
|
*/
|
1700
|
-
static VALUE
|
1701
|
-
|
1702
|
-
|
1703
|
-
FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
|
1699
|
+
static VALUE frb_iw_field_infos(VALUE self) {
|
1700
|
+
FrtIndexWriter *iw;
|
1701
|
+
TypedData_Get_Struct(self, FrtIndexWriter, &frb_index_writer_t, iw);
|
1704
1702
|
return frb_get_field_infos(iw->fis);
|
1705
1703
|
}
|
1706
1704
|
|
@@ -2715,10 +2713,9 @@ frb_ir_fields(VALUE self)
|
|
2715
2713
|
*
|
2716
2714
|
* Get the FieldInfos object for this IndexReader.
|
2717
2715
|
*/
|
2718
|
-
static VALUE
|
2719
|
-
|
2720
|
-
|
2721
|
-
FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
|
2716
|
+
static VALUE frb_ir_field_infos(VALUE self) {
|
2717
|
+
FrtIndexReader *ir;
|
2718
|
+
TypedData_Get_Struct(self, FrtIndexReader, &frb_index_reader_t, ir);
|
2722
2719
|
return frb_get_field_infos(ir->fis);
|
2723
2720
|
}
|
2724
2721
|
|
@@ -3085,10 +3082,6 @@ static void Init_TermDocEnum(void) {
|
|
3085
3082
|
rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
|
3086
3083
|
}
|
3087
3084
|
|
3088
|
-
/* rdochack
|
3089
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3090
|
-
*/
|
3091
|
-
|
3092
3085
|
/*
|
3093
3086
|
* Document-class: Ferret::Index::TermVector::TVOffsets
|
3094
3087
|
*
|
@@ -3107,9 +3100,6 @@ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
|
3107
3100
|
*/
|
3108
3101
|
static void Init_TVOffsets(void) {
|
3109
3102
|
const char *tv_offsets_class = "TVOffsets";
|
3110
|
-
/* rdochack
|
3111
|
-
cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
|
3112
|
-
*/
|
3113
3103
|
cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
|
3114
3104
|
rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
|
3115
3105
|
rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
|
@@ -3130,13 +3120,8 @@ static void Init_TVOffsets(void) {
|
|
3130
3120
|
* tv_term = tv.find {|tvt| tvt.term = "fox"}
|
3131
3121
|
* offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
|
3132
3122
|
*/
|
3133
|
-
static void
|
3134
|
-
Init_TVTerm(void)
|
3135
|
-
{
|
3123
|
+
static void Init_TVTerm(void) {
|
3136
3124
|
const char *tv_term_class = "TVTerm";
|
3137
|
-
/* rdochack
|
3138
|
-
cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
|
3139
|
-
*/
|
3140
3125
|
cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
|
3141
3126
|
rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
|
3142
3127
|
rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
|
@@ -3172,15 +3157,9 @@ Init_TVTerm(void)
|
|
3172
3157
|
* particular that you need to store both positions and offsets if you want
|
3173
3158
|
* to associate offsets with particular terms.
|
3174
3159
|
*/
|
3175
|
-
static void
|
3176
|
-
Init_TermVector(void)
|
3177
|
-
{
|
3160
|
+
static void Init_TermVector(void) {
|
3178
3161
|
const char *tv_class = "TermVector";
|
3179
|
-
|
3180
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3181
|
-
*/
|
3182
|
-
cTermVector = rb_struct_define(tv_class,
|
3183
|
-
"field", "terms", "offsets", NULL);
|
3162
|
+
cTermVector = rb_struct_define(tv_class, "field", "terms", "offsets", NULL);
|
3184
3163
|
rb_set_class_path(cTermVector, mIndex, tv_class);
|
3185
3164
|
rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
|
3186
3165
|
|
@@ -232,71 +232,6 @@ void frt_dummy_free(void *p) {
|
|
232
232
|
(void)p; /* suppress unused argument warning */
|
233
233
|
}
|
234
234
|
|
235
|
-
#ifdef HAVE_GDB
|
236
|
-
#define CMD_BUF_SIZE (128 + FILENAME_MAX)
|
237
|
-
/* need to declare this as it is masked by default in linux */
|
238
|
-
|
239
|
-
static char *build_shell_command(void) {
|
240
|
-
int pid = getpid();
|
241
|
-
char *buf = FRT_ALLOC_N(char, CMD_BUF_SIZE);
|
242
|
-
char *command =
|
243
|
-
"gdb -quiet -ex='bt' -ex='quit' %s %d 2>/dev/null | grep '^[ #]'";
|
244
|
-
|
245
|
-
snprintf(buf, CMD_BUF_SIZE, command, frt_progname(), pid);
|
246
|
-
return buf;
|
247
|
-
}
|
248
|
-
|
249
|
-
#endif
|
250
|
-
|
251
|
-
/**
|
252
|
-
* Call out to gdb to get our stacktrace.
|
253
|
-
*/
|
254
|
-
char *frt_get_stacktrace(void) {
|
255
|
-
#ifdef HAVE_GDB
|
256
|
-
FILE *stream;
|
257
|
-
char *gdb_filename = NULL, *buf = NULL, *stack = NULL;
|
258
|
-
int offset = -FRT_BUFFER_SIZE;
|
259
|
-
|
260
|
-
if ( !(buf = build_shell_command()) ) {
|
261
|
-
fprintf(EXCEPTION_STREAM,
|
262
|
-
"Unable to build stacktrace shell command\n");
|
263
|
-
goto cleanup;
|
264
|
-
}
|
265
|
-
|
266
|
-
if ( !(stream = popen(buf, "r")) ) {
|
267
|
-
fprintf(EXCEPTION_STREAM,
|
268
|
-
"Unable to exec stacktrace shell command: '%s'\n", buf);
|
269
|
-
goto cleanup;
|
270
|
-
}
|
271
|
-
|
272
|
-
do {
|
273
|
-
offset += FRT_BUFFER_SIZE;
|
274
|
-
FRT_REALLOC_N(stack, char, offset + FRT_BUFFER_SIZE);
|
275
|
-
FRT_ZEROSET_N(stack + offset, char, FRT_BUFFER_SIZE);
|
276
|
-
} while(fread(stack + offset, 1, FRT_BUFFER_SIZE, stream) == FRT_BUFFER_SIZE);
|
277
|
-
|
278
|
-
pclose(stream);
|
279
|
-
|
280
|
-
cleanup:
|
281
|
-
if (gdb_filename) free(gdb_filename);
|
282
|
-
if (buf) free(buf);
|
283
|
-
return stack;
|
284
|
-
#else
|
285
|
-
return NULL;
|
286
|
-
#endif
|
287
|
-
}
|
288
|
-
|
289
|
-
void frt_print_stacktrace(void) {
|
290
|
-
char *stack = frt_get_stacktrace();
|
291
|
-
|
292
|
-
if (stack) {
|
293
|
-
fprintf(EXCEPTION_STREAM, "Stack trace:\n%s", stack);
|
294
|
-
free(stack);
|
295
|
-
} else {
|
296
|
-
fprintf(EXCEPTION_STREAM, "Stack trace not available\n");
|
297
|
-
}
|
298
|
-
}
|
299
|
-
|
300
235
|
typedef struct FreeMe {
|
301
236
|
void *p;
|
302
237
|
frt_free_ft free_func;
|
@@ -321,55 +256,7 @@ void frt_register_for_cleanup(void *p, frt_free_ft free_func) {
|
|
321
256
|
free_me->free_func = free_func;
|
322
257
|
}
|
323
258
|
|
324
|
-
#define MAX_PROG_NAME 200
|
325
|
-
static char name[MAX_PROG_NAME]; /* program name for error msgs */
|
326
|
-
|
327
|
-
/* frt_setprogname: set stored name of program */
|
328
|
-
void frt_setprogname(const char *str) {
|
329
|
-
strncpy(name, str, sizeof(name) - 1);
|
330
|
-
}
|
331
|
-
|
332
|
-
const char *frt_progname(void) {
|
333
|
-
return name;
|
334
|
-
}
|
335
|
-
|
336
|
-
static const char *signal_to_string(int signum) {
|
337
|
-
switch (signum)
|
338
|
-
{
|
339
|
-
case SIGILL: return "SIGILL";
|
340
|
-
case SIGABRT: return "SIGABRT";
|
341
|
-
case SIGFPE: return "SIGFPE";
|
342
|
-
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
|
343
|
-
case SIGBUS: return "SIGBUS";
|
344
|
-
#endif
|
345
|
-
case SIGSEGV: return "SIGSEGV";
|
346
|
-
}
|
347
|
-
|
348
|
-
return "Unknown Signal";
|
349
|
-
}
|
350
|
-
|
351
|
-
static void sighandler_crash(int signum) {
|
352
|
-
frt_print_stacktrace();
|
353
|
-
FRT_XEXIT("Signal", "Exiting on signal %s (%d)", signal_to_string(signum), signum);
|
354
|
-
}
|
355
|
-
|
356
|
-
#define SETSIG_IF_UNSET(sig, handler) do { \
|
357
|
-
signal(sig, handler); \
|
358
|
-
} while(0)
|
359
|
-
|
360
259
|
void frt_init(int argc, const char *const argv[]) {
|
361
|
-
if (argc > 0) {
|
362
|
-
frt_setprogname(argv[0]);
|
363
|
-
}
|
364
|
-
|
365
|
-
SETSIG_IF_UNSET(SIGILL , sighandler_crash);
|
366
|
-
SETSIG_IF_UNSET(SIGABRT, sighandler_crash);
|
367
|
-
SETSIG_IF_UNSET(SIGFPE , sighandler_crash);
|
368
|
-
#if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
|
369
|
-
SETSIG_IF_UNSET(SIGBUS , sighandler_crash);
|
370
|
-
#endif
|
371
|
-
SETSIG_IF_UNSET(SIGSEGV, sighandler_crash);
|
372
|
-
|
373
260
|
atexit(&frt_hash_finalize);
|
374
261
|
|
375
262
|
utf8_encoding = rb_enc_find("UTF-8");
|
@@ -429,41 +316,3 @@ void frt_init(int argc, const char *const argv[]) {
|
|
429
316
|
FRT_SORT_FIELD_DOC_REV->compare = frt_sort_field_doc_compare; /* compare */
|
430
317
|
FRT_SORT_FIELD_DOC_REV->get_val = frt_sort_field_doc_get_val; /* get_val */
|
431
318
|
}
|
432
|
-
|
433
|
-
/**
|
434
|
-
* For general use when testing
|
435
|
-
*
|
436
|
-
* TODO wrap in #ifdef
|
437
|
-
*/
|
438
|
-
|
439
|
-
static bool p_switch = false;
|
440
|
-
static bool p_switch_tmp = false;
|
441
|
-
|
442
|
-
void p(const char *format, ...) {
|
443
|
-
va_list args;
|
444
|
-
|
445
|
-
if (!p_switch) return;
|
446
|
-
|
447
|
-
va_start(args, format);
|
448
|
-
vfprintf(stderr, format, args);
|
449
|
-
va_end(args);
|
450
|
-
}
|
451
|
-
|
452
|
-
void p_on(void) {
|
453
|
-
fprintf(stderr, "> > > > > STARTING PRINT\n");
|
454
|
-
p_switch = true;
|
455
|
-
}
|
456
|
-
|
457
|
-
void p_off(void) {
|
458
|
-
fprintf(stderr, "< < < < < STOPPING PRINT\n");
|
459
|
-
p_switch = false;
|
460
|
-
}
|
461
|
-
|
462
|
-
void frt_p_pause(void) {
|
463
|
-
p_switch_tmp = p_switch;
|
464
|
-
p_switch = false;
|
465
|
-
}
|
466
|
-
|
467
|
-
void frt_p_resume(void) {
|
468
|
-
p_switch = p_switch_tmp;
|
469
|
-
}
|
@@ -105,9 +105,6 @@ extern char *frt_dbl_to_s(char *buf, double num);
|
|
105
105
|
extern char *frt_strfmt(const char *fmt, ...);
|
106
106
|
extern char *frt_vstrfmt(const char *fmt, va_list args);
|
107
107
|
|
108
|
-
extern char *frt_get_stacktrace();
|
109
|
-
extern void frt_print_stacktrace();
|
110
|
-
|
111
108
|
extern void frt_register_for_cleanup(void *p, frt_free_ft free_func);
|
112
109
|
|
113
110
|
/**
|
@@ -277,18 +274,6 @@ extern bool frt_x_do_logging;
|
|
277
274
|
#endif
|
278
275
|
|
279
276
|
extern void frt_init(int arc, const char *const argv[]);
|
280
|
-
extern void frt_setprogname(const char *str);
|
281
|
-
extern const char *frt_progname();
|
282
277
|
extern void frt_micro_sleep(const int micro_seconds);
|
283
278
|
|
284
|
-
/**
|
285
|
-
* For general use during testing. Switch this on and off for print statements
|
286
|
-
* to only print when p_on is called and not after p_off is called
|
287
|
-
*/
|
288
|
-
extern void p(const char *format, ...);
|
289
|
-
extern void p_on();
|
290
|
-
extern void p_off();
|
291
|
-
extern void frt_p_pause();
|
292
|
-
extern void frt_p_resume();
|
293
|
-
|
294
279
|
#endif
|
@@ -14,6 +14,7 @@
|
|
14
14
|
#undef close
|
15
15
|
#undef read
|
16
16
|
|
17
|
+
extern rb_encoding *utf8_encoding;
|
17
18
|
extern void frt_micro_sleep(const int micro_seconds);
|
18
19
|
|
19
20
|
#define GET_LOCK(lock, name, store, err_msg) do {\
|
@@ -1710,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
|
|
1710
1711
|
total_len = delta_start + delta_len;
|
1711
1712
|
frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
|
1712
1713
|
buffer[total_len++] = '\0';
|
1713
|
-
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
|
1714
|
-
buffer, total_len);
|
1714
|
+
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
|
1715
1715
|
|
1716
1716
|
/* read freq */
|
1717
1717
|
freq = term->freq = frt_is_read_vint(fdt_in);
|
@@ -1822,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
|
|
1822
1822
|
*
|
1823
1823
|
****************************************************************************/
|
1824
1824
|
|
1825
|
-
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
|
1826
|
-
{
|
1825
|
+
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
|
1827
1826
|
FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
|
1828
1827
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
1829
1828
|
size_t segment_len = strlen(segment);
|
@@ -1844,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
|
|
1844
1843
|
return fw;
|
1845
1844
|
}
|
1846
1845
|
|
1847
|
-
void frt_fw_close(FrtFieldsWriter *fw)
|
1848
|
-
{
|
1846
|
+
void frt_fw_close(FrtFieldsWriter *fw) {
|
1849
1847
|
frt_os_close(fw->fdt_out);
|
1850
1848
|
frt_os_close(fw->fdx_out);
|
1851
1849
|
frt_ram_destroy_buffer(fw->buffer);
|
@@ -2046,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
|
|
2046
2044
|
frt_ramo_write_to(fw->buffer, fdt_out);
|
2047
2045
|
}
|
2048
2046
|
|
2049
|
-
void frt_fw_write_tv_index(FrtFieldsWriter *fw)
|
2050
|
-
{
|
2047
|
+
void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
|
2051
2048
|
int i;
|
2052
2049
|
const int tv_cnt = frt_ary_size(fw->tv_fields);
|
2053
2050
|
FrtOutStream *fdt_out = fw->fdt_out;
|
@@ -5548,9 +5545,24 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
|
|
5548
5545
|
for (i = 0; i < df_size; i++) {
|
5549
5546
|
int len = df->lengths[i];
|
5550
5547
|
char *data_ptr = df->data[i];
|
5551
|
-
if (
|
5552
|
-
len
|
5553
|
-
|
5548
|
+
if (df->encodings[i] == utf8_encoding) {
|
5549
|
+
if (len >= FRT_MAX_WORD_SIZE) {
|
5550
|
+
len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
|
5551
|
+
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5552
|
+
buf[len] = '\0';
|
5553
|
+
}
|
5554
|
+
} else if (df->encodings[i] != utf8_encoding) {
|
5555
|
+
if (len >= FRT_MAX_WORD_SIZE)
|
5556
|
+
len = FRT_MAX_WORD_SIZE - 1;
|
5557
|
+
const unsigned char *sp = (unsigned char *)df->data[i];
|
5558
|
+
unsigned char *dp = (unsigned char *)&buf;
|
5559
|
+
rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
|
5560
|
+
assert(ec != NULL);
|
5561
|
+
rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
|
5562
|
+
rb_econv_close(ec);
|
5563
|
+
len = dp - (unsigned char *)&buf;
|
5564
|
+
buf[len] = '\0';
|
5565
|
+
data_ptr = buf;
|
5554
5566
|
}
|
5555
5567
|
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
|
5556
5568
|
if (store_offsets) {
|
@@ -61,7 +61,6 @@ void FRT_VEXIT(const char *err_type, const char *fmt, va_list args)
|
|
61
61
|
# endif
|
62
62
|
{
|
63
63
|
fflush(stdout);
|
64
|
-
fprintf(EXCEPTION_STREAM, "\n%s: ", frt_progname());
|
65
64
|
|
66
65
|
# ifdef FRT_HAS_VARARGS
|
67
66
|
fprintf(EXCEPTION_STREAM, "%s occurred at <%s>:%d in %s\n",
|
@@ -76,7 +75,6 @@ void FRT_VEXIT(const char *err_type, const char *fmt, va_list args)
|
|
76
75
|
}
|
77
76
|
|
78
77
|
fprintf(EXCEPTION_STREAM, "\n");
|
79
|
-
frt_print_stacktrace();
|
80
78
|
if (frt_x_abort_on_exception) {
|
81
79
|
exit(2); /* conventional value for failed execution */
|
82
80
|
}
|
@@ -1286,6 +1286,7 @@ FrtBooleanClause *frt_bc_alloc(void) {
|
|
1286
1286
|
FrtBooleanClause *frt_bc_init(FrtBooleanClause *self, FrtQuery *query, FrtBCType occur) {
|
1287
1287
|
self->ref_cnt = 1;
|
1288
1288
|
self->query = query;
|
1289
|
+
self->rbc = Qnil;
|
1289
1290
|
frt_bc_set_occur(self, occur);
|
1290
1291
|
return self;
|
1291
1292
|
}
|
@@ -279,22 +279,10 @@ static void append_to_msg_buf(const char *fmt, ...)
|
|
279
279
|
va_end(args);
|
280
280
|
}
|
281
281
|
|
282
|
-
|
283
|
-
static void Tstack(void) {
|
284
|
-
if (show_stack) {
|
285
|
-
char *stack = frt_get_stacktrace();
|
286
|
-
if (stack) {
|
287
|
-
append_to_msg_buf("\n\nStack trace:\n%s\n", stack);
|
288
|
-
free(stack);
|
289
|
-
}
|
290
|
-
}
|
291
|
-
}
|
292
|
-
|
293
282
|
static void vTmsg_nf(const char *fmt, va_list args)
|
294
283
|
{
|
295
284
|
if (verbose) {
|
296
285
|
vappend_to_msg_buf(fmt, args);
|
297
|
-
Tstack();
|
298
286
|
}
|
299
287
|
}
|
300
288
|
|
@@ -305,8 +293,6 @@ void vTmsg(const char *fmt, va_list args)
|
|
305
293
|
vappend_to_msg_buf(fmt, args);
|
306
294
|
va_end(args);
|
307
295
|
append_to_msg_buf("\n");
|
308
|
-
|
309
|
-
Tstack();
|
310
296
|
}
|
311
297
|
}
|
312
298
|
|
@@ -348,8 +334,6 @@ void tst_msg(const char *func, const char *fname, int line_num, const char *fmt,
|
|
348
334
|
va_start(args, fmt);
|
349
335
|
vappend_to_msg_buf(fmt, args);
|
350
336
|
va_end(args);
|
351
|
-
|
352
|
-
Tstack();
|
353
337
|
}
|
354
338
|
}
|
355
339
|
|
@@ -145,50 +145,6 @@ static void test_dbl_to_s(TestCase *tc, void *data)
|
|
145
145
|
Asequal("NaN", frt_dbl_to_s(buf, NAN));
|
146
146
|
}
|
147
147
|
|
148
|
-
|
149
|
-
/**
|
150
|
-
* Generate a stacktrace, make sure it does something
|
151
|
-
*/
|
152
|
-
static void test_stacktrace(TestCase *tc, void *data)
|
153
|
-
{
|
154
|
-
FILE *old_stream = frt_x_exception_stream;
|
155
|
-
(void)data; /* suppress warning */
|
156
|
-
int tfd = fio_tmpfile();
|
157
|
-
frt_x_exception_stream = fdopen(tfd, "w+");
|
158
|
-
Atrue(frt_x_exception_stream != NULL);
|
159
|
-
if (frt_x_exception_stream) {
|
160
|
-
frt_print_stacktrace();
|
161
|
-
long int f = ftell(frt_x_exception_stream);
|
162
|
-
Assert(f, "Stream position should not be 0");
|
163
|
-
fclose(frt_x_exception_stream);
|
164
|
-
}
|
165
|
-
frt_x_exception_stream = old_stream;
|
166
|
-
}
|
167
|
-
|
168
|
-
/**
|
169
|
-
* Generate a normally fatal signal, which gets caught
|
170
|
-
*/
|
171
|
-
/*
|
172
|
-
static void test_sighandler(TestCase *tc, void *data)
|
173
|
-
{
|
174
|
-
bool old_abort = frt_x_abort_on_exception;
|
175
|
-
FILE *old_stream = frt_x_exception_stream;
|
176
|
-
(void)data;
|
177
|
-
(void)tc;
|
178
|
-
|
179
|
-
frt_x_exception_stream = false;
|
180
|
-
frt_x_exception_stream = tmpfile();
|
181
|
-
|
182
|
-
raise(SIGSEGV);
|
183
|
-
|
184
|
-
Assert(ftell(frt_x_exception_stream), "Stream position should not be 0");
|
185
|
-
fclose(frt_x_exception_stream);
|
186
|
-
|
187
|
-
frt_x_exception_stream = old_stream;
|
188
|
-
frt_x_abort_on_exception = old_abort;
|
189
|
-
}
|
190
|
-
*/
|
191
|
-
|
192
148
|
static void test_count_leading_zeros(TestCase *tc, void *data)
|
193
149
|
{
|
194
150
|
(void)data;
|
@@ -284,8 +240,6 @@ TestSuite *ts_global(TestSuite *suite)
|
|
284
240
|
tst_run_test(suite, test_alloc, NULL);
|
285
241
|
tst_run_test(suite, test_strfmt, NULL);
|
286
242
|
tst_run_test(suite, test_dbl_to_s, NULL);
|
287
|
-
tst_run_test(suite, test_stacktrace, NULL);
|
288
|
-
// tst_run_test(suite, test_sighandler, NULL);
|
289
243
|
tst_run_test(suite, test_count_leading_zeros, NULL);
|
290
244
|
tst_run_test(suite, test_count_leading_ones, NULL);
|
291
245
|
tst_run_test(suite, test_count_trailing_zeros, NULL);
|
@@ -266,7 +266,6 @@ void tst_check_hits(TestCase *tc, FrtSearcher *searcher, FrtQuery *query, const
|
|
266
266
|
int i, count;
|
267
267
|
int total_hits = s2l(expected_hits, num_array);
|
268
268
|
FrtTopDocs *top_docs = frt_searcher_search(searcher, query, 0, total_hits + 1, NULL, NULL, NULL);
|
269
|
-
frt_p_pause();
|
270
269
|
if (!tc->failed && !Aiequal(total_hits, top_docs->total_hits)) {
|
271
270
|
int i;
|
272
271
|
Tmsg_nf("\texpected docs:\n\t ");
|
@@ -314,7 +313,6 @@ void tst_check_hits(TestCase *tc, FrtSearcher *searcher, FrtQuery *query, const
|
|
314
313
|
count = frt_searcher_search_unscored(searcher, query, num_array2, ARRAY_SIZE, num_array2[3]);
|
315
314
|
Aaiequal(num_array + 3, num_array2, count);
|
316
315
|
}
|
317
|
-
frt_p_resume();
|
318
316
|
}
|
319
317
|
|
320
318
|
void check_match_vector(TestCase *tc, FrtSearcher *searcher, FrtQuery *query,
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isomorfeus-ferret
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.
|
4
|
+
version: 0.13.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Biedermann
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-04-
|
11
|
+
date: 2022-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -154,13 +154,13 @@ files:
|
|
154
154
|
- ext/isomorfeus_ferret_ext/brotli_encode.h
|
155
155
|
- ext/isomorfeus_ferret_ext/brotli_port.h
|
156
156
|
- ext/isomorfeus_ferret_ext/brotli_types.h
|
157
|
-
- ext/isomorfeus_ferret_ext/bzip_blocksort.c
|
158
|
-
- ext/isomorfeus_ferret_ext/bzip_huffman.c
|
159
157
|
- ext/isomorfeus_ferret_ext/bzlib.c
|
160
158
|
- ext/isomorfeus_ferret_ext/bzlib.h
|
159
|
+
- ext/isomorfeus_ferret_ext/bzlib_blocksort.c
|
161
160
|
- ext/isomorfeus_ferret_ext/bzlib_compress.c
|
162
161
|
- ext/isomorfeus_ferret_ext/bzlib_crctable.c
|
163
162
|
- ext/isomorfeus_ferret_ext/bzlib_decompress.c
|
163
|
+
- ext/isomorfeus_ferret_ext/bzlib_huffman.c
|
164
164
|
- ext/isomorfeus_ferret_ext/bzlib_private.h
|
165
165
|
- ext/isomorfeus_ferret_ext/bzlib_randtable.c
|
166
166
|
- ext/isomorfeus_ferret_ext/extconf.rb
|