isomorfeus-ferret 0.13.1 → 0.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/isomorfeus_ferret_ext/frb_index.c +5 -21
- data/ext/isomorfeus_ferret_ext/frt_index.c +23 -11
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a1a8509c12f5d180d38944adb53291958a9f8327a8b1706def9fecd7f9c60e73
|
4
|
+
data.tar.gz: fbbd38e08dd1992cd93663b1b04cc4b666b6a6dc59104414b75515bc2ec4d54d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7158cdc3f7a0624d35b668b31287d489ee3d389f17a2ab1b58235a7c2be639b6a7bbb3a8c0efee6d168804128fa5c0d7955c00f860b653a76f23d6421fc84c5d
|
7
|
+
data.tar.gz: 60c62db42081291a52e66be4b7e1b4c46eb12a458b0838f3d8793ffaeb013aff27a37aca06ab4831157d1dd65330395bfdaa2fa7924a278261f1187295592042
|
data/ext/isomorfeus_ferret_ext/frb_index.c
CHANGED
@@ -64,6 +64,7 @@ static ID id_fld_num_map;
|
|
64
64
|
static ID id_field_num;
|
65
65
|
static ID id_boost;
|
66
66
|
|
67
|
+
extern rb_encoding *utf8_encoding;
|
67
68
|
extern void frb_set_term(VALUE rterm, FrtTerm *t);
|
68
69
|
extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
|
69
70
|
extern VALUE frb_get_analyzer(FrtAnalyzer *a);
|
@@ -1219,6 +1220,7 @@ static VALUE frb_get_tv_term(FrtTVTerm *tv_term) {
|
|
1219
1220
|
VALUE rtext;
|
1220
1221
|
VALUE rpositions = Qnil;
|
1221
1222
|
rtext = rb_str_new2(tv_term->text);
|
1223
|
+
rb_enc_associate(rtext, utf8_encoding);
|
1222
1224
|
if (tv_term->positions) {
|
1223
1225
|
int *positions = tv_term->positions;
|
1224
1226
|
rpositions = rb_ary_new2(freq);
|
@@ -3085,10 +3087,6 @@ static void Init_TermDocEnum(void) {
|
|
3085
3087
|
rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
|
3086
3088
|
}
|
3087
3089
|
|
3088
|
-
/* rdochack
|
3089
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3090
|
-
*/
|
3091
|
-
|
3092
3090
|
/*
|
3093
3091
|
* Document-class: Ferret::Index::TermVector::TVOffsets
|
3094
3092
|
*
|
@@ -3107,9 +3105,6 @@ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
|
3107
3105
|
*/
|
3108
3106
|
static void Init_TVOffsets(void) {
|
3109
3107
|
const char *tv_offsets_class = "TVOffsets";
|
3110
|
-
/* rdochack
|
3111
|
-
cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
|
3112
|
-
*/
|
3113
3108
|
cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
|
3114
3109
|
rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
|
3115
3110
|
rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
|
@@ -3130,13 +3125,8 @@ static void Init_TVOffsets(void) {
|
|
3130
3125
|
* tv_term = tv.find {|tvt| tvt.term = "fox"}
|
3131
3126
|
* offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
|
3132
3127
|
*/
|
3133
|
-
static void
|
3134
|
-
Init_TVTerm(void)
|
3135
|
-
{
|
3128
|
+
static void Init_TVTerm(void) {
|
3136
3129
|
const char *tv_term_class = "TVTerm";
|
3137
|
-
/* rdochack
|
3138
|
-
cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
|
3139
|
-
*/
|
3140
3130
|
cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
|
3141
3131
|
rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
|
3142
3132
|
rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
|
@@ -3172,15 +3162,9 @@ Init_TVTerm(void)
|
|
3172
3162
|
* particular that you need to store both positions and offsets if you want
|
3173
3163
|
* to associate offsets with particular terms.
|
3174
3164
|
*/
|
3175
|
-
static void
|
3176
|
-
Init_TermVector(void)
|
3177
|
-
{
|
3165
|
+
static void Init_TermVector(void) {
|
3178
3166
|
const char *tv_class = "TermVector";
|
3179
|
-
|
3180
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3181
|
-
*/
|
3182
|
-
cTermVector = rb_struct_define(tv_class,
|
3183
|
-
"field", "terms", "offsets", NULL);
|
3167
|
+
cTermVector = rb_struct_define(tv_class, "field", "terms", "offsets", NULL);
|
3184
3168
|
rb_set_class_path(cTermVector, mIndex, tv_class);
|
3185
3169
|
rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
|
3186
3170
|
|
data/ext/isomorfeus_ferret_ext/frt_index.c
CHANGED
@@ -14,6 +14,7 @@
|
|
14
14
|
#undef close
|
15
15
|
#undef read
|
16
16
|
|
17
|
+
extern rb_encoding *utf8_encoding;
|
17
18
|
extern void frt_micro_sleep(const int micro_seconds);
|
18
19
|
|
19
20
|
#define GET_LOCK(lock, name, store, err_msg) do {\
|
@@ -1710,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
|
|
1710
1711
|
total_len = delta_start + delta_len;
|
1711
1712
|
frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
|
1712
1713
|
buffer[total_len++] = '\0';
|
1713
|
-
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
|
1714
|
-
buffer, total_len);
|
1714
|
+
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
|
1715
1715
|
|
1716
1716
|
/* read freq */
|
1717
1717
|
freq = term->freq = frt_is_read_vint(fdt_in);
|
@@ -1822,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
|
|
1822
1822
|
*
|
1823
1823
|
****************************************************************************/
|
1824
1824
|
|
1825
|
-
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
|
1826
|
-
{
|
1825
|
+
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
|
1827
1826
|
FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
|
1828
1827
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
1829
1828
|
size_t segment_len = strlen(segment);
|
@@ -1844,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
|
|
1844
1843
|
return fw;
|
1845
1844
|
}
|
1846
1845
|
|
1847
|
-
void frt_fw_close(FrtFieldsWriter *fw)
|
1848
|
-
{
|
1846
|
+
void frt_fw_close(FrtFieldsWriter *fw) {
|
1849
1847
|
frt_os_close(fw->fdt_out);
|
1850
1848
|
frt_os_close(fw->fdx_out);
|
1851
1849
|
frt_ram_destroy_buffer(fw->buffer);
|
@@ -2046,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
|
|
2046
2044
|
frt_ramo_write_to(fw->buffer, fdt_out);
|
2047
2045
|
}
|
2048
2046
|
|
2049
|
-
void frt_fw_write_tv_index(FrtFieldsWriter *fw)
|
2050
|
-
{
|
2047
|
+
void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
|
2051
2048
|
int i;
|
2052
2049
|
const int tv_cnt = frt_ary_size(fw->tv_fields);
|
2053
2050
|
FrtOutStream *fdt_out = fw->fdt_out;
|
@@ -5548,9 +5545,24 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
|
|
5548
5545
|
for (i = 0; i < df_size; i++) {
|
5549
5546
|
int len = df->lengths[i];
|
5550
5547
|
char *data_ptr = df->data[i];
|
5551
|
-
if (
|
5552
|
-
len
|
5553
|
-
|
5548
|
+
if (df->encodings[i] == utf8_encoding) {
|
5549
|
+
if (len >= FRT_MAX_WORD_SIZE) {
|
5550
|
+
len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
|
5551
|
+
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5552
|
+
buf[len] = '\0';
|
5553
|
+
}
|
5554
|
+
} else if (df->encodings[i] != utf8_encoding) {
|
5555
|
+
if (len >= FRT_MAX_WORD_SIZE)
|
5556
|
+
len = FRT_MAX_WORD_SIZE - 1;
|
5557
|
+
const unsigned char *sp = (unsigned char *)df->data[i];
|
5558
|
+
unsigned char *dp = (unsigned char *)&buf;
|
5559
|
+
rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
|
5560
|
+
assert(ec != NULL);
|
5561
|
+
rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
|
5562
|
+
rb_econv_close(ec);
|
5563
|
+
len = dp - (unsigned char *)&buf;
|
5564
|
+
buf[len] = '\0';
|
5565
|
+
data_ptr = buf;
|
5554
5566
|
}
|
5555
5567
|
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
|
5556
5568
|
if (store_offsets) {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isomorfeus-ferret
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.1
|
4
|
+
version: 0.13.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Biedermann
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-04-
|
11
|
+
date: 2022-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|