isomorfeus-ferret 0.13.1 → 0.13.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/isomorfeus_ferret_ext/frb_index.c +5 -21
- data/ext/isomorfeus_ferret_ext/frt_index.c +23 -11
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a1a8509c12f5d180d38944adb53291958a9f8327a8b1706def9fecd7f9c60e73
|
4
|
+
data.tar.gz: fbbd38e08dd1992cd93663b1b04cc4b666b6a6dc59104414b75515bc2ec4d54d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7158cdc3f7a0624d35b668b31287d489ee3d389f17a2ab1b58235a7c2be639b6a7bbb3a8c0efee6d168804128fa5c0d7955c00f860b653a76f23d6421fc84c5d
|
7
|
+
data.tar.gz: 60c62db42081291a52e66be4b7e1b4c46eb12a458b0838f3d8793ffaeb013aff27a37aca06ab4831157d1dd65330395bfdaa2fa7924a278261f1187295592042
|
@@ -64,6 +64,7 @@ static ID id_fld_num_map;
|
|
64
64
|
static ID id_field_num;
|
65
65
|
static ID id_boost;
|
66
66
|
|
67
|
+
extern rb_encoding *utf8_encoding;
|
67
68
|
extern void frb_set_term(VALUE rterm, FrtTerm *t);
|
68
69
|
extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
|
69
70
|
extern VALUE frb_get_analyzer(FrtAnalyzer *a);
|
@@ -1219,6 +1220,7 @@ static VALUE frb_get_tv_term(FrtTVTerm *tv_term) {
|
|
1219
1220
|
VALUE rtext;
|
1220
1221
|
VALUE rpositions = Qnil;
|
1221
1222
|
rtext = rb_str_new2(tv_term->text);
|
1223
|
+
rb_enc_associate(rtext, utf8_encoding);
|
1222
1224
|
if (tv_term->positions) {
|
1223
1225
|
int *positions = tv_term->positions;
|
1224
1226
|
rpositions = rb_ary_new2(freq);
|
@@ -3085,10 +3087,6 @@ static void Init_TermDocEnum(void) {
|
|
3085
3087
|
rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
|
3086
3088
|
}
|
3087
3089
|
|
3088
|
-
/* rdochack
|
3089
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3090
|
-
*/
|
3091
|
-
|
3092
3090
|
/*
|
3093
3091
|
* Document-class: Ferret::Index::TermVector::TVOffsets
|
3094
3092
|
*
|
@@ -3107,9 +3105,6 @@ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
|
3107
3105
|
*/
|
3108
3106
|
static void Init_TVOffsets(void) {
|
3109
3107
|
const char *tv_offsets_class = "TVOffsets";
|
3110
|
-
/* rdochack
|
3111
|
-
cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
|
3112
|
-
*/
|
3113
3108
|
cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
|
3114
3109
|
rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
|
3115
3110
|
rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
|
@@ -3130,13 +3125,8 @@ static void Init_TVOffsets(void) {
|
|
3130
3125
|
* tv_term = tv.find {|tvt| tvt.term = "fox"}
|
3131
3126
|
* offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
|
3132
3127
|
*/
|
3133
|
-
static void
|
3134
|
-
Init_TVTerm(void)
|
3135
|
-
{
|
3128
|
+
static void Init_TVTerm(void) {
|
3136
3129
|
const char *tv_term_class = "TVTerm";
|
3137
|
-
/* rdochack
|
3138
|
-
cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
|
3139
|
-
*/
|
3140
3130
|
cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
|
3141
3131
|
rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
|
3142
3132
|
rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
|
@@ -3172,15 +3162,9 @@ Init_TVTerm(void)
|
|
3172
3162
|
* particular that you need to store both positions and offsets if you want
|
3173
3163
|
* to associate offsets with particular terms.
|
3174
3164
|
*/
|
3175
|
-
static void
|
3176
|
-
Init_TermVector(void)
|
3177
|
-
{
|
3165
|
+
static void Init_TermVector(void) {
|
3178
3166
|
const char *tv_class = "TermVector";
|
3179
|
-
/* rdochack
3180
|
-
cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
|
3181
|
-
*/
|
3182
|
-
cTermVector = rb_struct_define(tv_class,
|
3183
|
-
"field", "terms", "offsets", NULL);
|
3167
|
+
cTermVector = rb_struct_define(tv_class, "field", "terms", "offsets", NULL);
|
3184
3168
|
rb_set_class_path(cTermVector, mIndex, tv_class);
|
3185
3169
|
rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
|
3186
3170
|
|
@@ -14,6 +14,7 @@
|
|
14
14
|
#undef close
|
15
15
|
#undef read
|
16
16
|
|
17
|
+
extern rb_encoding *utf8_encoding;
|
17
18
|
extern void frt_micro_sleep(const int micro_seconds);
|
18
19
|
|
19
20
|
#define GET_LOCK(lock, name, store, err_msg) do {\
|
@@ -1710,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
|
|
1710
1711
|
total_len = delta_start + delta_len;
|
1711
1712
|
frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
|
1712
1713
|
buffer[total_len++] = '\0';
|
1713
|
-
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
|
1714
|
-
buffer, total_len);
|
1714
|
+
term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
|
1715
1715
|
|
1716
1716
|
/* read freq */
|
1717
1717
|
freq = term->freq = frt_is_read_vint(fdt_in);
|
@@ -1822,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
|
|
1822
1822
|
*
|
1823
1823
|
****************************************************************************/
|
1824
1824
|
|
1825
|
-
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
|
1826
|
-
{
|
1825
|
+
FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
|
1827
1826
|
FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
|
1828
1827
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
1829
1828
|
size_t segment_len = strlen(segment);
|
@@ -1844,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
|
|
1844
1843
|
return fw;
|
1845
1844
|
}
|
1846
1845
|
|
1847
|
-
void frt_fw_close(FrtFieldsWriter *fw)
|
1848
|
-
{
|
1846
|
+
void frt_fw_close(FrtFieldsWriter *fw) {
|
1849
1847
|
frt_os_close(fw->fdt_out);
|
1850
1848
|
frt_os_close(fw->fdx_out);
|
1851
1849
|
frt_ram_destroy_buffer(fw->buffer);
|
@@ -2046,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
|
|
2046
2044
|
frt_ramo_write_to(fw->buffer, fdt_out);
|
2047
2045
|
}
|
2048
2046
|
|
2049
|
-
void frt_fw_write_tv_index(FrtFieldsWriter *fw)
|
2050
|
-
{
|
2047
|
+
void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
|
2051
2048
|
int i;
|
2052
2049
|
const int tv_cnt = frt_ary_size(fw->tv_fields);
|
2053
2050
|
FrtOutStream *fdt_out = fw->fdt_out;
|
@@ -5548,9 +5545,24 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
|
|
5548
5545
|
for (i = 0; i < df_size; i++) {
|
5549
5546
|
int len = df->lengths[i];
|
5550
5547
|
char *data_ptr = df->data[i];
|
5551
|
-
if (
|
5552
|
-
len
|
5553
|
-
|
5548
|
+
if (df->encodings[i] == utf8_encoding) {
|
5549
|
+
if (len >= FRT_MAX_WORD_SIZE) {
|
5550
|
+
len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
|
5551
|
+
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5552
|
+
buf[len] = '\0';
|
5553
|
+
}
|
5554
|
+
} else if (df->encodings[i] != utf8_encoding) {
|
5555
|
+
if (len >= FRT_MAX_WORD_SIZE)
|
5556
|
+
len = FRT_MAX_WORD_SIZE - 1;
|
5557
|
+
const unsigned char *sp = (unsigned char *)df->data[i];
|
5558
|
+
unsigned char *dp = (unsigned char *)&buf;
|
5559
|
+
rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
|
5560
|
+
assert(ec != NULL);
|
5561
|
+
rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
|
5562
|
+
rb_econv_close(ec);
|
5563
|
+
len = dp - (unsigned char *)&buf;
|
5564
|
+
buf[len] = '\0';
|
5565
|
+
data_ptr = buf;
|
5554
5566
|
}
|
5555
5567
|
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
|
5556
5568
|
if (store_offsets) {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isomorfeus-ferret
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.1
|
4
|
+
version: 0.13.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Biedermann
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-04-
|
11
|
+
date: 2022-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|