isomorfeus-ferret 0.14.1 → 0.14.2
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9954cd3c9b84e7689861a8039c8305407b218d683f9cdfc74ee77542ae6f125c
|
4
|
+
data.tar.gz: 8bcf947f08f84fa7c73e157caaa254a8e21e4700b0c852004613e8c9c9517716
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '083b7943b6d5d1dd59d114d5db4dba0d54f1684a7057ce6358b105ac9564908310bdd2df2539791ed30142428b4ec53ab646d2980eb8549998f2cb586b8c6892'
|
7
|
+
data.tar.gz: e5e15c37ffa004850a18eacae06f95535b460522166a1bc4ec611d3a4dae0aaddb7180d06f78d479e397708a167a5372d093ad2173a026d5888a7c2d34b3aef0
|
@@ -2548,7 +2548,7 @@ static void Init_TVTerm(void) {
|
|
2548
2548
|
* == Example
|
2549
2549
|
*
|
2550
2550
|
* tv = index_reader.term_vector(doc_id, :content)
|
2551
|
-
* tv_term = tv.find {|tvt| tvt.term
|
2551
|
+
* tv_term = tv.find {|tvt| tvt.term == "fox"}
|
2552
2552
|
*
|
2553
2553
|
* # get the term frequency
|
2554
2554
|
* term_freq = tv_term.positions.size
|
@@ -66,7 +66,8 @@ static inline int get_cp(char *start, char *end, int *cp_len, rb_encoding *enc)
|
|
66
66
|
|
67
67
|
FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, frt_off_t start, frt_off_t end, int pos_inc, rb_encoding *encoding) {
|
68
68
|
if (tlen >= FRT_MAX_WORD_SIZE) {
|
69
|
-
|
69
|
+
char *head_last = rb_enc_left_char_head(text, text + FRT_MAX_WORD_SIZE - 1, text + tlen, encoding);
|
70
|
+
tlen = head_last - text;
|
70
71
|
}
|
71
72
|
|
72
73
|
if (encoding == utf8_encoding) {
|
@@ -1031,9 +1032,9 @@ static FrtToken *stemf_next(FrtTokenStream *ts) {
|
|
1031
1032
|
stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
|
1032
1033
|
len = sb_stemmer_length(stemmer);
|
1033
1034
|
if (len >= FRT_MAX_WORD_SIZE) {
|
1034
|
-
|
1035
|
+
char *head_last = rb_enc_left_char_head(tk->text, tk->text + FRT_MAX_WORD_SIZE - 1, tk->text + len, utf8_encoding);
|
1036
|
+
len = head_last - tk->text;
|
1035
1037
|
}
|
1036
|
-
|
1037
1038
|
memcpy(tk->text, stemmed, len);
|
1038
1039
|
tk->text[len] = '\0';
|
1039
1040
|
tk->len = len;
|
@@ -261,30 +261,30 @@ void frt_register_for_cleanup(void *p, frt_free_ft free_func) {
|
|
261
261
|
void frt_init(int argc, const char *const argv[]) {
|
262
262
|
atexit(&frt_hash_finalize);
|
263
263
|
|
264
|
-
utf8_encoding =
|
264
|
+
utf8_encoding = rb_utf8_encoding();
|
265
265
|
utf8_mbmaxlen = rb_enc_mbmaxlen(utf8_encoding);
|
266
266
|
char *p = "'";
|
267
267
|
cp_apostrophe = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
|
268
|
-
|
269
|
-
cp_dot = rb_enc_mbc_to_codepoint(
|
270
|
-
|
271
|
-
cp_comma = rb_enc_mbc_to_codepoint(
|
272
|
-
|
273
|
-
cp_backslash = rb_enc_mbc_to_codepoint(
|
274
|
-
|
275
|
-
cp_slash = rb_enc_mbc_to_codepoint(
|
276
|
-
|
277
|
-
cp_underscore = rb_enc_mbc_to_codepoint(
|
278
|
-
|
279
|
-
cp_dash = rb_enc_mbc_to_codepoint(
|
280
|
-
|
281
|
-
cp_hyphen = rb_enc_mbc_to_codepoint(
|
282
|
-
|
283
|
-
cp_at = rb_enc_mbc_to_codepoint(
|
284
|
-
|
285
|
-
cp_ampersand = rb_enc_mbc_to_codepoint(
|
286
|
-
|
287
|
-
cp_colon = rb_enc_mbc_to_codepoint(
|
268
|
+
char *q = ".";
|
269
|
+
cp_dot = rb_enc_mbc_to_codepoint(q, q + 1, utf8_encoding);
|
270
|
+
char *r = ",";
|
271
|
+
cp_comma = rb_enc_mbc_to_codepoint(r, r + 1, utf8_encoding);
|
272
|
+
char *s = "\\";
|
273
|
+
cp_backslash = rb_enc_mbc_to_codepoint(s, s + 1, utf8_encoding);
|
274
|
+
char *t = "/";
|
275
|
+
cp_slash = rb_enc_mbc_to_codepoint(t, t + 1, utf8_encoding);
|
276
|
+
char *u = "_";
|
277
|
+
cp_underscore = rb_enc_mbc_to_codepoint(u, u + 1, utf8_encoding);
|
278
|
+
char *v = "-";
|
279
|
+
cp_dash = rb_enc_mbc_to_codepoint(v, v + 1, utf8_encoding);
|
280
|
+
char *w = "\u2010";
|
281
|
+
cp_hyphen = rb_enc_mbc_to_codepoint(w, w + 1, utf8_encoding);
|
282
|
+
char *x = "@";
|
283
|
+
cp_at = rb_enc_mbc_to_codepoint(x, x + 1, utf8_encoding);
|
284
|
+
char *y = "&";
|
285
|
+
cp_ampersand = rb_enc_mbc_to_codepoint(y, y + 1, utf8_encoding);
|
286
|
+
char *z = ":";
|
287
|
+
cp_colon = rb_enc_mbc_to_codepoint(z, z + 1, utf8_encoding);
|
288
288
|
|
289
289
|
FRT_SORT_FIELD_SCORE = frt_sort_field_alloc();
|
290
290
|
FRT_SORT_FIELD_SCORE->field_index_class = NULL; /* field_index_class */
|
@@ -842,7 +842,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf, void
|
|
842
842
|
volatile frt_i64 last_gen = -1;
|
843
843
|
volatile frt_i64 gen = 0;
|
844
844
|
|
845
|
-
/* Loop until we succeed in calling
|
845
|
+
/* Loop until we succeed in calling run() without hitting an
|
846
846
|
* IOException. An IOException most likely means a commit was in process
|
847
847
|
* and has finished, in the time it took us to load the now-old infos
|
848
848
|
* files (and segments files). It's also possible it's a true error
|
@@ -5047,6 +5047,7 @@ FrtPosting *frt_p_new(FrtMemoryPool *mp, int doc_num, int pos)
|
|
5047
5047
|
FrtPostingList *frt_pl_new(FrtMemoryPool *mp, const char *term,
|
5048
5048
|
int term_len, FrtPosting *p)
|
5049
5049
|
{
|
5050
|
+
// TODO account for term_len as measured in the original text vs utf8 term_len of term
|
5050
5051
|
FrtPostingList *pl = FRT_MP_ALLOC(mp, FrtPostingList);
|
5051
5052
|
pl->term = (char *)frt_mp_memdup(mp, term, term_len + 1);
|
5052
5053
|
pl->term_len = term_len;
|
@@ -5449,15 +5450,14 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
|
|
5449
5450
|
for (i = 0; i < df_size; i++) {
|
5450
5451
|
int len = df->lengths[i];
|
5451
5452
|
char *data_ptr = df->data[i];
|
5453
|
+
if (len >= FRT_MAX_WORD_SIZE) {
|
5454
|
+
char *head_last = rb_enc_left_char_head(data_ptr, data_ptr + FRT_MAX_WORD_SIZE - 1, data_ptr + len, df->encodings[i]);
|
5455
|
+
len = head_last - data_ptr;
|
5456
|
+
}
|
5452
5457
|
if (df->encodings[i] == utf8_encoding) {
|
5453
|
-
|
5454
|
-
|
5455
|
-
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5456
|
-
buf[len] = '\0';
|
5457
|
-
}
|
5458
|
+
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5459
|
+
buf[len] = '\0';
|
5458
5460
|
} else if (df->encodings[i] != utf8_encoding) {
|
5459
|
-
if (len >= FRT_MAX_WORD_SIZE)
|
5460
|
-
len = FRT_MAX_WORD_SIZE - 1;
|
5461
5461
|
const unsigned char *sp = (unsigned char *)df->data[i];
|
5462
5462
|
unsigned char *dp = (unsigned char *)&buf;
|
5463
5463
|
rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isomorfeus-ferret
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.
|
4
|
+
version: 0.14.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Biedermann
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-06-
|
11
|
+
date: 2022-06-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: oj
|
@@ -381,13 +381,7 @@ licenses:
|
|
381
381
|
metadata:
|
382
382
|
github_repo: ssh://github.com/isomorfeus/gems
|
383
383
|
source_code_uri: https://github.com/isomorfeus/isomorfeus-ferret
|
384
|
-
post_install_message:
|
385
|
-
|
386
|
-
isomorfeus-ferret 0.13:
|
387
|
-
Breaking change:
|
388
|
-
To support Ruby string encodings, die index file format has changed.
|
389
|
-
Indexes created with previous versions < 0.13 must be recreated with 0.13!
|
390
|
-
|
384
|
+
post_install_message:
|
391
385
|
rdoc_options: []
|
392
386
|
require_paths:
|
393
387
|
- lib
|
@@ -407,4 +401,3 @@ signing_key:
|
|
407
401
|
specification_version: 4
|
408
402
|
summary: Indexed document store for Isomorfeus.
|
409
403
|
test_files: []
|
410
|
-
...
|