isomorfeus-ferret 0.14.1 → 0.14.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5818fce6d84b9bd4814be3bbed270127e05297dcf85adeebc495c8f334430d88
4
- data.tar.gz: 77c9c3246c7777947084b47620d3aeeeb9eb76d7b0a17a4d30a37a38547a54da
3
+ metadata.gz: 9954cd3c9b84e7689861a8039c8305407b218d683f9cdfc74ee77542ae6f125c
4
+ data.tar.gz: 8bcf947f08f84fa7c73e157caaa254a8e21e4700b0c852004613e8c9c9517716
5
5
  SHA512:
6
- metadata.gz: 59632a0b46b9bd247da0f8b3908654a8027fbcef2aadc897f7681d25b03d4404191d037be323f666ef9bae679c72b135318aa853158e6bf0205b754ec3b2b18f
7
- data.tar.gz: 2a037003347c6bca0900bf80410e83f43d397400f37e22f112e6ef6893a568dba29561b12594f803f3b28baee9f5f1ae67595c244d91b7dffa9d06e4e493c891
6
+ metadata.gz: '083b7943b6d5d1dd59d114d5db4dba0d54f1684a7057ce6358b105ac9564908310bdd2df2539791ed30142428b4ec53ab646d2980eb8549998f2cb586b8c6892'
7
+ data.tar.gz: e5e15c37ffa004850a18eacae06f95535b460522166a1bc4ec611d3a4dae0aaddb7180d06f78d479e397708a167a5372d093ad2173a026d5888a7c2d34b3aef0
@@ -2548,7 +2548,7 @@ static void Init_TVTerm(void) {
2548
2548
  * == Example
2549
2549
  *
2550
2550
  * tv = index_reader.term_vector(doc_id, :content)
2551
- * tv_term = tv.find {|tvt| tvt.term = "fox"}
2551
+ * tv_term = tv.find {|tvt| tvt.term == "fox"}
2552
2552
  *
2553
2553
  * # get the term frequency
2554
2554
  * term_freq = tv_term.positions.size
@@ -66,7 +66,8 @@ static inline int get_cp(char *start, char *end, int *cp_len, rb_encoding *enc)
66
66
 
67
67
  FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, frt_off_t start, frt_off_t end, int pos_inc, rb_encoding *encoding) {
68
68
  if (tlen >= FRT_MAX_WORD_SIZE) {
69
- tlen = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
69
+ char *head_last = rb_enc_left_char_head(text, text + FRT_MAX_WORD_SIZE - 1, text + tlen, encoding);
70
+ tlen = head_last - text;
70
71
  }
71
72
 
72
73
  if (encoding == utf8_encoding) {
@@ -1031,9 +1032,9 @@ static FrtToken *stemf_next(FrtTokenStream *ts) {
1031
1032
  stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
1032
1033
  len = sb_stemmer_length(stemmer);
1033
1034
  if (len >= FRT_MAX_WORD_SIZE) {
1034
- len = FRT_MAX_WORD_SIZE - 1;
1035
+ char *head_last = rb_enc_left_char_head(tk->text, tk->text + FRT_MAX_WORD_SIZE - 1, tk->text + len, utf8_encoding);
1036
+ len = head_last - tk->text;
1035
1037
  }
1036
-
1037
1038
  memcpy(tk->text, stemmed, len);
1038
1039
  tk->text[len] = '\0';
1039
1040
  tk->len = len;
@@ -261,30 +261,30 @@ void frt_register_for_cleanup(void *p, frt_free_ft free_func) {
261
261
  void frt_init(int argc, const char *const argv[]) {
262
262
  atexit(&frt_hash_finalize);
263
263
 
264
- utf8_encoding = rb_enc_find("UTF-8");
264
+ utf8_encoding = rb_utf8_encoding();
265
265
  utf8_mbmaxlen = rb_enc_mbmaxlen(utf8_encoding);
266
266
  char *p = "'";
267
267
  cp_apostrophe = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
268
- p = ".";
269
- cp_dot = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
270
- p = ",";
271
- cp_comma = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
272
- p = "\\";
273
- cp_backslash = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
274
- p = "/";
275
- cp_slash = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
276
- p = "_";
277
- cp_underscore = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
278
- p = "-";
279
- cp_dash = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
280
- p = "\u2010";
281
- cp_hyphen = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
282
- p = "@";
283
- cp_at = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
284
- p = "&";
285
- cp_ampersand = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
286
- p = ":";
287
- cp_colon = rb_enc_mbc_to_codepoint(p, p + 1, utf8_encoding);
268
+ char *q = ".";
269
+ cp_dot = rb_enc_mbc_to_codepoint(q, q + 1, utf8_encoding);
270
+ char *r = ",";
271
+ cp_comma = rb_enc_mbc_to_codepoint(r, r + 1, utf8_encoding);
272
+ char *s = "\\";
273
+ cp_backslash = rb_enc_mbc_to_codepoint(s, s + 1, utf8_encoding);
274
+ char *t = "/";
275
+ cp_slash = rb_enc_mbc_to_codepoint(t, t + 1, utf8_encoding);
276
+ char *u = "_";
277
+ cp_underscore = rb_enc_mbc_to_codepoint(u, u + 1, utf8_encoding);
278
+ char *v = "-";
279
+ cp_dash = rb_enc_mbc_to_codepoint(v, v + 1, utf8_encoding);
280
+ char *w = "\u2010";
281
+ cp_hyphen = rb_enc_mbc_to_codepoint(w, w + 1, utf8_encoding);
282
+ char *x = "@";
283
+ cp_at = rb_enc_mbc_to_codepoint(x, x + 1, utf8_encoding);
284
+ char *y = "&";
285
+ cp_ampersand = rb_enc_mbc_to_codepoint(y, y + 1, utf8_encoding);
286
+ char *z = ":";
287
+ cp_colon = rb_enc_mbc_to_codepoint(z, z + 1, utf8_encoding);
288
288
 
289
289
  FRT_SORT_FIELD_SCORE = frt_sort_field_alloc();
290
290
  FRT_SORT_FIELD_SCORE->field_index_class = NULL; /* field_index_class */
@@ -842,7 +842,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf, void
842
842
  volatile frt_i64 last_gen = -1;
843
843
  volatile frt_i64 gen = 0;
844
844
 
845
- /* Loop until we succeed in calling doBody() without hitting an
845
+ /* Loop until we succeed in calling run() without hitting an
846
846
  * IOException. An IOException most likely means a commit was in process
847
847
  * and has finished, in the time it took us to load the now-old infos
848
848
  * files (and segments files). It's also possible it's a true error
@@ -5047,6 +5047,7 @@ FrtPosting *frt_p_new(FrtMemoryPool *mp, int doc_num, int pos)
5047
5047
  FrtPostingList *frt_pl_new(FrtMemoryPool *mp, const char *term,
5048
5048
  int term_len, FrtPosting *p)
5049
5049
  {
5050
+ // TODO account for term_len as measured in the original text vs utf8 term_len of term
5050
5051
  FrtPostingList *pl = FRT_MP_ALLOC(mp, FrtPostingList);
5051
5052
  pl->term = (char *)frt_mp_memdup(mp, term, term_len + 1);
5052
5053
  pl->term_len = term_len;
@@ -5449,15 +5450,14 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
5449
5450
  for (i = 0; i < df_size; i++) {
5450
5451
  int len = df->lengths[i];
5451
5452
  char *data_ptr = df->data[i];
5453
+ if (len >= FRT_MAX_WORD_SIZE) {
5454
+ char *head_last = rb_enc_left_char_head(data_ptr, data_ptr + FRT_MAX_WORD_SIZE - 1, data_ptr + len, df->encodings[i]);
5455
+ len = head_last - data_ptr;
5456
+ }
5452
5457
  if (df->encodings[i] == utf8_encoding) {
5453
- if (len >= FRT_MAX_WORD_SIZE) {
5454
- len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
5455
- data_ptr = (char *)memcpy(buf, df->data[i], len);
5456
- buf[len] = '\0';
5457
- }
5458
+ data_ptr = (char *)memcpy(buf, df->data[i], len);
5459
+ buf[len] = '\0';
5458
5460
  } else if (df->encodings[i] != utf8_encoding) {
5459
- if (len >= FRT_MAX_WORD_SIZE)
5460
- len = FRT_MAX_WORD_SIZE - 1;
5461
5461
  const unsigned char *sp = (unsigned char *)df->data[i];
5462
5462
  unsigned char *dp = (unsigned char *)&buf;
5463
5463
  rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
@@ -1,5 +1,5 @@
1
1
  module Isomorfeus
2
2
  module Ferret
3
- VERSION = '0.14.1'
3
+ VERSION = '0.14.2'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isomorfeus-ferret
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Biedermann
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-01 00:00:00.000000000 Z
11
+ date: 2022-06-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: oj
@@ -381,13 +381,7 @@ licenses:
381
381
  metadata:
382
382
  github_repo: ssh://github.com/isomorfeus/gems
383
383
  source_code_uri: https://github.com/isomorfeus/isomorfeus-ferret
384
- post_install_message: |2+
385
-
386
- isomorfeus-ferret 0.13:
387
- Breaking change:
388
- To support Ruby string encodings, die index file format has changed.
389
- Indexes created with previous versions < 0.13 must be recreated with 0.13!
390
-
384
+ post_install_message:
391
385
  rdoc_options: []
392
386
  require_paths:
393
387
  - lib
@@ -407,4 +401,3 @@ signing_key:
407
401
  specification_version: 4
408
402
  summary: Indexed document store for Isomorfeus.
409
403
  test_files: []
410
- ...