isomorfeus-ferret 0.13.1 → 0.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 98439b4a9e6ca849246c6e2ddd2ce1bbf117182025b3651d4f6a95593aff0eb6
4
- data.tar.gz: 0e1e90c4bfce1014c9983f4bb5be0f3123ef502788f464e988c4671bdd1758ed
3
+ metadata.gz: a1a8509c12f5d180d38944adb53291958a9f8327a8b1706def9fecd7f9c60e73
4
+ data.tar.gz: fbbd38e08dd1992cd93663b1b04cc4b666b6a6dc59104414b75515bc2ec4d54d
5
5
  SHA512:
6
- metadata.gz: da674772ba34175364d0d4d93023ef380eab06334b4f9c3e1956934289a7a2016387607740fb83cc11753a3bb72de62208390933ff9f4cc341524b7fe6c0c6af
7
- data.tar.gz: 8261029020f33cb9fb52453e007defdfd2c1dd029f9da22a02b64ae0047bcd333e62e6d6a8d15263ee71089e203f393f5e865138a31784edf48f66a483a0ffce
6
+ metadata.gz: 7158cdc3f7a0624d35b668b31287d489ee3d389f17a2ab1b58235a7c2be639b6a7bbb3a8c0efee6d168804128fa5c0d7955c00f860b653a76f23d6421fc84c5d
7
+ data.tar.gz: 60c62db42081291a52e66be4b7e1b4c46eb12a458b0838f3d8793ffaeb013aff27a37aca06ab4831157d1dd65330395bfdaa2fa7924a278261f1187295592042
@@ -64,6 +64,7 @@ static ID id_fld_num_map;
64
64
  static ID id_field_num;
65
65
  static ID id_boost;
66
66
 
67
+ extern rb_encoding *utf8_encoding;
67
68
  extern void frb_set_term(VALUE rterm, FrtTerm *t);
68
69
  extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
69
70
  extern VALUE frb_get_analyzer(FrtAnalyzer *a);
@@ -1219,6 +1220,7 @@ static VALUE frb_get_tv_term(FrtTVTerm *tv_term) {
1219
1220
  VALUE rtext;
1220
1221
  VALUE rpositions = Qnil;
1221
1222
  rtext = rb_str_new2(tv_term->text);
1223
+ rb_enc_associate(rtext, utf8_encoding);
1222
1224
  if (tv_term->positions) {
1223
1225
  int *positions = tv_term->positions;
1224
1226
  rpositions = rb_ary_new2(freq);
@@ -3085,10 +3087,6 @@ static void Init_TermDocEnum(void) {
3085
3087
  rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
3086
3088
  }
3087
3089
 
3088
- /* rdochack
3089
- cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3090
- */
3091
-
3092
3090
  /*
3093
3091
  * Document-class: Ferret::Index::TermVector::TVOffsets
3094
3092
  *
@@ -3107,9 +3105,6 @@ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3107
3105
  */
3108
3106
  static void Init_TVOffsets(void) {
3109
3107
  const char *tv_offsets_class = "TVOffsets";
3110
- /* rdochack
3111
- cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
3112
- */
3113
3108
  cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
3114
3109
  rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
3115
3110
  rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
@@ -3130,13 +3125,8 @@ static void Init_TVOffsets(void) {
3130
3125
  * tv_term = tv.find {|tvt| tvt.term = "fox"}
3131
3126
  * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3132
3127
  */
3133
- static void
3134
- Init_TVTerm(void)
3135
- {
3128
+ static void Init_TVTerm(void) {
3136
3129
  const char *tv_term_class = "TVTerm";
3137
- /* rdochack
3138
- cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
3139
- */
3140
3130
  cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
3141
3131
  rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
3142
3132
  rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
@@ -3172,15 +3162,9 @@ Init_TVTerm(void)
3172
3162
  * particular that you need to store both positions and offsets if you want
3173
3163
  * to associate offsets with particular terms.
3174
3164
  */
3175
- static void
3176
- Init_TermVector(void)
3177
- {
3165
+ static void Init_TermVector(void) {
3178
3166
  const char *tv_class = "TermVector";
3179
- /* rdochack
3180
- cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3181
- */
3182
- cTermVector = rb_struct_define(tv_class,
3183
- "field", "terms", "offsets", NULL);
3167
+ cTermVector = rb_struct_define(tv_class, "field", "terms", "offsets", NULL);
3184
3168
  rb_set_class_path(cTermVector, mIndex, tv_class);
3185
3169
  rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
3186
3170
 
@@ -14,6 +14,7 @@
14
14
  #undef close
15
15
  #undef read
16
16
 
17
+ extern rb_encoding *utf8_encoding;
17
18
  extern void frt_micro_sleep(const int micro_seconds);
18
19
 
19
20
  #define GET_LOCK(lock, name, store, err_msg) do {\
@@ -1710,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
1710
1711
  total_len = delta_start + delta_len;
1711
1712
  frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
1712
1713
  buffer[total_len++] = '\0';
1713
- term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
1714
- buffer, total_len);
1714
+ term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
1715
1715
 
1716
1716
  /* read freq */
1717
1717
  freq = term->freq = frt_is_read_vint(fdt_in);
@@ -1822,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
1822
1822
  *
1823
1823
  ****************************************************************************/
1824
1824
 
1825
- FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
1826
- {
1825
+ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
1827
1826
  FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
1828
1827
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
1829
1828
  size_t segment_len = strlen(segment);
@@ -1844,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
1844
1843
  return fw;
1845
1844
  }
1846
1845
 
1847
- void frt_fw_close(FrtFieldsWriter *fw)
1848
- {
1846
+ void frt_fw_close(FrtFieldsWriter *fw) {
1849
1847
  frt_os_close(fw->fdt_out);
1850
1848
  frt_os_close(fw->fdx_out);
1851
1849
  frt_ram_destroy_buffer(fw->buffer);
@@ -2046,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
2046
2044
  frt_ramo_write_to(fw->buffer, fdt_out);
2047
2045
  }
2048
2046
 
2049
- void frt_fw_write_tv_index(FrtFieldsWriter *fw)
2050
- {
2047
+ void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
2051
2048
  int i;
2052
2049
  const int tv_cnt = frt_ary_size(fw->tv_fields);
2053
2050
  FrtOutStream *fdt_out = fw->fdt_out;
@@ -5548,9 +5545,24 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
5548
5545
  for (i = 0; i < df_size; i++) {
5549
5546
  int len = df->lengths[i];
5550
5547
  char *data_ptr = df->data[i];
5551
- if (len > FRT_MAX_WORD_SIZE) {
5552
- len = FRT_MAX_WORD_SIZE - 1;
5553
- data_ptr = (char *)memcpy(buf, df->data[i], len);
5548
+ if (df->encodings[i] == utf8_encoding) {
5549
+ if (len >= FRT_MAX_WORD_SIZE) {
5550
+ len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
5551
+ data_ptr = (char *)memcpy(buf, df->data[i], len);
5552
+ buf[len] = '\0';
5553
+ }
5554
+ } else if (df->encodings[i] != utf8_encoding) {
5555
+ if (len >= FRT_MAX_WORD_SIZE)
5556
+ len = FRT_MAX_WORD_SIZE - 1;
5557
+ const unsigned char *sp = (unsigned char *)df->data[i];
5558
+ unsigned char *dp = (unsigned char *)&buf;
5559
+ rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
5560
+ assert(ec != NULL);
5561
+ rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
5562
+ rb_econv_close(ec);
5563
+ len = dp - (unsigned char *)&buf;
5564
+ buf[len] = '\0';
5565
+ data_ptr = buf;
5554
5566
  }
5555
5567
  dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
5556
5568
  if (store_offsets) {
@@ -1,5 +1,5 @@
1
1
  module Isomorfeus
2
2
  module Ferret
3
- VERSION = '0.13.1'
3
+ VERSION = '0.13.2'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isomorfeus-ferret
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.1
4
+ version: 0.13.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Biedermann
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-04-16 00:00:00.000000000 Z
11
+ date: 2022-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake