isomorfeus-ferret 0.13.1 → 0.13.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 98439b4a9e6ca849246c6e2ddd2ce1bbf117182025b3651d4f6a95593aff0eb6
4
- data.tar.gz: 0e1e90c4bfce1014c9983f4bb5be0f3123ef502788f464e988c4671bdd1758ed
3
+ metadata.gz: a1a8509c12f5d180d38944adb53291958a9f8327a8b1706def9fecd7f9c60e73
4
+ data.tar.gz: fbbd38e08dd1992cd93663b1b04cc4b666b6a6dc59104414b75515bc2ec4d54d
5
5
  SHA512:
6
- metadata.gz: da674772ba34175364d0d4d93023ef380eab06334b4f9c3e1956934289a7a2016387607740fb83cc11753a3bb72de62208390933ff9f4cc341524b7fe6c0c6af
7
- data.tar.gz: 8261029020f33cb9fb52453e007defdfd2c1dd029f9da22a02b64ae0047bcd333e62e6d6a8d15263ee71089e203f393f5e865138a31784edf48f66a483a0ffce
6
+ metadata.gz: 7158cdc3f7a0624d35b668b31287d489ee3d389f17a2ab1b58235a7c2be639b6a7bbb3a8c0efee6d168804128fa5c0d7955c00f860b653a76f23d6421fc84c5d
7
+ data.tar.gz: 60c62db42081291a52e66be4b7e1b4c46eb12a458b0838f3d8793ffaeb013aff27a37aca06ab4831157d1dd65330395bfdaa2fa7924a278261f1187295592042
@@ -64,6 +64,7 @@ static ID id_fld_num_map;
64
64
  static ID id_field_num;
65
65
  static ID id_boost;
66
66
 
67
+ extern rb_encoding *utf8_encoding;
67
68
  extern void frb_set_term(VALUE rterm, FrtTerm *t);
68
69
  extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
69
70
  extern VALUE frb_get_analyzer(FrtAnalyzer *a);
@@ -1219,6 +1220,7 @@ static VALUE frb_get_tv_term(FrtTVTerm *tv_term) {
1219
1220
  VALUE rtext;
1220
1221
  VALUE rpositions = Qnil;
1221
1222
  rtext = rb_str_new2(tv_term->text);
1223
+ rb_enc_associate(rtext, utf8_encoding);
1222
1224
  if (tv_term->positions) {
1223
1225
  int *positions = tv_term->positions;
1224
1226
  rpositions = rb_ary_new2(freq);
@@ -3085,10 +3087,6 @@ static void Init_TermDocEnum(void) {
3085
3087
  rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
3086
3088
  }
3087
3089
 
3088
- /* rdochack
3089
- cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3090
- */
3091
-
3092
3090
  /*
3093
3091
  * Document-class: Ferret::Index::TermVector::TVOffsets
3094
3092
  *
@@ -3107,9 +3105,6 @@ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3107
3105
  */
3108
3106
  static void Init_TVOffsets(void) {
3109
3107
  const char *tv_offsets_class = "TVOffsets";
3110
- /* rdochack
3111
- cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
3112
- */
3113
3108
  cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
3114
3109
  rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
3115
3110
  rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
@@ -3130,13 +3125,8 @@ static void Init_TVOffsets(void) {
3130
3125
  * tv_term = tv.find {|tvt| tvt.term = "fox"}
3131
3126
  * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3132
3127
  */
3133
- static void
3134
- Init_TVTerm(void)
3135
- {
3128
+ static void Init_TVTerm(void) {
3136
3129
  const char *tv_term_class = "TVTerm";
3137
- /* rdochack
3138
- cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
3139
- */
3140
3130
  cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
3141
3131
  rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
3142
3132
  rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
@@ -3172,15 +3162,9 @@ Init_TVTerm(void)
3172
3162
  * particular that you need to store both positions and offsets if you want
3173
3163
  * to associate offsets with particular terms.
3174
3164
  */
3175
- static void
3176
- Init_TermVector(void)
3177
- {
3165
+ static void Init_TermVector(void) {
3178
3166
  const char *tv_class = "TermVector";
3179
- /* rdochack
3180
- cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3181
- */
3182
- cTermVector = rb_struct_define(tv_class,
3183
- "field", "terms", "offsets", NULL);
3167
+ cTermVector = rb_struct_define(tv_class, "field", "terms", "offsets", NULL);
3184
3168
  rb_set_class_path(cTermVector, mIndex, tv_class);
3185
3169
  rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
3186
3170
 
@@ -14,6 +14,7 @@
14
14
  #undef close
15
15
  #undef read
16
16
 
17
+ extern rb_encoding *utf8_encoding;
17
18
  extern void frt_micro_sleep(const int micro_seconds);
18
19
 
19
20
  #define GET_LOCK(lock, name, store, err_msg) do {\
@@ -1710,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
1710
1711
  total_len = delta_start + delta_len;
1711
1712
  frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
1712
1713
  buffer[total_len++] = '\0';
1713
- term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
1714
- buffer, total_len);
1714
+ term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
1715
1715
 
1716
1716
  /* read freq */
1717
1717
  freq = term->freq = frt_is_read_vint(fdt_in);
@@ -1822,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
1822
1822
  *
1823
1823
  ****************************************************************************/
1824
1824
 
1825
- FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
1826
- {
1825
+ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
1827
1826
  FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
1828
1827
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
1829
1828
  size_t segment_len = strlen(segment);
@@ -1844,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
1844
1843
  return fw;
1845
1844
  }
1846
1845
 
1847
- void frt_fw_close(FrtFieldsWriter *fw)
1848
- {
1846
+ void frt_fw_close(FrtFieldsWriter *fw) {
1849
1847
  frt_os_close(fw->fdt_out);
1850
1848
  frt_os_close(fw->fdx_out);
1851
1849
  frt_ram_destroy_buffer(fw->buffer);
@@ -2046,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
2046
2044
  frt_ramo_write_to(fw->buffer, fdt_out);
2047
2045
  }
2048
2046
 
2049
- void frt_fw_write_tv_index(FrtFieldsWriter *fw)
2050
- {
2047
+ void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
2051
2048
  int i;
2052
2049
  const int tv_cnt = frt_ary_size(fw->tv_fields);
2053
2050
  FrtOutStream *fdt_out = fw->fdt_out;
@@ -5548,9 +5545,24 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDoc
5548
5545
  for (i = 0; i < df_size; i++) {
5549
5546
  int len = df->lengths[i];
5550
5547
  char *data_ptr = df->data[i];
5551
- if (len > FRT_MAX_WORD_SIZE) {
5552
- len = FRT_MAX_WORD_SIZE - 1;
5553
- data_ptr = (char *)memcpy(buf, df->data[i], len);
5548
+ if (df->encodings[i] == utf8_encoding) {
5549
+ if (len >= FRT_MAX_WORD_SIZE) {
5550
+ len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
5551
+ data_ptr = (char *)memcpy(buf, df->data[i], len);
5552
+ buf[len] = '\0';
5553
+ }
5554
+ } else if (df->encodings[i] != utf8_encoding) {
5555
+ if (len >= FRT_MAX_WORD_SIZE)
5556
+ len = FRT_MAX_WORD_SIZE - 1;
5557
+ const unsigned char *sp = (unsigned char *)df->data[i];
5558
+ unsigned char *dp = (unsigned char *)&buf;
5559
+ rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
5560
+ assert(ec != NULL);
5561
+ rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
5562
+ rb_econv_close(ec);
5563
+ len = dp - (unsigned char *)&buf;
5564
+ buf[len] = '\0';
5565
+ data_ptr = buf;
5554
5566
  }
5555
5567
  dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
5556
5568
  if (store_offsets) {
@@ -1,5 +1,5 @@
1
1
  module Isomorfeus
2
2
  module Ferret
3
- VERSION = '0.13.1'
3
+ VERSION = '0.13.2'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isomorfeus-ferret
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.1
4
+ version: 0.13.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Biedermann
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-04-16 00:00:00.000000000 Z
11
+ date: 2022-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake