ferret 0.11.3 → 0.11.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/q_span.c CHANGED
@@ -2340,7 +2340,7 @@ static Query *spanprq_rewrite(Query *self, IndexReader *ir)
 {
     const char *field = SpQ(self)->field;
     const int field_num = fis_get_field_num(ir->fis, field);
-    Query *volatile q = spanmtq_new_conf(field, SPAN_PREFIX_QUERY_MAX_TERMS);
+    Query *volatile q = spanmtq_new_conf(field, SpPfxQ(self)->max_terms);
     q->boost = self->boost;        /* set the boost */
 
     if (field_num >= 0) {
@@ -2388,6 +2388,7 @@ Query *spanprq_new(const char *field, const char *prefix)
 
     SpQ(self)->field = estrdup(field);
     SpPfxQ(self)->prefix = estrdup(prefix);
+    SpPfxQ(self)->max_terms = SPAN_PREFIX_QUERY_MAX_TERMS;
 
     self->type = SPAN_PREFIX_QUERY;
     self->rewrite = &spanprq_rewrite;
data/ext/r_analysis.c CHANGED
@@ -560,7 +560,6 @@ static TokenStream *
 cwrts_reset(TokenStream *ts, char *text)
 {
     ts->t = ts->text = text;
-    Xj
     rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
     return ts;
 }
@@ -820,7 +819,9 @@ static VALUE
 frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
 {
     TS_ARGS(false);
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
 }
 
@@ -849,7 +850,9 @@ static VALUE
 frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
 {
     TS_ARGS(false);
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
 }
 
@@ -877,7 +880,9 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
 static VALUE
 frt_standard_tokenizer_init(VALUE self, VALUE rstr)
 {
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
 }
 
@@ -917,7 +922,9 @@ static VALUE
 frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
 {
     TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     ts = mb_lowercase_filter_new(ts);
     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
 
@@ -1257,7 +1264,9 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
 {
     Analyzer *a;
     GET_LOWER(false);
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     a = mb_whitespace_analyzer_new(lower);
     Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
     object_add(a, self);
@@ -1300,7 +1309,9 @@ frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
 {
     Analyzer *a;
     GET_LOWER(true);
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     a = mb_letter_analyzer_new(lower);
     Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
     object_add(a, self);
@@ -1372,7 +1383,9 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
     bool lower;
     VALUE rlower, rstop_words;
     Analyzer *a;
+#ifndef POSH_OS_WIN32
     if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
+#endif
     rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
     lower = ((rlower == Qnil) ? true : RTEST(rlower));
     if (rstop_words != Qnil) {
data/ext/r_index.c CHANGED
@@ -196,6 +196,19 @@ frt_fi_init(int argc, VALUE *argv, VALUE self)
     return self;
 }
 
+/*
+ *  call-seq:
+ *     fi.name -> symbol
+ *
+ *  Return the name of the field
+ */
+static VALUE
+frt_fi_name(VALUE self)
+{
+    FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
+    return ID2SYM(rb_intern(fi->name));
+}
+
 /*
  *  call-seq:
  *     fi.stored? -> bool
@@ -800,6 +813,86 @@ frt_te_set_field(VALUE self, VALUE rfield)
     return self;
 }
 
+/*
+ *  call-seq:
+ *     term_enum.to_json() -> string
+ *
+ *  Returns a JSON representation of the term enum. You can speed this up by
+ *  having the method return arrays instead of objects, simply by passing an
+ *  argument to the to_json method. For example;
+ *
+ *    term_enum.to_json() #=>
+ *    # [
+ *    #   {"term":"apple","frequency":12},
+ *    #   {"term":"banana","frequency":2},
+ *    #   {"term":"cantaloupe","frequency":12}
+ *    # ]
+ *
+ *    term_enum.to_json(:fast) #=>
+ *    # [
+ *    #   ["apple",12],
+ *    #   ["banana",2],
+ *    #   ["cantaloupe",12]
+ *    # ]
+ */
+static VALUE
+frt_te_to_json(int argc, VALUE *argv, VALUE self)
+{
+    TermEnum *te = (TermEnum *)DATA_PTR(self);
+    VALUE rjson;
+    char *json, *jp;
+    char *term;
+    int capa = 65536;
+    jp = json = ALLOC_N(char, capa);
+    *(jp++) = '[';
+
+    if (argc > 0) {
+        while (NULL != (term = te->next(te))) {
+            /* enough room for for term after converting " to '"' and frequency
+             * plus some extra for good measure */
+            *(jp++) = '[';
+            if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
+                capa <<= 1;
+                REALLOC_N(json, char, capa);
+            }
+            jp = json_concat_string(jp, term);
+            *(jp++) = ',';
+            sprintf(jp, "%d", te->curr_ti.doc_freq);
+            jp += strlen(jp);
+            *(jp++) = ']';
+            *(jp++) = ',';
+        }
+    }
+    else {
+        while (NULL != (term = te->next(te))) {
+            /* enough room for for term after converting " to '"' and frequency
+             * plus some extra for good measure */
+            if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
+                capa <<= 1;
+                REALLOC_N(json, char, capa);
+            }
+            *(jp++) = '{';
+            memcpy(jp, "\"term\":", 7);
+            jp += 7;
+            jp = json_concat_string(jp, term);
+            *(jp++) = ',';
+            memcpy(jp, "\"frequency\":", 12);
+            jp += 12;
+            sprintf(jp, "%d", te->curr_ti.doc_freq);
+            jp += strlen(jp);
+            *(jp++) = '}';
+            *(jp++) = ',';
+        }
+    }
+    if (*(jp-1) == ',') jp--;
+    *(jp++) = ']';
+    *jp = '\0';
+
+    rjson = rb_str_new2(json);
+    free(json);
+    return rjson;
+}
+
 /****************************************************************************
  *
  * TermDocEnum Methods
@@ -960,6 +1053,89 @@ frt_tde_each(VALUE self)
     return INT2FIX(doc_cnt);
 }
 
+/*
+ *  call-seq:
+ *     term_doc_enum.to_json() -> string
+ *
+ *  Returns a JSON representation of the term doc enum. It will also add the
+ *  term positions if they are available. You can speed this up by having the
+ *  method return arrays instead of objects, simply by passing an argument to
+ *  the to_json method. For example;
+ *
+ *    term_doc_enum.to_json() #=>
+ *    # [
+ *    #   {"document":1,"frequency":12},
+ *    #   {"document":11,"frequency":1},
+ *    #   {"document":29,"frequency":120},
+ *    #   {"document":30,"frequency":3}
+ *    # ]
+ *
+ *    term_doc_enum.to_json(:fast) #=>
+ *    # [
+ *    #   [1,12],
+ *    #   [11,1],
+ *    #   [29,120],
+ *    #   [30,3]
+ *    # ]
+ */
+static VALUE
+frt_tde_to_json(int argc, VALUE *argv, VALUE self)
+{
+    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
+    VALUE rjson;
+    char *json, *jp;
+    int capa = 65536;
+    char *format;
+    char close = (argc > 0) ? ']' : '}';
+    bool do_positions = tde->next_position != NULL;
+    jp = json = ALLOC_N(char, capa);
+    *(jp++) = '[';
+
+    if (do_positions) {
+        if (argc == 0) {
+            format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
+        }
+        else {
+            format = "[%d,%d,[";
+        }
+    }
+    else {
+        if (argc == 0) {
+            format = "{\"document\":%d,\"frequency\":%d},";
+        }
+        else {
+            format = "[%d,%d],";
+        }
+    }
+    while (tde->next(tde)) {
+        /* 100 chars should be enough room for an extra entry */
+        if ((jp - json) + 100 + tde->freq(tde) * 20 > capa) {
+            capa <<= 1;
+            REALLOC_N(json, char, capa);
+        }
+        sprintf(jp, format, tde->doc_num(tde), tde->freq(tde));
+        jp += strlen(jp);
+        if (do_positions) {
+            int pos;
+            while (0 <= (pos = tde->next_position(tde))) {
+                sprintf(jp, "%d,", pos);
+                jp += strlen(jp);
+            }
+            if (*(jp - 1) == ',') jp--;
+            *(jp++) = ']';
+            *(jp++) = close;
+            *(jp++) = ',';
+        }
+    }
+    if (*(jp - 1) == ',') jp--;
+    *(jp++) = ']';
+    *jp = '\0';
+
+    rjson = rb_str_new2(json);
+    free(json);
+    return rjson;
+}
+
 /*
  *  call-seq:
  *     term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
@@ -2678,6 +2854,7 @@ Init_FieldInfo(void)
     rb_define_alloc_func(cFieldInfo, frt_data_alloc);
 
     rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
+    rb_define_method(cFieldInfo, "name", frt_fi_name, 0);
     rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
     rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
     rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
@@ -2793,6 +2970,7 @@ Init_TermEnum(void)
     rb_define_method(cTermEnum, "each", frt_te_each, 0);
     rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
     rb_define_method(cTermEnum, "set_field", frt_te_set_field, 1);
+    rb_define_method(cTermEnum, "to_json", frt_te_to_json, -1);
 }
 
 /*
@@ -2844,6 +3022,7 @@ Init_TermDocEnum(void)
     rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
    rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
    rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
+   rb_define_method(cTermDocEnum, "to_json", frt_tde_to_json, -1);
 }
 
 /* rdochack
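Taken together, the r_index.c changes expose three new methods to Ruby: FieldInfo#name, TermEnum#to_json and TermDocEnum#to_json (the to_json methods accept an optional argument such as :fast to emit compact arrays instead of objects, as documented in the call-seqs above). A rough usage sketch, assuming an existing index and the usual Ferret::Index::IndexReader accessors (terms and term_docs_for) that return these wrapped enums:

    require 'ferret'

    reader = Ferret::Index::IndexReader.new("/path/to/index")

    # term/doc_freq pairs for a field, as JSON objects or compact arrays
    terms_json = reader.terms(:content).to_json
    fast_json  = reader.terms(:content).to_json(:fast)

    # document numbers, frequencies (and positions, when stored) for one term
    postings   = reader.term_docs_for(:content, "ferret").to_json(:fast)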
data/ext/r_search.c CHANGED
@@ -224,32 +224,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
     return rstr;
 }
 
-/*
- * Json Exportation - Loading each LazyDoc and formatting them into json
- * This code is designed to get a VERY FAST json string, the goal was speed,
- * not sexyness.
- * Jeremie 'ahFeel' BORDIER
- * ahFeel@rift.Fr
- */
 __inline char *
-json_concat_string(char *s, char *field)
-{
-    *(s++) = '"';
-    while (*field) {
-        if (*field == '\"') {
-            *(s++) = '\'';
-            *(s++) = *(field++);
-            *(s++) = '\'';
-        }
-        else {
-            *(s++) = *(field++);
-        }
-    }
-    *(s++) = '"';
-    return s;
-}
-
-inline char *
 frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
 {
     int i, j;
@@ -260,7 +235,7 @@ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
     for (i = 0; i < lzd->size; i++) {
         f = lzd->fields[i];
         /* 3 times length of field to make space for quoted quotes ('"') and
-         * 4 x field length to make space for '"' around fields and ','
+         * 4 times field elements to make space for '"' around fields and ','
         * between fields. Add 100 for '[', ']' and good safety.
         */
        len += strlen(f->name) + f->len * 3 + 100 + 4 * f->size;
@@ -1632,15 +1607,22 @@ frt_spanmtq_init(VALUE self, VALUE rfield, VALUE rterms)
 
 /*
  *  call-seq:
- *     SpanPrefixQuery.new(field, prefix) -> query
+ *     SpanPrefixQuery.new(field, prefix, max_terms = 256) -> query
 *
 *  Create a new SpanPrefixQuery which matches all documents with the prefix
 *  +prefix+ in the field +field+.
 */
static VALUE
-frt_spanprq_init(VALUE self, VALUE rfield, VALUE rprefix)
+frt_spanprq_init(int argc, VALUE *argv, VALUE self)
{
-    Query *q = spanprq_new(frt_field(rfield), StringValuePtr(rprefix));
+    VALUE rfield, rprefix, rmax_terms;
+    int max_terms = SPAN_PREFIX_QUERY_MAX_TERMS;
+    Query *q;
+    if (rb_scan_args(argc, argv, "21", &rfield, &rprefix, &rmax_terms) == 3) {
+        max_terms = FIX2INT(rmax_terms);
+    }
+    q = spanprq_new(frt_field(rfield), StringValuePtr(rprefix));
+    ((SpanPrefixQuery *)q)->max_terms = max_terms;
     Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
     object_add(q, self);
     return self;
@@ -3556,7 +3538,7 @@ Init_SpanPrefixQuery(void)
     cSpanPrefixQuery = rb_define_class_under(mSpans, "SpanPrefixQuery", cQuery);
     rb_define_alloc_func(cSpanPrefixQuery, frt_data_alloc);
 
-    rb_define_method(cSpanPrefixQuery, "initialize", frt_spanprq_init, 2);
+    rb_define_method(cSpanPrefixQuery, "initialize", frt_spanprq_init, -1);
 }
 
 /*
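With frt_spanprq_init now registered with -1 arity, the SpanPrefixQuery constructor accepts an optional third argument capping how many terms the prefix may expand to; the default stays at SPAN_PREFIX_QUERY_MAX_TERMS (256, per the updated call-seq above). A brief sketch of the Ruby-side call, assuming the usual Ferret::Search::Spans namespace:

    include Ferret::Search::Spans

    # match spans of terms starting with "dat" in the :content field,
    # allowing the prefix to expand to at most 1000 terms instead of 256
    query = SpanPrefixQuery.new(:content, "dat", 1000)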
data/ext/search.c CHANGED
@@ -1041,6 +1041,7 @@ static TopDocs *isea_search_w(Searcher *self,
 
     scorer = weight->scorer(weight, ISEA(self)->ir);
     if (!scorer || 0 == ISEA(self)->ir->num_docs(ISEA(self)->ir)) {
+        if (scorer) scorer->destroy(scorer);
         return td_new(0, 0, NULL, 0.0);
     }
 
data/ext/search.h CHANGED
@@ -285,6 +285,7 @@ typedef struct BooleanQuery
 } BooleanQuery;
 
 extern Query *bq_new(bool coord_disabled);
+extern Query *bq_new_max(bool coord_disabled, int max);
 extern BooleanClause *bq_add_query(Query *self, Query *sub_query,
                                    enum BC_TYPE occur);
 extern BooleanClause *bq_add_query_nr(Query *self, Query *sub_query,
@@ -571,6 +572,7 @@ typedef struct SpanPrefixQuery
 {
     SpanQuery super;
     char *prefix;
+    int max_terms;
 } SpanPrefixQuery;
 
 extern Query *spanprq_new(const char *field, const char *prefix);
@@ -868,6 +870,8 @@ typedef struct QParser
     bool handle_parse_errors : 1;
     bool allow_any_fields : 1;
     bool close_def_fields : 1;
+    bool destruct : 1;
+    bool recovering : 1;
 } QParser;
 
 extern QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
data/ext/store.c CHANGED
@@ -443,6 +443,30 @@ char *is_read_string(InStream *is)
     return str;
 }
 
+char *is_read_string_safe(InStream *is)
+{
+    register int length = (int) is_read_vint(is);
+    char *str = ALLOC_N(char, length + 1);
+    str[length] = '\0';
+
+    TRY
+        if (is->buf.pos > (is->buf.len - length)) {
+            register int i;
+            for (i = 0; i < length; i++) {
+                str[i] = is_read_byte(is);
+            }
+        }
+        else {                        /* unchecked optimization */
+            memcpy(str, is->buf.buf + is->buf.pos, length);
+            is->buf.pos += length;
+        }
+    XCATCHALL
+        free(str);
+    XENDTRY
+
+    return str;
+}
+
 void os_write_i32(OutStream *os, f_i32 num)
 {
     os_write_byte(os, (uchar)((num >> 24) & 0xFF));