ferret 0.11.3 → 0.11.4

Sign up to get free protection for your applications and to get access to all the features.
data/ext/q_span.c CHANGED
@@ -2340,7 +2340,7 @@ static Query *spanprq_rewrite(Query *self, IndexReader *ir)
2340
2340
  {
2341
2341
  const char *field = SpQ(self)->field;
2342
2342
  const int field_num = fis_get_field_num(ir->fis, field);
2343
- Query *volatile q = spanmtq_new_conf(field, SPAN_PREFIX_QUERY_MAX_TERMS);
2343
+ Query *volatile q = spanmtq_new_conf(field, SpPfxQ(self)->max_terms);
2344
2344
  q->boost = self->boost; /* set the boost */
2345
2345
 
2346
2346
  if (field_num >= 0) {
@@ -2388,6 +2388,7 @@ Query *spanprq_new(const char *field, const char *prefix)
2388
2388
 
2389
2389
  SpQ(self)->field = estrdup(field);
2390
2390
  SpPfxQ(self)->prefix = estrdup(prefix);
2391
+ SpPfxQ(self)->max_terms = SPAN_PREFIX_QUERY_MAX_TERMS;
2391
2392
 
2392
2393
  self->type = SPAN_PREFIX_QUERY;
2393
2394
  self->rewrite = &spanprq_rewrite;
data/ext/r_analysis.c CHANGED
@@ -560,7 +560,6 @@ static TokenStream *
560
560
  cwrts_reset(TokenStream *ts, char *text)
561
561
  {
562
562
  ts->t = ts->text = text;
563
- Xj
564
563
  rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
565
564
  return ts;
566
565
  }
@@ -820,7 +819,9 @@ static VALUE
820
819
  frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
821
820
  {
822
821
  TS_ARGS(false);
822
+ #ifndef POSH_OS_WIN32
823
823
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
824
+ #endif
824
825
  return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
825
826
  }
826
827
 
@@ -849,7 +850,9 @@ static VALUE
849
850
  frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
850
851
  {
851
852
  TS_ARGS(false);
853
+ #ifndef POSH_OS_WIN32
852
854
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
855
+ #endif
853
856
  return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
854
857
  }
855
858
 
@@ -877,7 +880,9 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
877
880
  static VALUE
878
881
  frt_standard_tokenizer_init(VALUE self, VALUE rstr)
879
882
  {
883
+ #ifndef POSH_OS_WIN32
880
884
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
885
+ #endif
881
886
  return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
882
887
  }
883
888
 
@@ -917,7 +922,9 @@ static VALUE
917
922
  frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
918
923
  {
919
924
  TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
925
+ #ifndef POSH_OS_WIN32
920
926
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
927
+ #endif
921
928
  ts = mb_lowercase_filter_new(ts);
922
929
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
923
930
 
@@ -1257,7 +1264,9 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1257
1264
  {
1258
1265
  Analyzer *a;
1259
1266
  GET_LOWER(false);
1267
+ #ifndef POSH_OS_WIN32
1260
1268
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1269
+ #endif
1261
1270
  a = mb_whitespace_analyzer_new(lower);
1262
1271
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1263
1272
  object_add(a, self);
@@ -1300,7 +1309,9 @@ frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1300
1309
  {
1301
1310
  Analyzer *a;
1302
1311
  GET_LOWER(true);
1312
+ #ifndef POSH_OS_WIN32
1303
1313
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1314
+ #endif
1304
1315
  a = mb_letter_analyzer_new(lower);
1305
1316
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1306
1317
  object_add(a, self);
@@ -1372,7 +1383,9 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1372
1383
  bool lower;
1373
1384
  VALUE rlower, rstop_words;
1374
1385
  Analyzer *a;
1386
+ #ifndef POSH_OS_WIN32
1375
1387
  if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1388
+ #endif
1376
1389
  rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1377
1390
  lower = ((rlower == Qnil) ? true : RTEST(rlower));
1378
1391
  if (rstop_words != Qnil) {
data/ext/r_index.c CHANGED
@@ -196,6 +196,19 @@ frt_fi_init(int argc, VALUE *argv, VALUE self)
196
196
  return self;
197
197
  }
198
198
 
199
+ /*
200
+ * call-seq:
201
+ * fi.name -> symbol
202
+ *
203
+ * Return the name of the field
204
+ */
205
+ static VALUE
206
+ frt_fi_name(VALUE self)
207
+ {
208
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
209
+ return ID2SYM(rb_intern(fi->name));
210
+ }
211
+
199
212
  /*
200
213
  * call-seq:
201
214
  * fi.stored? -> bool
@@ -800,6 +813,86 @@ frt_te_set_field(VALUE self, VALUE rfield)
800
813
  return self;
801
814
  }
802
815
 
816
+ /*
817
+ * call-seq:
818
+ * term_enum.to_json() -> string
819
+ *
820
+ * Returns a JSON representation of the term enum. You can speed this up by
821
+ * having the method return arrays instead of objects, simply by passing an
822
+ * argument to the to_json method. For example:
823
+ *
824
+ * term_enum.to_json() #=>
825
+ * # [
826
+ * # {"term":"apple","frequency":12},
827
+ * # {"term":"banana","frequency":2},
828
+ * # {"term":"cantaloupe","frequency":12}
829
+ * # ]
830
+ *
831
+ * term_enum.to_json(:fast) #=>
832
+ * # [
833
+ * # ["apple",12],
834
+ * # ["banana",2],
835
+ * # ["cantaloupe",12]
836
+ * # ]
837
+ */
838
+ static VALUE
839
+ frt_te_to_json(int argc, VALUE *argv, VALUE self)
840
+ {
841
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
842
+ VALUE rjson;
843
+ char *json, *jp;
844
+ char *term;
845
+ int capa = 65536;
846
+ jp = json = ALLOC_N(char, capa);
847
+ *(jp++) = '[';
848
+
849
+ if (argc > 0) {
850
+ while (NULL != (term = te->next(te))) {
851
+ /* enough room for the term after converting " to '"' and frequency
852
+ * plus some extra for good measure */
853
+ *(jp++) = '[';
854
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
855
+ capa <<= 1;
856
+ REALLOC_N(json, char, capa);
857
+ }
858
+ jp = json_concat_string(jp, term);
859
+ *(jp++) = ',';
860
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
861
+ jp += strlen(jp);
862
+ *(jp++) = ']';
863
+ *(jp++) = ',';
864
+ }
865
+ }
866
+ else {
867
+ while (NULL != (term = te->next(te))) {
868
+ /* enough room for the term after converting " to '"' and frequency
869
+ * plus some extra for good measure */
870
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
871
+ capa <<= 1;
872
+ REALLOC_N(json, char, capa);
873
+ }
874
+ *(jp++) = '{';
875
+ memcpy(jp, "\"term\":", 7);
876
+ jp += 7;
877
+ jp = json_concat_string(jp, term);
878
+ *(jp++) = ',';
879
+ memcpy(jp, "\"frequency\":", 12);
880
+ jp += 12;
881
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
882
+ jp += strlen(jp);
883
+ *(jp++) = '}';
884
+ *(jp++) = ',';
885
+ }
886
+ }
887
+ if (*(jp-1) == ',') jp--;
888
+ *(jp++) = ']';
889
+ *jp = '\0';
890
+
891
+ rjson = rb_str_new2(json);
892
+ free(json);
893
+ return rjson;
894
+ }
895
+
803
896
  /****************************************************************************
804
897
  *
805
898
  * TermDocEnum Methods
@@ -960,6 +1053,89 @@ frt_tde_each(VALUE self)
960
1053
  return INT2FIX(doc_cnt);
961
1054
  }
962
1055
 
1056
+ /*
1057
+ * call-seq:
1058
+ * term_doc_enum.to_json() -> string
1059
+ *
1060
+ * Returns a JSON representation of the term doc enum. It will also add the
1061
+ * term positions if they are available. You can speed this up by having the
1062
+ * method return arrays instead of objects, simply by passing an argument to
1063
+ * the to_json method. For example:
1064
+ *
1065
+ * term_doc_enum.to_json() #=>
1066
+ * # [
1067
+ * # {"document":1,"frequency":12},
1068
+ * # {"document":11,"frequency":1},
1069
+ * # {"document":29,"frequency":120},
1070
+ * # {"document":30,"frequency":3}
1071
+ * # ]
1072
+ *
1073
+ * term_doc_enum.to_json(:fast) #=>
1074
+ * # [
1075
+ * # [1,12],
1076
+ * # [11,1],
1077
+ * # [29,120],
1078
+ * # [30,3]
1079
+ * # ]
1080
+ */
1081
+ static VALUE
1082
+ frt_tde_to_json(int argc, VALUE *argv, VALUE self)
1083
+ {
1084
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1085
+ VALUE rjson;
1086
+ char *json, *jp;
1087
+ int capa = 65536;
1088
+ char *format;
1089
+ char close = (argc > 0) ? ']' : '}';
1090
+ bool do_positions = tde->next_position != NULL;
1091
+ jp = json = ALLOC_N(char, capa);
1092
+ *(jp++) = '[';
1093
+
1094
+ if (do_positions) {
1095
+ if (argc == 0) {
1096
+ format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
1097
+ }
1098
+ else {
1099
+ format = "[%d,%d,[";
1100
+ }
1101
+ }
1102
+ else {
1103
+ if (argc == 0) {
1104
+ format = "{\"document\":%d,\"frequency\":%d},";
1105
+ }
1106
+ else {
1107
+ format = "[%d,%d],";
1108
+ }
1109
+ }
1110
+ while (tde->next(tde)) {
1111
+ /* 100 chars should be enough room for an extra entry */
1112
+ if ((jp - json) + 100 + tde->freq(tde) * 20 > capa) {
1113
+ capa <<= 1;
1114
+ REALLOC_N(json, char, capa);
1115
+ }
1116
+ sprintf(jp, format, tde->doc_num(tde), tde->freq(tde));
1117
+ jp += strlen(jp);
1118
+ if (do_positions) {
1119
+ int pos;
1120
+ while (0 <= (pos = tde->next_position(tde))) {
1121
+ sprintf(jp, "%d,", pos);
1122
+ jp += strlen(jp);
1123
+ }
1124
+ if (*(jp - 1) == ',') jp--;
1125
+ *(jp++) = ']';
1126
+ *(jp++) = close;
1127
+ *(jp++) = ',';
1128
+ }
1129
+ }
1130
+ if (*(jp - 1) == ',') jp--;
1131
+ *(jp++) = ']';
1132
+ *jp = '\0';
1133
+
1134
+ rjson = rb_str_new2(json);
1135
+ free(json);
1136
+ return rjson;
1137
+ }
1138
+
963
1139
  /*
964
1140
  * call-seq:
965
1141
  * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
@@ -2678,6 +2854,7 @@ Init_FieldInfo(void)
2678
2854
  rb_define_alloc_func(cFieldInfo, frt_data_alloc);
2679
2855
 
2680
2856
  rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
2857
+ rb_define_method(cFieldInfo, "name", frt_fi_name, 0);
2681
2858
  rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
2682
2859
  rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
2683
2860
  rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
@@ -2793,6 +2970,7 @@ Init_TermEnum(void)
2793
2970
  rb_define_method(cTermEnum, "each", frt_te_each, 0);
2794
2971
  rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
2795
2972
  rb_define_method(cTermEnum, "set_field",frt_te_set_field, 1);
2973
+ rb_define_method(cTermEnum, "to_json", frt_te_to_json, -1);
2796
2974
  }
2797
2975
 
2798
2976
  /*
@@ -2844,6 +3022,7 @@ Init_TermDocEnum(void)
2844
3022
  rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
2845
3023
  rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
2846
3024
  rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
3025
+ rb_define_method(cTermDocEnum, "to_json", frt_tde_to_json, -1);
2847
3026
  }
2848
3027
 
2849
3028
  /* rdochack
data/ext/r_search.c CHANGED
@@ -224,32 +224,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
224
224
  return rstr;
225
225
  }
226
226
 
227
- /*
228
- * Json Exportation - Loading each LazyDoc and formatting them into json
229
- * This code is designed to get a VERY FAST json string, the goal was speed,
230
- * not sexyness.
231
- * Jeremie 'ahFeel' BORDIER
232
- * ahFeel@rift.Fr
233
- */
234
227
  __inline char *
235
- json_concat_string(char *s, char *field)
236
- {
237
- *(s++) = '"';
238
- while (*field) {
239
- if (*field == '\"') {
240
- *(s++) = '\'';
241
- *(s++) = *(field++);
242
- *(s++) = '\'';
243
- }
244
- else {
245
- *(s++) = *(field++);
246
- }
247
- }
248
- *(s++) = '"';
249
- return s;
250
- }
251
-
252
- inline char *
253
228
  frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
254
229
  {
255
230
  int i, j;
@@ -260,7 +235,7 @@ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
260
235
  for (i = 0; i < lzd->size; i++) {
261
236
  f = lzd->fields[i];
262
237
  /* 3 times length of field to make space for quoted quotes ('"') and
263
- * 4 x field length to make space for '"' around fields and ','
238
+ * 4 times field elements to make space for '"' around fields and ','
264
239
  * between fields. Add 100 for '[', ']' and good safety.
265
240
  */
266
241
  len += strlen(f->name) + f->len * 3 + 100 + 4 * f->size;
@@ -1632,15 +1607,22 @@ frt_spanmtq_init(VALUE self, VALUE rfield, VALUE rterms)
1632
1607
 
1633
1608
  /*
1634
1609
  * call-seq:
1635
- * SpanPrefixQuery.new(field, prefix) -> query
1610
+ * SpanPrefixQuery.new(field, prefix, max_terms = 256) -> query
1636
1611
  *
1637
1612
  * Create a new SpanPrefixQuery which matches all documents with the prefix
1638
1613
  * +prefix+ in the field +field+.
1639
1614
  */
1640
1615
  static VALUE
1641
- frt_spanprq_init(VALUE self, VALUE rfield, VALUE rprefix)
1616
+ frt_spanprq_init(int argc, VALUE *argv, VALUE self)
1642
1617
  {
1643
- Query *q = spanprq_new(frt_field(rfield), StringValuePtr(rprefix));
1618
+ VALUE rfield, rprefix, rmax_terms;
1619
+ int max_terms = SPAN_PREFIX_QUERY_MAX_TERMS;
1620
+ Query *q;
1621
+ if (rb_scan_args(argc, argv, "21", &rfield, &rprefix, &rmax_terms) == 3) {
1622
+ max_terms = FIX2INT(rmax_terms);
1623
+ }
1624
+ q = spanprq_new(frt_field(rfield), StringValuePtr(rprefix));
1625
+ ((SpanPrefixQuery *)q)->max_terms = max_terms;
1644
1626
  Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
1645
1627
  object_add(q, self);
1646
1628
  return self;
@@ -3556,7 +3538,7 @@ Init_SpanPrefixQuery(void)
3556
3538
  cSpanPrefixQuery = rb_define_class_under(mSpans, "SpanPrefixQuery", cQuery);
3557
3539
  rb_define_alloc_func(cSpanPrefixQuery, frt_data_alloc);
3558
3540
 
3559
- rb_define_method(cSpanPrefixQuery, "initialize", frt_spanprq_init, 2);
3541
+ rb_define_method(cSpanPrefixQuery, "initialize", frt_spanprq_init, -1);
3560
3542
  }
3561
3543
 
3562
3544
  /*
data/ext/search.c CHANGED
@@ -1041,6 +1041,7 @@ static TopDocs *isea_search_w(Searcher *self,
1041
1041
 
1042
1042
  scorer = weight->scorer(weight, ISEA(self)->ir);
1043
1043
  if (!scorer || 0 == ISEA(self)->ir->num_docs(ISEA(self)->ir)) {
1044
+ if (scorer) scorer->destroy(scorer);
1044
1045
  return td_new(0, 0, NULL, 0.0);
1045
1046
  }
1046
1047
 
data/ext/search.h CHANGED
@@ -285,6 +285,7 @@ typedef struct BooleanQuery
285
285
  } BooleanQuery;
286
286
 
287
287
  extern Query *bq_new(bool coord_disabled);
288
+ extern Query *bq_new_max(bool coord_disabled, int max);
288
289
  extern BooleanClause *bq_add_query(Query *self, Query *sub_query,
289
290
  enum BC_TYPE occur);
290
291
  extern BooleanClause *bq_add_query_nr(Query *self, Query *sub_query,
@@ -571,6 +572,7 @@ typedef struct SpanPrefixQuery
571
572
  {
572
573
  SpanQuery super;
573
574
  char *prefix;
575
+ int max_terms;
574
576
  } SpanPrefixQuery;
575
577
 
576
578
  extern Query *spanprq_new(const char *field, const char *prefix);
@@ -868,6 +870,8 @@ typedef struct QParser
868
870
  bool handle_parse_errors : 1;
869
871
  bool allow_any_fields : 1;
870
872
  bool close_def_fields : 1;
873
+ bool destruct : 1;
874
+ bool recovering : 1;
871
875
  } QParser;
872
876
 
873
877
  extern QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
data/ext/store.c CHANGED
@@ -443,6 +443,30 @@ char *is_read_string(InStream *is)
443
443
  return str;
444
444
  }
445
445
 
446
+ char *is_read_string_safe(InStream *is)
447
+ {
448
+ register int length = (int) is_read_vint(is);
449
+ char *str = ALLOC_N(char, length + 1);
450
+ str[length] = '\0';
451
+
452
+ TRY
453
+ if (is->buf.pos > (is->buf.len - length)) {
454
+ register int i;
455
+ for (i = 0; i < length; i++) {
456
+ str[i] = is_read_byte(is);
457
+ }
458
+ }
459
+ else { /* unchecked optimization */
460
+ memcpy(str, is->buf.buf + is->buf.pos, length);
461
+ is->buf.pos += length;
462
+ }
463
+ XCATCHALL
464
+ free(str);
465
+ XENDTRY
466
+
467
+ return str;
468
+ }
469
+
446
470
  void os_write_i32(OutStream *os, f_i32 num)
447
471
  {
448
472
  os_write_byte(os, (uchar)((num >> 24) & 0xFF));