ferret 0.10.11 → 0.10.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/multimapper.h ADDED
@@ -0,0 +1,51 @@
1
+ #ifndef FRT_MAPPER_H
2
+ #define FRT_MAPPER_H
3
+
4
+ #include "hash.h"
5
+
6
+ typedef struct State
7
+ {
8
+ int (*next)(struct State *self, int c, int *states);
9
+ void (*destroy_i)(struct State *self);
10
+ int (*is_match)(struct State *self, char **mapping);
11
+ } State;
12
+
13
+ typedef struct DeterministicState
14
+ {
15
+ struct DeterministicState *next[256];
16
+ int longest_match;
17
+ char *mapping;
18
+ int mapping_len;
19
+ } DeterministicState;
20
+
21
+ typedef struct Mapping
22
+ {
23
+ char *pattern;
24
+ char *replacement;
25
+ } Mapping;
26
+
27
+ typedef struct MultiMapper
28
+ {
29
+ Mapping **mappings;
30
+ int size;
31
+ int capa;
32
+ DeterministicState **dstates;
33
+ int d_size;
34
+ int d_capa;
35
+ unsigned char alphabet[256];
36
+ int a_size;
37
+ HashTable *dstates_map;
38
+ State **nstates;
39
+ int nsize;
40
+ int *next_states;
41
+ int ref_cnt;
42
+ } MultiMapper;
43
+
44
+ extern MultiMapper *mulmap_new();
45
+ extern void mulmap_add_mapping(MultiMapper *self, const char *p, const char *r);
46
+ extern void mulmap_compile(MultiMapper *self);
47
+ extern char *mulmap_map(MultiMapper *self, char *to, char *from, int capa);
48
+ extern int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa);
49
+ extern void mulmap_destroy(MultiMapper *self);
50
+
51
+ #endif
data/ext/r_analysis.c CHANGED
@@ -20,6 +20,7 @@ static VALUE cRegExpTokenizer;
20
20
  static VALUE cAsciiLowerCaseFilter;
21
21
  static VALUE cLowerCaseFilter;
22
22
  static VALUE cStopFilter;
23
+ static VALUE cMappingFilter;
23
24
  static VALUE cHyphenFilter;
24
25
  static VALUE cStemFilter;
25
26
 
@@ -48,13 +49,11 @@ static VALUE object_space;
48
49
  extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
49
50
  int, struct re_registers *);
50
51
 
51
- /*
52
- static int
52
+ int
53
53
  frt_rb_hash_size(VALUE hash)
54
54
  {
55
55
  return RHASH(hash)->tbl->num_entries;
56
56
  }
57
- */
58
57
 
59
58
  /****************************************************************************
60
59
  *
@@ -468,8 +467,8 @@ frt_ts_get_text(VALUE self)
468
467
  VALUE rtext = Qnil;
469
468
  TokenStream *ts;
470
469
  Data_Get_Struct(self, TokenStream, ts);
471
- if (ts->text) {
472
- if ((rtext = object_get(&ts->text)) == Qnil) {
470
+ if ((rtext = object_get(&ts->text)) == Qnil) {
471
+ if (ts->text) {
473
472
  rtext = rb_str_new2(ts->text);
474
473
  object_set(&ts->text, rtext);
475
474
  }
@@ -539,7 +538,7 @@ typedef struct CWrappedTokenStream {
539
538
  static void
540
539
  cwrts_destroy_i(TokenStream *ts)
541
540
  {
542
- rb_hash_delete(object_space, LONG2NUM(CWTS(ts)->rts));
541
+ rb_hash_delete(object_space, ((long)ts)|1);
543
542
  /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
544
543
  free(ts);
545
544
  }
@@ -563,7 +562,8 @@ static TokenStream *
563
562
  cwrts_clone_i(TokenStream *orig_ts)
564
563
  {
565
564
  TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
566
- CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
565
+ VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
566
+ rb_hash_aset(object_space, ((long)new_ts)|1, rts);
567
567
  return new_ts;
568
568
  }
569
569
 
@@ -583,7 +583,7 @@ frt_get_cwrapped_rts(VALUE rts)
583
583
  ts->clone_i = &cwrts_clone_i;
584
584
  ts->destroy_i = &cwrts_destroy_i;
585
585
  /* prevent from being garbage collected */
586
- rb_hash_aset(object_space, LONG2NUM(rts), rts);
586
+ rb_hash_aset(object_space, ((long)ts)|1, rts);
587
587
  ts->ref_cnt = 1;
588
588
  }
589
589
  return ts;
@@ -621,6 +621,8 @@ typedef struct RegExpTokenStream {
621
621
  static void
622
622
  rets_destroy_i(TokenStream *ts)
623
623
  {
624
+ rb_hash_delete(object_space, ((long)ts)|1);
625
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
624
626
  free(ts);
625
627
  }
626
628
 
@@ -656,6 +658,7 @@ frt_rets_set_text(VALUE self, VALUE rtext)
656
658
  TokenStream *ts;
657
659
  GET_TS(ts, self);
658
660
 
661
+ rb_hash_aset(object_space, ((long)ts)|1, rtext);
659
662
  StringValue(rtext);
660
663
  RETS(ts)->rtext = rtext;
661
664
  RETS(ts)->curr_ind = 0;
@@ -723,12 +726,12 @@ rets_clone_i(TokenStream *orig_ts)
723
726
  static TokenStream *
724
727
  rets_new(VALUE rtext, VALUE regex, VALUE proc)
725
728
  {
726
- TokenStream *ts;
729
+ TokenStream *ts = ts_new(RegExpTokenStream);
727
730
 
728
731
  if (rtext != Qnil) {
729
732
  rtext = StringValue(rtext);
733
+ rb_hash_aset(object_space, ((long)ts)|1, rtext);
730
734
  }
731
- ts = ts_new(RegExpTokenStream);
732
735
  ts->reset = &rets_reset;
733
736
  ts->next = &rets_next;
734
737
  ts->clone_i = &rets_clone_i;
@@ -769,9 +772,6 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
769
772
 
770
773
  Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
771
774
  object_add(ts, self);
772
- /* no need to add to object space as it is going to ruby space
773
- * rb_hash_aset(object_space, LONG2NUM((long)self), self);
774
- */
775
775
  return self;
776
776
  }
777
777
 
@@ -973,6 +973,96 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
973
973
  return self;
974
974
  }
975
975
 
976
+ static __inline void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
977
+ {
978
+ switch (TYPE(from)) {
979
+ case T_STRING:
980
+ mapping_filter_add(mf, RSTRING(from)->ptr, to);
981
+ break;
982
+ case T_SYMBOL:
983
+ mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
984
+ break;
985
+ default:
986
+ rb_raise(rb_eArgError,
987
+ "cannot map from %s with MappingFilter",
988
+ RSTRING(rb_obj_as_string(from))->ptr);
989
+ break;
990
+ }
991
+ }
992
+
993
+ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
994
+ {
995
+ if (key == Qundef) {
996
+ return ST_CONTINUE;
997
+ } else {
998
+ TokenStream *mf = (TokenStream *)arg;
999
+ char *to;
1000
+ switch (TYPE(value)) {
1001
+ case T_STRING:
1002
+ to = RSTRING(value)->ptr;
1003
+ break;
1004
+ case T_SYMBOL:
1005
+ to = rb_id2name(SYM2ID(value));
1006
+ break;
1007
+ default:
1008
+ rb_raise(rb_eArgError,
1009
+ "cannot map to %s with MappingFilter",
1010
+ RSTRING(rb_obj_as_string(key))->ptr);
1011
+ break;
1012
+ }
1013
+ if (TYPE(key) == T_ARRAY) {
1014
+ int i;
1015
+ for (i = RARRAY(key)->len - 1; i >= 0; i--) {
1016
+ frt_add_mapping_i(mf, RARRAY(key)->ptr[i], to);
1017
+ }
1018
+ }
1019
+ else {
1020
+ frt_add_mapping_i(mf, key, to);
1021
+ }
1022
+ }
1023
+ return ST_CONTINUE;
1024
+ }
1025
+
1026
+
1027
+ /*
1028
+ * call-seq:
1029
+ * MappingFilter.new(token_stream, mapping) -> token_stream
1030
+ *
1031
+ * Create a MappingFilter which maps strings in tokens. This is usually used
1032
+ * to map UTF-8 characters to ascii characters for easier searching and
1033
+ * better search recall. The mapping is compiled into a Deterministic Finite
1034
+ * Automaton so it is super fast. This Filter can therefore be used for
1035
+ * indexing very large datasets. Currently regular expressions are not
1036
+ * supported. If you are really interested in the feature, please contact me
1037
+ * at dbalmain@gmail.com.
1038
+ *
1039
+ * token_stream:: TokenStream to be filtered
1040
+ * mapping:: Hash of mappings to apply to tokens. The key can be a
1041
+ * String or an Array of Strings. The value must be a String
1042
+ *
1043
+ * == Example
1044
+ *
1045
+ * filt = MappingFilter.new(token_stream,
1046
+ * {
1047
+ * ['à','á','â','ã','ä','å'] => 'a',
1048
+ * ['è','é','ê','ë','ē','ę'] => 'e'
1049
+ * })
1050
+ */
1051
+ static VALUE
1052
+ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1053
+ {
1054
+ TokenStream *ts;
1055
+ ts = frt_get_cwrapped_rts(rsub_ts);
1056
+ ts = mapping_filter_new(ts);
1057
+ rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
1058
+ mulmap_compile(((MappingFilter *)ts)->mapper);
1059
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1060
+
1061
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1062
+ object_add(ts, self);
1063
+ return self;
1064
+ }
1065
+
976
1066
  /*
977
1067
  * call-seq:
978
1068
  * StemFilter.new(token_stream) -> token_stream
@@ -1031,7 +1121,7 @@ typedef struct CWrappedAnalyzer
1031
1121
  static void
1032
1122
  cwa_destroy_i(Analyzer *a)
1033
1123
  {
1034
- rb_hash_delete(object_space, LONG2NUM(CWA(a)->ranalyzer));
1124
+ rb_hash_delete(object_space, ((long)a)|1);
1035
1125
  /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1036
1126
  free(a);
1037
1127
  }
@@ -1059,7 +1149,7 @@ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1059
1149
  a->ref_cnt = 1;
1060
1150
  ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1061
1151
  /* prevent from being garbage collected */
1062
- rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
1152
+ rb_hash_aset(object_space, ((long)a)|1, ranalyzer);
1063
1153
  }
1064
1154
  return a;
1065
1155
  }
@@ -1100,6 +1190,8 @@ frt_get_analyzer(Analyzer *a)
1100
1190
  static VALUE
1101
1191
  frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1102
1192
  {
1193
+ /* NOTE: Any changes made to this method may also need to be applied to
1194
+ * frt_re_analyzer_token_stream */
1103
1195
  TokenStream *ts;
1104
1196
  Analyzer *a;
1105
1197
  GET_A(a, self);
@@ -1121,7 +1213,7 @@ lower = (argc ? RTEST(rlower) : dflt)
1121
1213
 
1122
1214
  /*
1123
1215
  * call-seq:
1124
- * AsciiWhiteSpaceAnalyzer.new(lower = true) -> analyzer
1216
+ * AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
1125
1217
  *
1126
1218
  * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
1127
1219
  * but can optionally leave case as is. Lowercasing will only be done to
@@ -1142,7 +1234,7 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1142
1234
 
1143
1235
  /*
1144
1236
  * call-seq:
1145
- * WhiteSpaceAnalyzer.new(lower = true) -> analyzer
1237
+ * WhiteSpaceAnalyzer.new(lower = false) -> analyzer
1146
1238
  *
1147
1239
  * Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
1148
1240
  * optionally leave case as is. Lowercasing will be done based on the current
@@ -1220,7 +1312,7 @@ get_rstopwords(const char **stop_words)
1220
1312
 
1221
1313
  /*
1222
1314
  * call-seq:
1223
- * AsciiStandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
1315
+ * AsciiStandardAnalyzer.new(lower = true, stop_words = FULL_ENGLISH_STOP_WORDS)
1224
1316
  * -> analyzer
1225
1317
  *
1226
1318
  * Create a new AsciiStandardAnalyzer which downcases tokens by default but
@@ -1253,7 +1345,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1253
1345
 
1254
1346
  /*
1255
1347
  * call-seq:
1256
- * StandardAnalyzer.new(stop_words=ENGLISH_STOP_WORDS, lower=true)
1348
+ * StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
1257
1349
  * -> analyzer
1258
1350
  *
1259
1351
  * Create a new StandardAnalyzer which downcases tokens by default but can
@@ -1377,7 +1469,6 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1377
1469
 
1378
1470
  ts = rets_new(Qnil, regex, proc);
1379
1471
  rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1380
- /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
1381
1472
  object_add(ts, rets);
1382
1473
 
1383
1474
  if (lower != Qfalse) {
@@ -1392,6 +1483,41 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1392
1483
  return self;
1393
1484
  }
1394
1485
 
1486
+ /*
1487
+ * call-seq:
1488
+ * analyzer.token_stream(field_name, input) -> token_stream
1489
+ *
1490
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1491
+ * also depend on the +field_name+. Although this parameter is typically
1492
+ * ignored.
1493
+ *
1494
+ * field_name:: name of the field to be tokenized
1495
+ * input:: data from the field to be tokenized
1496
+ */
1497
+ static VALUE
1498
+ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1499
+ {
1500
+ TokenStream *ts;
1501
+ Analyzer *a;
1502
+ GET_A(a, self);
1503
+
1504
+ StringValue(rtext);
1505
+
1506
+ ts = a_get_ts(a, frt_field(rfield), RSTRING(rtext)->ptr);
1507
+
1508
+ /* Make sure that there is no entry already */
1509
+ object_set(&ts->text, rtext);
1510
+ if (ts->next == &rets_next) {
1511
+ RETS(ts)->rtext = rtext;
1512
+ rb_hash_aset(object_space, ((long)ts)|1, rtext);
1513
+ }
1514
+ else {
1515
+ RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
1516
+ rb_hash_aset(object_space, ((long)((TokenFilter*)ts)->sub_ts)|1, rtext);
1517
+ }
1518
+ return get_rb_token_stream(ts);
1519
+ }
1520
+
1395
1521
  /****************************************************************************
1396
1522
  *
1397
1523
  * Locale stuff
@@ -1728,6 +1854,55 @@ static void Init_HyphenFilter(void)
1728
1854
  rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
1729
1855
  }
1730
1856
 
1857
+ /*
1858
+ * Document-class: Ferret::Analysis::MappingFilter
1859
+ *
1860
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
1861
+ * characters to ascii characters for easier searching and better search
1862
+ * recall. The mapping is compiled into a Deterministic Finite Automaton so it
1863
+ * is super fast. This Filter can therefore be used for indexing very large
1864
+ * datasets. Currently regular expressions are not supported. If you are
1865
+ * really interested in the feature, please contact me at dbalmain@gmail.com.
1866
+ *
1867
+ * == Example
1868
+ *
1869
+ * mapping = {
1870
+ * ['à','á','â','ã','ä','å','ā','ă'] => 'a',
1871
+ * 'æ' => 'ae',
1872
+ * ['ď','đ'] => 'd',
1873
+ * ['ç','ć','č','ĉ','ċ'] => 'c',
1874
+ * ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
1875
+ * ['ƒ'] => 'f',
1876
+ * ['ĝ','ğ','ġ','ģ'] => 'g',
1877
+ * ['ĥ','ħ'] => 'h',
1878
+ * ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
1879
+ * ['į','ı','ij','ĵ'] => 'j',
1880
+ * ['ķ','ĸ'] => 'k',
1881
+ * ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
1882
+ * ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
1883
+ * ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
1884
+ * ['œ'] => 'oek',
1885
+ * ['ą'] => 'q',
1886
+ * ['ŕ','ř','ŗ'] => 'r',
1887
+ * ['ś','š','ş','ŝ','ș'] => 's',
1888
+ * ['ť','ţ','ŧ','ț'] => 't',
1889
+ * ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
1890
+ * ['ŵ'] => 'w',
1891
+ * ['ý','ÿ','ŷ'] => 'y',
1892
+ * ['ž','ż','ź'] => 'z'
1893
+ * }
1894
+ * filt = MappingFilter.new(token_stream, mapping)
1895
+ */
1896
+ static void Init_MappingFilter(void)
1897
+ {
1898
+ cMappingFilter =
1899
+ rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
1900
+ frt_mark_cclass(cMappingFilter);
1901
+ rb_define_alloc_func(cMappingFilter, frt_data_alloc);
1902
+ rb_define_method(cMappingFilter, "initialize",
1903
+ frt_mapping_filter_init, 2);
1904
+ }
1905
+
1731
1906
  /*
1732
1907
  * Document-class: Ferret::Analysis::StopFilter
1733
1908
  *
@@ -1999,7 +2174,7 @@ static void Init_WhiteSpaceAnalyzer(void)
1999
2174
  * ascii-analyzers. If it were implemented in Ruby it would look like this;
2000
2175
  *
2001
2176
  * class AsciiStandardAnalyzer
2002
- * def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
2177
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2003
2178
  * @lower = lower
2004
2179
  * @stop_words = stop_words
2005
2180
  * end
@@ -2036,7 +2211,7 @@ static void Init_AsciiStandardAnalyzer(void)
2036
2211
  * it were implemented in Ruby it would look like this;
2037
2212
  *
2038
2213
  * class StandardAnalyzer
2039
- * def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
2214
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2040
2215
  * @lower = lower
2041
2216
  * @stop_words = stop_words
2042
2217
  * end
@@ -2131,6 +2306,8 @@ static void Init_RegExpAnalyzer(void)
2131
2306
  rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2132
2307
  rb_define_method(cRegExpAnalyzer, "initialize",
2133
2308
  frt_re_analyzer_init, -1);
2309
+ rb_define_method(cRegExpAnalyzer, "token_stream",
2310
+ frt_re_analyzer_token_stream, 2);
2134
2311
  }
2135
2312
 
2136
2313
  /* rdoc hack
@@ -2244,6 +2421,7 @@ Init_Analysis(void)
2244
2421
  Init_LowerCaseFilter();
2245
2422
  Init_HyphenFilter();
2246
2423
  Init_StopFilter();
2424
+ Init_MappingFilter();
2247
2425
  Init_StemFilter();
2248
2426
 
2249
2427
  Init_Analyzer();
data/ext/r_search.c CHANGED
@@ -124,7 +124,6 @@ extern VALUE cIndexReader;
124
124
  extern void frt_ir_free(void *p);
125
125
  extern void frt_ir_mark(void *p);
126
126
 
127
-
128
127
  extern void frt_set_term(VALUE rterm, Term *t);
129
128
  extern VALUE frt_get_analyzer(Analyzer *a);
130
129
  extern HashSet *frt_get_fields(VALUE rfields);
@@ -223,6 +222,113 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
223
222
  return rstr;
224
223
  }
225
224
 
225
+ /*
226
+ * Json Exportation - Loading each LazyDoc and formatting them into json
227
+ * This code is designed to get a VERY FAST json string, the goal was speed,
228
+ * not sexyness.
229
+ * Jeremie 'ahFeel' BORDIER
230
+ * ahFeel@rift.Fr
231
+ */
232
+ __inline char *
233
+ json_concat_string(char *s, char *field)
234
+ {
235
+ *(s++) = '"';
236
+ while (*field) {
237
+ if (*field == '\"') {
238
+ *(s++) = '\'';
239
+ *(s++) = *(field++);
240
+ *(s++) = '\'';
241
+ }
242
+ else {
243
+ *(s++) = *(field++);
244
+ }
245
+ }
246
+ *(s++) = '"';
247
+ return s;
248
+ }
249
+
250
+ inline char *
251
+ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
252
+ {
253
+ int i, j;
254
+ int diff = s - *str;
255
+ int len = diff, l;
256
+ LazyDocField *f;
257
+
258
+ for (i = 0; i < lzd->size; i++) {
259
+ f = lzd->fields[i];
260
+ /* 3 times the length of the field (f->len) to make space for quoted quotes
261
+ * ('"'), plus 4 x the number of values in the field (f->size) to make space
262
+ * for '"' around each value and ',' between values. Add 100 for '[', ']' and good safety.
263
+ */
264
+ len += strlen(f->name) + f->len * 3 + 100 + 4 * f->size;
265
+ }
266
+
267
+ if (len > *slen) {
268
+ while (len > *slen) *slen = *slen << 1;
269
+ REALLOC_N(*str, char, *slen);
270
+ s = *str + diff;
271
+ }
272
+
273
+ for (i = 0; i < lzd->size; i++) {
274
+ f = lzd->fields[i];
275
+ if (i) *(s++) = ',';
276
+ *(s++) = '"';
277
+ l = strlen(f->name);
278
+ memcpy(s, f->name, l);
279
+ s += l;
280
+ *(s++) = '"';
281
+ *(s++) = ':';
282
+ if (f->size > 1) *(s++) = '[';
283
+ for (j = 0; j < f->size; j++) {
284
+ if (j) *(s++) = ',';
285
+ s = json_concat_string(s, lazy_df_get_data(f, j));
286
+ }
287
+ if (f->size > 1) *(s++) = ']';
288
+ }
289
+ return s;
290
+ }
291
+
292
+ /*
293
+ * call-seq:
294
+ * top_doc.to_json() -> string
295
+ *
296
+ * Returns a json representation of the top_doc.
297
+ */
298
+ static VALUE
299
+ frt_td_to_json(VALUE self)
300
+ {
301
+ int i;
302
+ VALUE rhits = rb_funcall(self, id_hits, 0);
303
+ VALUE rhit;
304
+ LazyDoc *lzd;
305
+ Searcher *sea = (Searcher *)DATA_PTR(rb_funcall(self, id_searcher, 0));
306
+ const int num_hits = RARRAY(rhits)->len;
307
+ int doc_id;
308
+ int len = 32768;
309
+ char *str = ALLOC_N(char, len);
310
+ char *s = str;
311
+ VALUE rstr;
312
+
313
+ *(s++) = '[';
314
+ for (i = 0; i < num_hits; i++) {
315
+ if (i) *(s++) = ',';
316
+ *(s++) = '{';
317
+ rhit = RARRAY(rhits)->ptr[i];
318
+ doc_id = FIX2INT(rb_funcall(rhit, id_doc, 0));
319
+ lzd = sea->get_lazy_doc(sea, doc_id);
320
+ s = frt_lzd_load_to_json(lzd, &str, s, &len);
321
+ lazy_doc_close(lzd);
322
+ *(s++) = '}';
323
+ }
324
+ *(s++) = ']';
325
+ *(s++) = '\0';
326
+ rstr = rb_str_new2(str);
327
+ free(str);
328
+ return rstr;
329
+ }
330
+
331
+
226
332
  /****************************************************************************
227
333
  *
228
334
  * Explanation Methods
@@ -1901,6 +2007,7 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
1901
2007
  VALUE rval;
1902
2008
  int type = SORT_TYPE_AUTO;
1903
2009
  int is_reverse = false;
2010
+ char *field;
1904
2011
 
1905
2012
  if (rb_scan_args(argc, argv, "11", &rfield, &roptions) == 2) {
1906
2013
  if (Qnil != (rval = rb_hash_aref(roptions, sym_type))) {
@@ -1914,11 +2021,11 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
1914
2021
  }
1915
2022
  }
1916
2023
  if (NIL_P(rfield)) rb_raise(rb_eArgError, "must pass a valid field name");
1917
- rfield = rb_obj_as_string(rfield);
2024
+ field = frt_field(rfield);
1918
2025
 
1919
- sf = sort_field_new(RSTRING(rfield)->ptr, type, is_reverse);
1920
- if (sf->field == NULL && RSTRING(rfield)->ptr != NULL) {
1921
- sf->field = estrdup(RSTRING(rfield)->ptr);
2026
+ sf = sort_field_new(field, type, is_reverse);
2027
+ if (sf->field == NULL && field) {
2028
+ sf->field = estrdup(field);
1922
2029
  }
1923
2030
 
1924
2031
  Frt_Wrap_Struct(self, NULL, &frt_sf_free, sf);
@@ -2017,7 +2124,6 @@ frt_sort_free(void *p)
2017
2124
  {
2018
2125
  Sort *sort = (Sort *)p;
2019
2126
  object_del(sort);
2020
- object_del(sort->sort_fields);
2021
2127
  sort_destroy(sort);
2022
2128
  }
2023
2129
 
@@ -2025,7 +2131,10 @@ static void
2025
2131
  frt_sort_mark(void *p)
2026
2132
  {
2027
2133
  Sort *sort = (Sort *)p;
2028
- frt_gc_mark(sort->sort_fields);
2134
+ int i;
2135
+ for (i = 0; i < sort->size; i++) {
2136
+ frt_gc_mark(sort->sort_fields[i]);
2137
+ }
2029
2138
  }
2030
2139
 
2031
2140
  static VALUE
@@ -2147,11 +2256,6 @@ frt_sort_init(int argc, VALUE *argv, VALUE self)
2147
2256
  sort_add_sort_field(sort, (SortField *)&SORT_FIELD_SCORE);
2148
2257
  sort_add_sort_field(sort, (SortField *)&SORT_FIELD_DOC);
2149
2258
  }
2150
- rfields = rb_ary_new2(sort->size);
2151
- for (i = 0; i < sort->size; i++) {
2152
- rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
2153
- }
2154
- object_add(sort->sort_fields, rfields);
2155
2259
 
2156
2260
  return self;
2157
2261
  }
@@ -2166,7 +2270,12 @@ static VALUE
2166
2270
  frt_sort_get_fields(VALUE self)
2167
2271
  {
2168
2272
  GET_SORT();
2169
- return object_get(sort->sort_fields);
2273
+ VALUE rfields = rb_ary_new2(sort->size);
2274
+ int i;
2275
+ for (i = 0; i < sort->size; i++) {
2276
+ rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
2277
+ }
2278
+ return rfields;
2170
2279
  }
2171
2280
 
2172
2281
 
@@ -2374,9 +2483,9 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
2374
2483
  sea->arg = (void *)rval;
2375
2484
  }
2376
2485
  if (Qnil != (rval = rb_hash_aref(roptions, sym_sort))) {
2377
- if (TYPE(rval) != T_DATA) {
2486
+ if (TYPE(rval) != T_DATA || CLASS_OF(rval) == cSortField) {
2378
2487
  rval = frt_sort_init(1, &rval, frt_sort_alloc(cSort));
2379
- }
2488
+ }
2380
2489
  Data_Get_Struct(rval, Sort, sort);
2381
2490
  }
2382
2491
  }
@@ -2801,6 +2910,7 @@ Init_TopDocs(void)
2801
2910
  rb_set_class_path(cTopDocs, mSearch, td_class);
2802
2911
  rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
2803
2912
  rb_define_method(cTopDocs, "to_s", frt_td_to_s, -1);
2913
+ rb_define_method(cTopDocs, "to_json", frt_td_to_json, 0);
2804
2914
  id_hits = rb_intern("hits");
2805
2915
  id_total_hits = rb_intern("total_hits");
2806
2916
  id_max_score = rb_intern("max_score");