ferret 0.10.11 → 0.10.12

Sign up to get free protection for your applications and to get access to all the features.
data/ext/multimapper.h ADDED
@@ -0,0 +1,51 @@
1
+ #ifndef FRT_MAPPER_H
2
+ #define FRT_MAPPER_H
3
+
4
+ #include "hash.h"
5
+
6
+ typedef struct State
7
+ {
8
+ int (*next)(struct State *self, int c, int *states);
9
+ void (*destroy_i)(struct State *self);
10
+ int (*is_match)(struct State *self, char **mapping);
11
+ } State;
12
+
13
+ typedef struct DeterministicState
14
+ {
15
+ struct DeterministicState *next[256];
16
+ int longest_match;
17
+ char *mapping;
18
+ int mapping_len;
19
+ } DeterministicState;
20
+
21
+ typedef struct Mapping
22
+ {
23
+ char *pattern;
24
+ char *replacement;
25
+ } Mapping;
26
+
27
+ typedef struct MultiMapper
28
+ {
29
+ Mapping **mappings;
30
+ int size;
31
+ int capa;
32
+ DeterministicState **dstates;
33
+ int d_size;
34
+ int d_capa;
35
+ unsigned char alphabet[256];
36
+ int a_size;
37
+ HashTable *dstates_map;
38
+ State **nstates;
39
+ int nsize;
40
+ int *next_states;
41
+ int ref_cnt;
42
+ } MultiMapper;
43
+
44
+ extern MultiMapper *mulmap_new();
45
+ extern void mulmap_add_mapping(MultiMapper *self, const char *p, const char *r);
46
+ extern void mulmap_compile(MultiMapper *self);
47
+ extern char *mulmap_map(MultiMapper *self, char *to, char *from, int capa);
48
+ extern int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa);
49
+ extern void mulmap_destroy(MultiMapper *self);
50
+
51
+ #endif
data/ext/r_analysis.c CHANGED
@@ -20,6 +20,7 @@ static VALUE cRegExpTokenizer;
20
20
  static VALUE cAsciiLowerCaseFilter;
21
21
  static VALUE cLowerCaseFilter;
22
22
  static VALUE cStopFilter;
23
+ static VALUE cMappingFilter;
23
24
  static VALUE cHyphenFilter;
24
25
  static VALUE cStemFilter;
25
26
 
@@ -48,13 +49,11 @@ static VALUE object_space;
48
49
  extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
49
50
  int, struct re_registers *);
50
51
 
51
- /*
52
- static int
52
+ int
53
53
  frt_rb_hash_size(VALUE hash)
54
54
  {
55
55
  return RHASH(hash)->tbl->num_entries;
56
56
  }
57
- */
58
57
 
59
58
  /****************************************************************************
60
59
  *
@@ -468,8 +467,8 @@ frt_ts_get_text(VALUE self)
468
467
  VALUE rtext = Qnil;
469
468
  TokenStream *ts;
470
469
  Data_Get_Struct(self, TokenStream, ts);
471
- if (ts->text) {
472
- if ((rtext = object_get(&ts->text)) == Qnil) {
470
+ if ((rtext = object_get(&ts->text)) == Qnil) {
471
+ if (ts->text) {
473
472
  rtext = rb_str_new2(ts->text);
474
473
  object_set(&ts->text, rtext);
475
474
  }
@@ -539,7 +538,7 @@ typedef struct CWrappedTokenStream {
539
538
  static void
540
539
  cwrts_destroy_i(TokenStream *ts)
541
540
  {
542
- rb_hash_delete(object_space, LONG2NUM(CWTS(ts)->rts));
541
+ rb_hash_delete(object_space, ((long)ts)|1);
543
542
  /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
544
543
  free(ts);
545
544
  }
@@ -563,7 +562,8 @@ static TokenStream *
563
562
  cwrts_clone_i(TokenStream *orig_ts)
564
563
  {
565
564
  TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
566
- CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
565
+ VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
566
+ rb_hash_aset(object_space, ((long)new_ts)|1, rts);
567
567
  return new_ts;
568
568
  }
569
569
 
@@ -583,7 +583,7 @@ frt_get_cwrapped_rts(VALUE rts)
583
583
  ts->clone_i = &cwrts_clone_i;
584
584
  ts->destroy_i = &cwrts_destroy_i;
585
585
  /* prevent from being garbage collected */
586
- rb_hash_aset(object_space, LONG2NUM(rts), rts);
586
+ rb_hash_aset(object_space, ((long)ts)|1, rts);
587
587
  ts->ref_cnt = 1;
588
588
  }
589
589
  return ts;
@@ -621,6 +621,8 @@ typedef struct RegExpTokenStream {
621
621
  static void
622
622
  rets_destroy_i(TokenStream *ts)
623
623
  {
624
+ rb_hash_delete(object_space, ((long)ts)|1);
625
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
624
626
  free(ts);
625
627
  }
626
628
 
@@ -656,6 +658,7 @@ frt_rets_set_text(VALUE self, VALUE rtext)
656
658
  TokenStream *ts;
657
659
  GET_TS(ts, self);
658
660
 
661
+ rb_hash_aset(object_space, ((long)ts)|1, rtext);
659
662
  StringValue(rtext);
660
663
  RETS(ts)->rtext = rtext;
661
664
  RETS(ts)->curr_ind = 0;
@@ -723,12 +726,12 @@ rets_clone_i(TokenStream *orig_ts)
723
726
  static TokenStream *
724
727
  rets_new(VALUE rtext, VALUE regex, VALUE proc)
725
728
  {
726
- TokenStream *ts;
729
+ TokenStream *ts = ts_new(RegExpTokenStream);
727
730
 
728
731
  if (rtext != Qnil) {
729
732
  rtext = StringValue(rtext);
733
+ rb_hash_aset(object_space, ((long)ts)|1, rtext);
730
734
  }
731
- ts = ts_new(RegExpTokenStream);
732
735
  ts->reset = &rets_reset;
733
736
  ts->next = &rets_next;
734
737
  ts->clone_i = &rets_clone_i;
@@ -769,9 +772,6 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
769
772
 
770
773
  Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
771
774
  object_add(ts, self);
772
- /* no need to add to object space as it is going to ruby space
773
- * rb_hash_aset(object_space, LONG2NUM((long)self), self);
774
- */
775
775
  return self;
776
776
  }
777
777
 
@@ -973,6 +973,96 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
973
973
  return self;
974
974
  }
975
975
 
976
+ static __inline void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
977
+ {
978
+ switch (TYPE(from)) {
979
+ case T_STRING:
980
+ mapping_filter_add(mf, RSTRING(from)->ptr, to);
981
+ break;
982
+ case T_SYMBOL:
983
+ mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
984
+ break;
985
+ default:
986
+ rb_raise(rb_eArgError,
987
+ "cannot map from %s with MappingFilter",
988
+ RSTRING(rb_obj_as_string(from))->ptr);
989
+ break;
990
+ }
991
+ }
992
+
993
+ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
994
+ {
995
+ if (key == Qundef) {
996
+ return ST_CONTINUE;
997
+ } else {
998
+ TokenStream *mf = (TokenStream *)arg;
999
+ char *to;
1000
+ switch (TYPE(value)) {
1001
+ case T_STRING:
1002
+ to = RSTRING(value)->ptr;
1003
+ break;
1004
+ case T_SYMBOL:
1005
+ to = rb_id2name(SYM2ID(value));
1006
+ break;
1007
+ default:
1008
+ rb_raise(rb_eArgError,
1009
+ "cannot map to %s with MappingFilter",
1010
+ RSTRING(rb_obj_as_string(key))->ptr);
1011
+ break;
1012
+ }
1013
+ if (TYPE(key) == T_ARRAY) {
1014
+ int i;
1015
+ for (i = RARRAY(key)->len - 1; i >= 0; i--) {
1016
+ frt_add_mapping_i(mf, RARRAY(key)->ptr[i], to);
1017
+ }
1018
+ }
1019
+ else {
1020
+ frt_add_mapping_i(mf, key, to);
1021
+ }
1022
+ }
1023
+ return ST_CONTINUE;
1024
+ }
1025
+
1026
+
1027
+ /*
1028
+ * call-seq:
1029
+ * MappingFilter.new(token_stream, mapping) -> token_stream
1030
+ *
1031
+ * Create a MappingFilter which maps strings in tokens. This is usually used
1032
+ * to map UTF-8 characters to ascii characters for easier searching and
1033
+ * better search recall. The mapping is compiled into a Deterministic Finite
1034
+ * Automata so it is super fast. This Filter can therefor be used for
1035
+ * indexing very large datasets. Currently regular expressions are not
1036
+ * supported. If you are really interested in the feature, please contact me
1037
+ * at dbalmain@gmail.com.
1038
+ *
1039
+ * token_stream:: TokenStream to be filtered
1040
+ * mapping:: Hash of mappings to apply to tokens. The key can be a
1041
+ * String or an Array of Strings. The value must be a String
1042
+ *
1043
+ * == Example
1044
+ *
1045
+ * filt = MappingFilter.new(token_stream,
1046
+ * {
1047
+ * ['à','á','â','ã','ä','å'] => 'a',
1048
+ * ['è','é','ê','ë','ē','ę'] => 'e'
1049
+ * })
1050
+ */
1051
+ static VALUE
1052
+ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1053
+ {
1054
+ TokenStream *ts;
1055
+ ts = frt_get_cwrapped_rts(rsub_ts);
1056
+ ts = mapping_filter_new(ts);
1057
+ rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
1058
+ mulmap_compile(((MappingFilter *)ts)->mapper);
1059
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1060
+
1061
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1062
+ object_add(ts, self);
1063
+ return self;
1064
+ }
1065
+
976
1066
  /*
977
1067
  * call-seq:
978
1068
  * StemFilter.new(token_stream) -> token_stream
@@ -1031,7 +1121,7 @@ typedef struct CWrappedAnalyzer
1031
1121
  static void
1032
1122
  cwa_destroy_i(Analyzer *a)
1033
1123
  {
1034
- rb_hash_delete(object_space, LONG2NUM(CWA(a)->ranalyzer));
1124
+ rb_hash_delete(object_space, ((long)a)|1);
1035
1125
  /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1036
1126
  free(a);
1037
1127
  }
@@ -1059,7 +1149,7 @@ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1059
1149
  a->ref_cnt = 1;
1060
1150
  ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1061
1151
  /* prevent from being garbage collected */
1062
- rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
1152
+ rb_hash_aset(object_space, ((long)a)|1, ranalyzer);
1063
1153
  }
1064
1154
  return a;
1065
1155
  }
@@ -1100,6 +1190,8 @@ frt_get_analyzer(Analyzer *a)
1100
1190
  static VALUE
1101
1191
  frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1102
1192
  {
1193
+ /* NOTE: Any changes made to this method may also need to be applied to
1194
+ * frt_re_analyzer_token_stream */
1103
1195
  TokenStream *ts;
1104
1196
  Analyzer *a;
1105
1197
  GET_A(a, self);
@@ -1121,7 +1213,7 @@ lower = (argc ? RTEST(rlower) : dflt)
1121
1213
 
1122
1214
  /*
1123
1215
  * call-seq:
1124
- * AsciiWhiteSpaceAnalyzer.new(lower = true) -> analyzer
1216
+ * AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
1125
1217
  *
1126
1218
  * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
1127
1219
  * but can optionally leave case as is. Lowercasing will only be done to
@@ -1142,7 +1234,7 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1142
1234
 
1143
1235
  /*
1144
1236
  * call-seq:
1145
- * WhiteSpaceAnalyzer.new(lower = true) -> analyzer
1237
+ * WhiteSpaceAnalyzer.new(lower = false) -> analyzer
1146
1238
  *
1147
1239
  * Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
1148
1240
  * optionally leave case as is. Lowercasing will be done based on the current
@@ -1220,7 +1312,7 @@ get_rstopwords(const char **stop_words)
1220
1312
 
1221
1313
  /*
1222
1314
  * call-seq:
1223
- * AsciiStandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
1315
+ * AsciiStandardAnalyzer.new(lower = true, stop_words = FULL_ENGLISH_STOP_WORDS)
1224
1316
  * -> analyzer
1225
1317
  *
1226
1318
  * Create a new AsciiStandardAnalyzer which downcases tokens by default but
@@ -1253,7 +1345,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1253
1345
 
1254
1346
  /*
1255
1347
  * call-seq:
1256
- * StandardAnalyzer.new(stop_words=ENGLISH_STOP_WORDS, lower=true)
1348
+ * StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
1257
1349
  * -> analyzer
1258
1350
  *
1259
1351
  * Create a new StandardAnalyzer which downcases tokens by default but can
@@ -1377,7 +1469,6 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1377
1469
 
1378
1470
  ts = rets_new(Qnil, regex, proc);
1379
1471
  rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1380
- /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
1381
1472
  object_add(ts, rets);
1382
1473
 
1383
1474
  if (lower != Qfalse) {
@@ -1392,6 +1483,41 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1392
1483
  return self;
1393
1484
  }
1394
1485
 
1486
+ /*
1487
+ * call-seq:
1488
+ * analyzer.token_stream(field_name, input) -> token_stream
1489
+ *
1490
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1491
+ * also depend on the +field_name+. Although this parameter is typically
1492
+ * ignored.
1493
+ *
1494
+ * field_name:: name of the field to be tokenized
1495
+ * input:: data from the field to be tokenized
1496
+ */
1497
+ static VALUE
1498
+ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1499
+ {
1500
+ TokenStream *ts;
1501
+ Analyzer *a;
1502
+ GET_A(a, self);
1503
+
1504
+ StringValue(rtext);
1505
+
1506
+ ts = a_get_ts(a, frt_field(rfield), RSTRING(rtext)->ptr);
1507
+
1508
+ /* Make sure that there is no entry already */
1509
+ object_set(&ts->text, rtext);
1510
+ if (ts->next == &rets_next) {
1511
+ RETS(ts)->rtext = rtext;
1512
+ rb_hash_aset(object_space, ((long)ts)|1, rtext);
1513
+ }
1514
+ else {
1515
+ RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
1516
+ rb_hash_aset(object_space, ((long)((TokenFilter*)ts)->sub_ts)|1, rtext);
1517
+ }
1518
+ return get_rb_token_stream(ts);
1519
+ }
1520
+
1395
1521
  /****************************************************************************
1396
1522
  *
1397
1523
  * Locale stuff
@@ -1728,6 +1854,55 @@ static void Init_HyphenFilter(void)
1728
1854
  rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
1729
1855
  }
1730
1856
 
1857
+ /*
1858
+ * Document-class: Ferret::Analysis::MappingFilter
1859
+ *
1860
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
1861
+ * characters to ascii characters for easier searching and better search
1862
+ * recall. The mapping is compiled into a Deterministic Finite Automata so it
1863
+ * is super fast. This Filter can therefore be used for indexing very large
1864
+ * datasets. Currently regular expressions are not supported. If you are
1865
+ * really interested in the feature, please contact me at dbalmain@gmail.com.
1866
+ *
1867
+ * == Example
1868
+ *
1869
+ * mapping = {
1870
+ * ['à','á','â','ã','ä','å','ā','ă'] => 'a',
1871
+ * 'æ' => 'ae',
1872
+ * ['ď','đ'] => 'd',
1873
+ * ['ç','ć','č','ĉ','ċ'] => 'c',
1874
+ * ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
1875
+ * ['ƒ'] => 'f',
1876
+ * ['ĝ','ğ','ġ','ģ'] => 'g',
1877
+ * ['ĥ','ħ'] => 'h',
1878
+ * ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
1879
+ * ['į','ı','ij','ĵ'] => 'j',
1880
+ * ['ķ','ĸ'] => 'k',
1881
+ * ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
1882
+ * ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
1883
+ * ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
1884
+ * ['œ'] => 'oek',
1885
+ * ['ą'] => 'q',
1886
+ * ['ŕ','ř','ŗ'] => 'r',
1887
+ * ['ś','š','ş','ŝ','ș'] => 's',
1888
+ * ['ť','ţ','ŧ','ț'] => 't',
1889
+ * ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
1890
+ * ['ŵ'] => 'w',
1891
+ * ['ý','ÿ','ŷ'] => 'y',
1892
+ * ['ž','ż','ź'] => 'z'
1893
+ * }
1894
+ * filt = MappingFilter.new(token_stream, mapping)
1895
+ */
1896
+ static void Init_MappingFilter(void)
1897
+ {
1898
+ cMappingFilter =
1899
+ rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
1900
+ frt_mark_cclass(cMappingFilter);
1901
+ rb_define_alloc_func(cMappingFilter, frt_data_alloc);
1902
+ rb_define_method(cMappingFilter, "initialize",
1903
+ frt_mapping_filter_init, 2);
1904
+ }
1905
+
1731
1906
  /*
1732
1907
  * Document-class: Ferret::Analysis::StopFilter
1733
1908
  *
@@ -1999,7 +2174,7 @@ static void Init_WhiteSpaceAnalyzer(void)
1999
2174
  * ascii-analyzers. If it were implemented in Ruby it would look like this;
2000
2175
  *
2001
2176
  * class AsciiStandardAnalyzer
2002
- * def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
2177
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2003
2178
  * @lower = lower
2004
2179
  * @stop_words = stop_words
2005
2180
  * end
@@ -2036,7 +2211,7 @@ static void Init_AsciiStandardAnalyzer(void)
2036
2211
  * it were implemented in Ruby it would look like this;
2037
2212
  *
2038
2213
  * class StandardAnalyzer
2039
- * def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
2214
+ * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
2040
2215
  * @lower = lower
2041
2216
  * @stop_words = stop_words
2042
2217
  * end
@@ -2131,6 +2306,8 @@ static void Init_RegExpAnalyzer(void)
2131
2306
  rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2132
2307
  rb_define_method(cRegExpAnalyzer, "initialize",
2133
2308
  frt_re_analyzer_init, -1);
2309
+ rb_define_method(cRegExpAnalyzer, "token_stream",
2310
+ frt_re_analyzer_token_stream, 2);
2134
2311
  }
2135
2312
 
2136
2313
  /* rdoc hack
@@ -2244,6 +2421,7 @@ Init_Analysis(void)
2244
2421
  Init_LowerCaseFilter();
2245
2422
  Init_HyphenFilter();
2246
2423
  Init_StopFilter();
2424
+ Init_MappingFilter();
2247
2425
  Init_StemFilter();
2248
2426
 
2249
2427
  Init_Analyzer();
data/ext/r_search.c CHANGED
@@ -124,7 +124,6 @@ extern VALUE cIndexReader;
124
124
  extern void frt_ir_free(void *p);
125
125
  extern void frt_ir_mark(void *p);
126
126
 
127
-
128
127
  extern void frt_set_term(VALUE rterm, Term *t);
129
128
  extern VALUE frt_get_analyzer(Analyzer *a);
130
129
  extern HashSet *frt_get_fields(VALUE rfields);
@@ -223,6 +222,113 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
223
222
  return rstr;
224
223
  }
225
224
 
225
+ /*
226
+ * Json Exportation - Loading each LazyDoc and formatting them into json
227
+ * This code is designed to get a VERY FAST json string, the goal was speed,
228
+ * not sexyness.
229
+ * Jeremie 'ahFeel' BORDIER
230
+ * ahFeel@rift.Fr
231
+ */
232
+ __inline char *
233
+ json_concat_string(char *s, char *field)
234
+ {
235
+ *(s++) = '"';
236
+ while (*field) {
237
+ if (*field == '\"') {
238
+ *(s++) = '\'';
239
+ *(s++) = *(field++);
240
+ *(s++) = '\'';
241
+ }
242
+ else {
243
+ *(s++) = *(field++);
244
+ }
245
+ }
246
+ *(s++) = '"';
247
+ return s;
248
+ }
249
+
250
+ inline char *
251
+ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
252
+ {
253
+ int i, j;
254
+ int diff = s - *str;
255
+ int len = diff, l;
256
+ LazyDocField *f;
257
+
258
+ for (i = 0; i < lzd->size; i++) {
259
+ f = lzd->fields[i];
260
+ /* 3 times length of field to make space for quoted quotes ('"') and
261
+ * 4 x field length to make space for '"' around fields and ','
262
+ * between fields. Add 100 for '[', ']' and good safety.
263
+ */
264
+ len += strlen(f->name) + f->len * 3 + 100 + 4 * f->size;
265
+ }
266
+
267
+ if (len > *slen) {
268
+ while (len > *slen) *slen = *slen << 1;
269
+ REALLOC_N(*str, char, *slen);
270
+ s = *str + diff;
271
+ }
272
+
273
+ for (i = 0; i < lzd->size; i++) {
274
+ f = lzd->fields[i];
275
+ if (i) *(s++) = ',';
276
+ *(s++) = '"';
277
+ l = strlen(f->name);
278
+ memcpy(s, f->name, l);
279
+ s += l;
280
+ *(s++) = '"';
281
+ *(s++) = ':';
282
+ if (f->size > 1) *(s++) = '[';
283
+ for (j = 0; j < f->size; j++) {
284
+ if (j) *(s++) = ',';
285
+ s = json_concat_string(s, lazy_df_get_data(f, j));
286
+ }
287
+ if (f->size > 1) *(s++) = ']';
288
+ }
289
+ return s;
290
+ }
291
+
292
+ /*
293
+ * call-seq:
294
+ * top_doc.to_json() -> string
295
+ *
296
+ * Returns a json represention of the top_doc.
297
+ */
298
+ static VALUE
299
+ frt_td_to_json(VALUE self)
300
+ {
301
+ int i;
302
+ VALUE rhits = rb_funcall(self, id_hits, 0);
303
+ VALUE rhit;
304
+ LazyDoc *lzd;
305
+ Searcher *sea = (Searcher *)DATA_PTR(rb_funcall(self, id_searcher, 0));
306
+ const int num_hits = RARRAY(rhits)->len;
307
+ int doc_id;
308
+ int len = 32768;
309
+ char *str = ALLOC_N(char, len);
310
+ char *s = str;
311
+ VALUE rstr;
312
+
313
+ *(s++) = '[';
314
+ for (i = 0; i < num_hits; i++) {
315
+ if (i) *(s++) = ',';
316
+ *(s++) = '{';
317
+ rhit = RARRAY(rhits)->ptr[i];
318
+ doc_id = FIX2INT(rb_funcall(rhit, id_doc, 0));
319
+ lzd = sea->get_lazy_doc(sea, doc_id);
320
+ s = frt_lzd_load_to_json(lzd, &str, s, &len);
321
+ lazy_doc_close(lzd);
322
+ *(s++) = '}';
323
+ }
324
+ *(s++) = ']';
325
+ *(s++) = '\0';
326
+ rstr = rb_str_new2(str);
327
+ free(str);
328
+ return rstr;
329
+ }
330
+
331
+
226
332
  /****************************************************************************
227
333
  *
228
334
  * Explanation Methods
@@ -1901,6 +2007,7 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
1901
2007
  VALUE rval;
1902
2008
  int type = SORT_TYPE_AUTO;
1903
2009
  int is_reverse = false;
2010
+ char *field;
1904
2011
 
1905
2012
  if (rb_scan_args(argc, argv, "11", &rfield, &roptions) == 2) {
1906
2013
  if (Qnil != (rval = rb_hash_aref(roptions, sym_type))) {
@@ -1914,11 +2021,11 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
1914
2021
  }
1915
2022
  }
1916
2023
  if (NIL_P(rfield)) rb_raise(rb_eArgError, "must pass a valid field name");
1917
- rfield = rb_obj_as_string(rfield);
2024
+ field = frt_field(rfield);
1918
2025
 
1919
- sf = sort_field_new(RSTRING(rfield)->ptr, type, is_reverse);
1920
- if (sf->field == NULL && RSTRING(rfield)->ptr != NULL) {
1921
- sf->field = estrdup(RSTRING(rfield)->ptr);
2026
+ sf = sort_field_new(field, type, is_reverse);
2027
+ if (sf->field == NULL && field) {
2028
+ sf->field = estrdup(field);
1922
2029
  }
1923
2030
 
1924
2031
  Frt_Wrap_Struct(self, NULL, &frt_sf_free, sf);
@@ -2017,7 +2124,6 @@ frt_sort_free(void *p)
2017
2124
  {
2018
2125
  Sort *sort = (Sort *)p;
2019
2126
  object_del(sort);
2020
- object_del(sort->sort_fields);
2021
2127
  sort_destroy(sort);
2022
2128
  }
2023
2129
 
@@ -2025,7 +2131,10 @@ static void
2025
2131
  frt_sort_mark(void *p)
2026
2132
  {
2027
2133
  Sort *sort = (Sort *)p;
2028
- frt_gc_mark(sort->sort_fields);
2134
+ int i;
2135
+ for (i = 0; i < sort->size; i++) {
2136
+ frt_gc_mark(sort->sort_fields[i]);
2137
+ }
2029
2138
  }
2030
2139
 
2031
2140
  static VALUE
@@ -2147,11 +2256,6 @@ frt_sort_init(int argc, VALUE *argv, VALUE self)
2147
2256
  sort_add_sort_field(sort, (SortField *)&SORT_FIELD_SCORE);
2148
2257
  sort_add_sort_field(sort, (SortField *)&SORT_FIELD_DOC);
2149
2258
  }
2150
- rfields = rb_ary_new2(sort->size);
2151
- for (i = 0; i < sort->size; i++) {
2152
- rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
2153
- }
2154
- object_add(sort->sort_fields, rfields);
2155
2259
 
2156
2260
  return self;
2157
2261
  }
@@ -2166,7 +2270,12 @@ static VALUE
2166
2270
  frt_sort_get_fields(VALUE self)
2167
2271
  {
2168
2272
  GET_SORT();
2169
- return object_get(sort->sort_fields);
2273
+ VALUE rfields = rb_ary_new2(sort->size);
2274
+ int i;
2275
+ for (i = 0; i < sort->size; i++) {
2276
+ rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
2277
+ }
2278
+ return rfields;
2170
2279
  }
2171
2280
 
2172
2281
 
@@ -2374,9 +2483,9 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
2374
2483
  sea->arg = (void *)rval;
2375
2484
  }
2376
2485
  if (Qnil != (rval = rb_hash_aref(roptions, sym_sort))) {
2377
- if (TYPE(rval) != T_DATA) {
2486
+ if (TYPE(rval) != T_DATA || CLASS_OF(rval) == cSortField) {
2378
2487
  rval = frt_sort_init(1, &rval, frt_sort_alloc(cSort));
2379
- }
2488
+ }
2380
2489
  Data_Get_Struct(rval, Sort, sort);
2381
2490
  }
2382
2491
  }
@@ -2801,6 +2910,7 @@ Init_TopDocs(void)
2801
2910
  rb_set_class_path(cTopDocs, mSearch, td_class);
2802
2911
  rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
2803
2912
  rb_define_method(cTopDocs, "to_s", frt_td_to_s, -1);
2913
+ rb_define_method(cTopDocs, "to_json", frt_td_to_json, 0);
2804
2914
  id_hits = rb_intern("hits");
2805
2915
  id_total_hits = rb_intern("total_hits");
2806
2916
  id_max_score = rb_intern("max_score");