ferret 0.10.11 → 0.10.12
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -0
- data/Rakefile +1 -1
- data/ext/analysis.c +62 -11
- data/ext/analysis.h +11 -0
- data/ext/bitvector.c +29 -18
- data/ext/{defines.h → config.h} +0 -0
- data/ext/except.h +1 -1
- data/ext/extconf.rb +2 -1
- data/ext/fs_store.c +4 -2
- data/ext/global.h +1 -1
- data/ext/hash.c +15 -12
- data/ext/hash.h +1 -0
- data/ext/helper.c +2 -2
- data/ext/helper.h +1 -1
- data/ext/index.c +4 -2
- data/ext/index.h +2 -2
- data/ext/{mem_pool.c → mempool.c} +1 -1
- data/ext/{mem_pool.h → mempool.h} +0 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/r_analysis.c +200 -22
- data/ext/r_search.c +125 -15
- data/ext/search.c +1 -1
- data/ext/sort.c +1 -1
- data/ext/stopwords.c +2 -3
- data/lib/ferret/index.rb +2 -1
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +62 -0
- data/test/unit/index/tc_index.rb +19 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +7 -0
- metadata +9 -7
data/ext/multimapper.h
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#ifndef FRT_MAPPER_H
|
2
|
+
#define FRT_MAPPER_H
|
3
|
+
|
4
|
+
#include "hash.h"
|
5
|
+
|
6
|
+
typedef struct State
|
7
|
+
{
|
8
|
+
int (*next)(struct State *self, int c, int *states);
|
9
|
+
void (*destroy_i)(struct State *self);
|
10
|
+
int (*is_match)(struct State *self, char **mapping);
|
11
|
+
} State;
|
12
|
+
|
13
|
+
typedef struct DeterministicState
|
14
|
+
{
|
15
|
+
struct DeterministicState *next[256];
|
16
|
+
int longest_match;
|
17
|
+
char *mapping;
|
18
|
+
int mapping_len;
|
19
|
+
} DeterministicState;
|
20
|
+
|
21
|
+
typedef struct Mapping
|
22
|
+
{
|
23
|
+
char *pattern;
|
24
|
+
char *replacement;
|
25
|
+
} Mapping;
|
26
|
+
|
27
|
+
typedef struct MultiMapper
|
28
|
+
{
|
29
|
+
Mapping **mappings;
|
30
|
+
int size;
|
31
|
+
int capa;
|
32
|
+
DeterministicState **dstates;
|
33
|
+
int d_size;
|
34
|
+
int d_capa;
|
35
|
+
unsigned char alphabet[256];
|
36
|
+
int a_size;
|
37
|
+
HashTable *dstates_map;
|
38
|
+
State **nstates;
|
39
|
+
int nsize;
|
40
|
+
int *next_states;
|
41
|
+
int ref_cnt;
|
42
|
+
} MultiMapper;
|
43
|
+
|
44
|
+
extern MultiMapper *mulmap_new();
|
45
|
+
extern void mulmap_add_mapping(MultiMapper *self, const char *p, const char *r);
|
46
|
+
extern void mulmap_compile(MultiMapper *self);
|
47
|
+
extern char *mulmap_map(MultiMapper *self, char *to, char *from, int capa);
|
48
|
+
extern int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa);
|
49
|
+
extern void mulmap_destroy(MultiMapper *self);
|
50
|
+
|
51
|
+
#endif
|
data/ext/r_analysis.c
CHANGED
@@ -20,6 +20,7 @@ static VALUE cRegExpTokenizer;
|
|
20
20
|
static VALUE cAsciiLowerCaseFilter;
|
21
21
|
static VALUE cLowerCaseFilter;
|
22
22
|
static VALUE cStopFilter;
|
23
|
+
static VALUE cMappingFilter;
|
23
24
|
static VALUE cHyphenFilter;
|
24
25
|
static VALUE cStemFilter;
|
25
26
|
|
@@ -48,13 +49,11 @@ static VALUE object_space;
|
|
48
49
|
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
|
49
50
|
int, struct re_registers *);
|
50
51
|
|
51
|
-
|
52
|
-
static int
|
52
|
+
int
|
53
53
|
frt_rb_hash_size(VALUE hash)
|
54
54
|
{
|
55
55
|
return RHASH(hash)->tbl->num_entries;
|
56
56
|
}
|
57
|
-
*/
|
58
57
|
|
59
58
|
/****************************************************************************
|
60
59
|
*
|
@@ -468,8 +467,8 @@ frt_ts_get_text(VALUE self)
|
|
468
467
|
VALUE rtext = Qnil;
|
469
468
|
TokenStream *ts;
|
470
469
|
Data_Get_Struct(self, TokenStream, ts);
|
471
|
-
if (ts->text) {
|
472
|
-
if (
|
470
|
+
if ((rtext = object_get(&ts->text)) == Qnil) {
|
471
|
+
if (ts->text) {
|
473
472
|
rtext = rb_str_new2(ts->text);
|
474
473
|
object_set(&ts->text, rtext);
|
475
474
|
}
|
@@ -539,7 +538,7 @@ typedef struct CWrappedTokenStream {
|
|
539
538
|
static void
|
540
539
|
cwrts_destroy_i(TokenStream *ts)
|
541
540
|
{
|
542
|
-
rb_hash_delete(object_space,
|
541
|
+
rb_hash_delete(object_space, ((long)ts)|1);
|
543
542
|
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
544
543
|
free(ts);
|
545
544
|
}
|
@@ -563,7 +562,8 @@ static TokenStream *
|
|
563
562
|
cwrts_clone_i(TokenStream *orig_ts)
|
564
563
|
{
|
565
564
|
TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
|
566
|
-
CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
|
565
|
+
VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
|
566
|
+
rb_hash_aset(object_space, ((long)new_ts)|1, rts);
|
567
567
|
return new_ts;
|
568
568
|
}
|
569
569
|
|
@@ -583,7 +583,7 @@ frt_get_cwrapped_rts(VALUE rts)
|
|
583
583
|
ts->clone_i = &cwrts_clone_i;
|
584
584
|
ts->destroy_i = &cwrts_destroy_i;
|
585
585
|
/* prevent from being garbage collected */
|
586
|
-
rb_hash_aset(object_space,
|
586
|
+
rb_hash_aset(object_space, ((long)ts)|1, rts);
|
587
587
|
ts->ref_cnt = 1;
|
588
588
|
}
|
589
589
|
return ts;
|
@@ -621,6 +621,8 @@ typedef struct RegExpTokenStream {
|
|
621
621
|
static void
|
622
622
|
rets_destroy_i(TokenStream *ts)
|
623
623
|
{
|
624
|
+
rb_hash_delete(object_space, ((long)ts)|1);
|
625
|
+
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
624
626
|
free(ts);
|
625
627
|
}
|
626
628
|
|
@@ -656,6 +658,7 @@ frt_rets_set_text(VALUE self, VALUE rtext)
|
|
656
658
|
TokenStream *ts;
|
657
659
|
GET_TS(ts, self);
|
658
660
|
|
661
|
+
rb_hash_aset(object_space, ((long)ts)|1, rtext);
|
659
662
|
StringValue(rtext);
|
660
663
|
RETS(ts)->rtext = rtext;
|
661
664
|
RETS(ts)->curr_ind = 0;
|
@@ -723,12 +726,12 @@ rets_clone_i(TokenStream *orig_ts)
|
|
723
726
|
static TokenStream *
|
724
727
|
rets_new(VALUE rtext, VALUE regex, VALUE proc)
|
725
728
|
{
|
726
|
-
TokenStream *ts;
|
729
|
+
TokenStream *ts = ts_new(RegExpTokenStream);
|
727
730
|
|
728
731
|
if (rtext != Qnil) {
|
729
732
|
rtext = StringValue(rtext);
|
733
|
+
rb_hash_aset(object_space, ((long)ts)|1, rtext);
|
730
734
|
}
|
731
|
-
ts = ts_new(RegExpTokenStream);
|
732
735
|
ts->reset = &rets_reset;
|
733
736
|
ts->next = &rets_next;
|
734
737
|
ts->clone_i = &rets_clone_i;
|
@@ -769,9 +772,6 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
|
|
769
772
|
|
770
773
|
Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
|
771
774
|
object_add(ts, self);
|
772
|
-
/* no need to add to object space as it is going to ruby space
|
773
|
-
* rb_hash_aset(object_space, LONG2NUM((long)self), self);
|
774
|
-
*/
|
775
775
|
return self;
|
776
776
|
}
|
777
777
|
|
@@ -973,6 +973,96 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
|
973
973
|
return self;
|
974
974
|
}
|
975
975
|
|
976
|
+
static __inline void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
|
977
|
+
{
|
978
|
+
switch (TYPE(from)) {
|
979
|
+
case T_STRING:
|
980
|
+
mapping_filter_add(mf, RSTRING(from)->ptr, to);
|
981
|
+
break;
|
982
|
+
case T_SYMBOL:
|
983
|
+
mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
|
984
|
+
break;
|
985
|
+
default:
|
986
|
+
rb_raise(rb_eArgError,
|
987
|
+
"cannot map from %s with MappingFilter",
|
988
|
+
RSTRING(rb_obj_as_string(from))->ptr);
|
989
|
+
break;
|
990
|
+
}
|
991
|
+
}
|
992
|
+
|
993
|
+
static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
|
994
|
+
{
|
995
|
+
if (key == Qundef) {
|
996
|
+
return ST_CONTINUE;
|
997
|
+
} else {
|
998
|
+
TokenStream *mf = (TokenStream *)arg;
|
999
|
+
char *to;
|
1000
|
+
switch (TYPE(value)) {
|
1001
|
+
case T_STRING:
|
1002
|
+
to = RSTRING(value)->ptr;
|
1003
|
+
break;
|
1004
|
+
case T_SYMBOL:
|
1005
|
+
to = rb_id2name(SYM2ID(value));
|
1006
|
+
break;
|
1007
|
+
default:
|
1008
|
+
rb_raise(rb_eArgError,
|
1009
|
+
"cannot map to %s with MappingFilter",
|
1010
|
+
RSTRING(rb_obj_as_string(key))->ptr);
|
1011
|
+
break;
|
1012
|
+
}
|
1013
|
+
if (TYPE(key) == T_ARRAY) {
|
1014
|
+
int i;
|
1015
|
+
for (i = RARRAY(key)->len - 1; i >= 0; i--) {
|
1016
|
+
frt_add_mapping_i(mf, RARRAY(key)->ptr[i], to);
|
1017
|
+
}
|
1018
|
+
}
|
1019
|
+
else {
|
1020
|
+
frt_add_mapping_i(mf, key, to);
|
1021
|
+
}
|
1022
|
+
}
|
1023
|
+
return ST_CONTINUE;
|
1024
|
+
}
|
1025
|
+
|
1026
|
+
|
1027
|
+
/*
|
1028
|
+
* call-seq:
|
1029
|
+
* MappingFilter.new(token_stream, mapping) -> token_stream
|
1030
|
+
*
|
1031
|
+
* Create a MappingFilter which maps strings in tokens. This is usually used
|
1032
|
+
* to map UTF-8 characters to ascii characters for easier searching and
|
1033
|
+
* better search recall. The mapping is compiled into a Deterministic Finite
|
1034
|
+
* Automaton so it is super fast. This Filter can therefore be used for
|
1035
|
+
* indexing very large datasets. Currently regular expressions are not
|
1036
|
+
* supported. If you are really interested in the feature, please contact me
|
1037
|
+
* at dbalmain@gmail.com.
|
1038
|
+
*
|
1039
|
+
* token_stream:: TokenStream to be filtered
|
1040
|
+
* mapping:: Hash of mappings to apply to tokens. The key can be a
|
1041
|
+
* String or an Array of Strings. The value must be a String
|
1042
|
+
*
|
1043
|
+
* == Example
|
1044
|
+
*
|
1045
|
+
* filt = MappingFilter.new(token_stream,
|
1046
|
+
* {
|
1047
|
+
* ['à','á','â','ã','ä','å'] => 'a',
|
1048
|
+
* ['è','é','ê','ë','ē','ę'] => 'e'
|
1049
|
+
* })
|
1050
|
+
*/
|
1051
|
+
static VALUE
|
1052
|
+
frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
|
1053
|
+
{
|
1054
|
+
TokenStream *ts;
|
1055
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
1056
|
+
ts = mapping_filter_new(ts);
|
1057
|
+
rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
|
1058
|
+
mulmap_compile(((MappingFilter *)ts)->mapper);
|
1059
|
+
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
1060
|
+
|
1061
|
+
Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
|
1062
|
+
object_add(ts, self);
|
1063
|
+
return self;
|
1064
|
+
}
|
1065
|
+
|
976
1066
|
/*
|
977
1067
|
* call-seq:
|
978
1068
|
* StemFilter.new(token_stream) -> token_stream
|
@@ -1031,7 +1121,7 @@ typedef struct CWrappedAnalyzer
|
|
1031
1121
|
static void
|
1032
1122
|
cwa_destroy_i(Analyzer *a)
|
1033
1123
|
{
|
1034
|
-
rb_hash_delete(object_space,
|
1124
|
+
rb_hash_delete(object_space, ((long)a)|1);
|
1035
1125
|
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
1036
1126
|
free(a);
|
1037
1127
|
}
|
@@ -1059,7 +1149,7 @@ frt_get_cwrapped_analyzer(VALUE ranalyzer)
|
|
1059
1149
|
a->ref_cnt = 1;
|
1060
1150
|
((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
|
1061
1151
|
/* prevent from being garbage collected */
|
1062
|
-
rb_hash_aset(object_space,
|
1152
|
+
rb_hash_aset(object_space, ((long)a)|1, ranalyzer);
|
1063
1153
|
}
|
1064
1154
|
return a;
|
1065
1155
|
}
|
@@ -1100,6 +1190,8 @@ frt_get_analyzer(Analyzer *a)
|
|
1100
1190
|
static VALUE
|
1101
1191
|
frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
1102
1192
|
{
|
1193
|
+
/* NOTE: Any changes made to this method may also need to be applied to
|
1194
|
+
* frt_re_analyzer_token_stream */
|
1103
1195
|
TokenStream *ts;
|
1104
1196
|
Analyzer *a;
|
1105
1197
|
GET_A(a, self);
|
@@ -1121,7 +1213,7 @@ lower = (argc ? RTEST(rlower) : dflt)
|
|
1121
1213
|
|
1122
1214
|
/*
|
1123
1215
|
* call-seq:
|
1124
|
-
* AsciiWhiteSpaceAnalyzer.new(lower =
|
1216
|
+
* AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
|
1125
1217
|
*
|
1126
1218
|
* Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
|
1127
1219
|
* but can optionally leave case as is. Lowercasing will only be done to
|
@@ -1142,7 +1234,7 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1142
1234
|
|
1143
1235
|
/*
|
1144
1236
|
* call-seq:
|
1145
|
-
* WhiteSpaceAnalyzer.new(lower =
|
1237
|
+
* WhiteSpaceAnalyzer.new(lower = false) -> analyzer
|
1146
1238
|
*
|
1147
1239
|
* Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
|
1148
1240
|
* optionally leave case as is. Lowercasing will be done based on the current
|
@@ -1220,7 +1312,7 @@ get_rstopwords(const char **stop_words)
|
|
1220
1312
|
|
1221
1313
|
/*
|
1222
1314
|
* call-seq:
|
1223
|
-
* AsciiStandardAnalyzer.new(lower = true, stop_words =
|
1315
|
+
* AsciiStandardAnalyzer.new(lower = true, stop_words = FULL_ENGLISH_STOP_WORDS)
|
1224
1316
|
* -> analyzer
|
1225
1317
|
*
|
1226
1318
|
* Create a new AsciiStandardAnalyzer which downcases tokens by default but
|
@@ -1253,7 +1345,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1253
1345
|
|
1254
1346
|
/*
|
1255
1347
|
* call-seq:
|
1256
|
-
* StandardAnalyzer.new(stop_words=
|
1348
|
+
* StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
|
1257
1349
|
* -> analyzer
|
1258
1350
|
*
|
1259
1351
|
* Create a new StandardAnalyzer which downcases tokens by default but can
|
@@ -1377,7 +1469,6 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1377
1469
|
|
1378
1470
|
ts = rets_new(Qnil, regex, proc);
|
1379
1471
|
rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
|
1380
|
-
/* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
|
1381
1472
|
object_add(ts, rets);
|
1382
1473
|
|
1383
1474
|
if (lower != Qfalse) {
|
@@ -1392,6 +1483,41 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1392
1483
|
return self;
|
1393
1484
|
}
|
1394
1485
|
|
1486
|
+
/*
|
1487
|
+
* call-seq:
|
1488
|
+
* analyzer.token_stream(field_name, input) -> token_stream
|
1489
|
+
*
|
1490
|
+
* Create a new TokenStream to tokenize +input+. The TokenStream created may
|
1491
|
+
* also depend on the +field_name+. Although this parameter is typically
|
1492
|
+
* ignored.
|
1493
|
+
*
|
1494
|
+
* field_name:: name of the field to be tokenized
|
1495
|
+
* input:: data from the field to be tokenized
|
1496
|
+
*/
|
1497
|
+
static VALUE
|
1498
|
+
frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
|
1499
|
+
{
|
1500
|
+
TokenStream *ts;
|
1501
|
+
Analyzer *a;
|
1502
|
+
GET_A(a, self);
|
1503
|
+
|
1504
|
+
StringValue(rtext);
|
1505
|
+
|
1506
|
+
ts = a_get_ts(a, frt_field(rfield), RSTRING(rtext)->ptr);
|
1507
|
+
|
1508
|
+
/* Make sure that there is no entry already */
|
1509
|
+
object_set(&ts->text, rtext);
|
1510
|
+
if (ts->next == &rets_next) {
|
1511
|
+
RETS(ts)->rtext = rtext;
|
1512
|
+
rb_hash_aset(object_space, ((long)ts)|1, rtext);
|
1513
|
+
}
|
1514
|
+
else {
|
1515
|
+
RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
|
1516
|
+
rb_hash_aset(object_space, ((long)((TokenFilter*)ts)->sub_ts)|1, rtext);
|
1517
|
+
}
|
1518
|
+
return get_rb_token_stream(ts);
|
1519
|
+
}
|
1520
|
+
|
1395
1521
|
/****************************************************************************
|
1396
1522
|
*
|
1397
1523
|
* Locale stuff
|
@@ -1728,6 +1854,55 @@ static void Init_HyphenFilter(void)
|
|
1728
1854
|
rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
|
1729
1855
|
}
|
1730
1856
|
|
1857
|
+
/*
|
1858
|
+
* Document-class: Ferret::Analysis::MappingFilter
|
1859
|
+
*
|
1860
|
+
* A MappingFilter maps strings in tokens. This is usually used to map UTF-8
|
1861
|
+
* characters to ascii characters for easier searching and better search
|
1862
|
+
* recall. The mapping is compiled into a Deterministic Finite Automaton so it
|
1863
|
+
* is super fast. This Filter can therefore be used for indexing very large
|
1864
|
+
* datasets. Currently regular expressions are not supported. If you are
|
1865
|
+
* really interested in the feature, please contact me at dbalmain@gmail.com.
|
1866
|
+
*
|
1867
|
+
* == Example
|
1868
|
+
*
|
1869
|
+
* mapping = {
|
1870
|
+
* ['à','á','â','ã','ä','å','ā','ă'] => 'a',
|
1871
|
+
* 'æ' => 'ae',
|
1872
|
+
* ['ď','đ'] => 'd',
|
1873
|
+
* ['ç','ć','č','ĉ','ċ'] => 'c',
|
1874
|
+
* ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
|
1875
|
+
* ['ƒ'] => 'f',
|
1876
|
+
* ['ĝ','ğ','ġ','ģ'] => 'g',
|
1877
|
+
* ['ĥ','ħ'] => 'h',
|
1878
|
+
* ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
|
1879
|
+
* ['į','ı','ij','ĵ'] => 'j',
|
1880
|
+
* ['ķ','ĸ'] => 'k',
|
1881
|
+
* ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
|
1882
|
+
* ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
|
1883
|
+
* ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
|
1884
|
+
* ['œ'] => 'oek',
|
1885
|
+
* ['ą'] => 'q',
|
1886
|
+
* ['ŕ','ř','ŗ'] => 'r',
|
1887
|
+
* ['ś','š','ş','ŝ','ș'] => 's',
|
1888
|
+
* ['ť','ţ','ŧ','ț'] => 't',
|
1889
|
+
* ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
|
1890
|
+
* ['ŵ'] => 'w',
|
1891
|
+
* ['ý','ÿ','ŷ'] => 'y',
|
1892
|
+
* ['ž','ż','ź'] => 'z'
|
1893
|
+
* }
|
1894
|
+
* filt = MappingFilter.new(token_stream, mapping)
|
1895
|
+
*/
|
1896
|
+
static void Init_MappingFilter(void)
|
1897
|
+
{
|
1898
|
+
cMappingFilter =
|
1899
|
+
rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
|
1900
|
+
frt_mark_cclass(cMappingFilter);
|
1901
|
+
rb_define_alloc_func(cMappingFilter, frt_data_alloc);
|
1902
|
+
rb_define_method(cMappingFilter, "initialize",
|
1903
|
+
frt_mapping_filter_init, 2);
|
1904
|
+
}
|
1905
|
+
|
1731
1906
|
/*
|
1732
1907
|
* Document-class: Ferret::Analysis::StopFilter
|
1733
1908
|
*
|
@@ -1999,7 +2174,7 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
1999
2174
|
* ascii-analyzers. If it were implemented in Ruby it would look like this;
|
2000
2175
|
*
|
2001
2176
|
* class AsciiStandardAnalyzer
|
2002
|
-
* def initialize(stop_words =
|
2177
|
+
* def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
|
2003
2178
|
* @lower = lower
|
2004
2179
|
* @stop_words = stop_words
|
2005
2180
|
* end
|
@@ -2036,7 +2211,7 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2036
2211
|
* it were implemented in Ruby it would look like this;
|
2037
2212
|
*
|
2038
2213
|
* class StandardAnalyzer
|
2039
|
-
* def initialize(stop_words =
|
2214
|
+
* def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
|
2040
2215
|
* @lower = lower
|
2041
2216
|
* @stop_words = stop_words
|
2042
2217
|
* end
|
@@ -2131,6 +2306,8 @@ static void Init_RegExpAnalyzer(void)
|
|
2131
2306
|
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
2132
2307
|
rb_define_method(cRegExpAnalyzer, "initialize",
|
2133
2308
|
frt_re_analyzer_init, -1);
|
2309
|
+
rb_define_method(cRegExpAnalyzer, "token_stream",
|
2310
|
+
frt_re_analyzer_token_stream, 2);
|
2134
2311
|
}
|
2135
2312
|
|
2136
2313
|
/* rdoc hack
|
@@ -2244,6 +2421,7 @@ Init_Analysis(void)
|
|
2244
2421
|
Init_LowerCaseFilter();
|
2245
2422
|
Init_HyphenFilter();
|
2246
2423
|
Init_StopFilter();
|
2424
|
+
Init_MappingFilter();
|
2247
2425
|
Init_StemFilter();
|
2248
2426
|
|
2249
2427
|
Init_Analyzer();
|
data/ext/r_search.c
CHANGED
@@ -124,7 +124,6 @@ extern VALUE cIndexReader;
|
|
124
124
|
extern void frt_ir_free(void *p);
|
125
125
|
extern void frt_ir_mark(void *p);
|
126
126
|
|
127
|
-
|
128
127
|
extern void frt_set_term(VALUE rterm, Term *t);
|
129
128
|
extern VALUE frt_get_analyzer(Analyzer *a);
|
130
129
|
extern HashSet *frt_get_fields(VALUE rfields);
|
@@ -223,6 +222,113 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
|
223
222
|
return rstr;
|
224
223
|
}
|
225
224
|
|
225
|
+
/*
|
226
|
+
* Json Exportation - Loading each LazyDoc and formatting them into json
|
227
|
+
* This code is designed to get a VERY FAST json string, the goal was speed,
|
228
|
+
* not sexyness.
|
229
|
+
* Jeremie 'ahFeel' BORDIER
|
230
|
+
* ahFeel@rift.Fr
|
231
|
+
*/
|
232
|
+
__inline char *
|
233
|
+
json_concat_string(char *s, char *field)
|
234
|
+
{
|
235
|
+
*(s++) = '"';
|
236
|
+
while (*field) {
|
237
|
+
if (*field == '\"') {
|
238
|
+
*(s++) = '\'';
|
239
|
+
*(s++) = *(field++);
|
240
|
+
*(s++) = '\'';
|
241
|
+
}
|
242
|
+
else {
|
243
|
+
*(s++) = *(field++);
|
244
|
+
}
|
245
|
+
}
|
246
|
+
*(s++) = '"';
|
247
|
+
return s;
|
248
|
+
}
|
249
|
+
|
250
|
+
inline char *
|
251
|
+
frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
|
252
|
+
{
|
253
|
+
int i, j;
|
254
|
+
int diff = s - *str;
|
255
|
+
int len = diff, l;
|
256
|
+
LazyDocField *f;
|
257
|
+
|
258
|
+
for (i = 0; i < lzd->size; i++) {
|
259
|
+
f = lzd->fields[i];
|
260
|
+
/* 3 times length of field to make space for quoted quotes ('"') and
|
261
|
+
* 4 x field size to make space for '"' around values and ','
|
262
|
+
* between fields. Add 100 for '[', ']' and good safety.
|
263
|
+
*/
|
264
|
+
len += strlen(f->name) + f->len * 3 + 100 + 4 * f->size;
|
265
|
+
}
|
266
|
+
|
267
|
+
if (len > *slen) {
|
268
|
+
while (len > *slen) *slen = *slen << 1;
|
269
|
+
REALLOC_N(*str, char, *slen);
|
270
|
+
s = *str + diff;
|
271
|
+
}
|
272
|
+
|
273
|
+
for (i = 0; i < lzd->size; i++) {
|
274
|
+
f = lzd->fields[i];
|
275
|
+
if (i) *(s++) = ',';
|
276
|
+
*(s++) = '"';
|
277
|
+
l = strlen(f->name);
|
278
|
+
memcpy(s, f->name, l);
|
279
|
+
s += l;
|
280
|
+
*(s++) = '"';
|
281
|
+
*(s++) = ':';
|
282
|
+
if (f->size > 1) *(s++) = '[';
|
283
|
+
for (j = 0; j < f->size; j++) {
|
284
|
+
if (j) *(s++) = ',';
|
285
|
+
s = json_concat_string(s, lazy_df_get_data(f, j));
|
286
|
+
}
|
287
|
+
if (f->size > 1) *(s++) = ']';
|
288
|
+
}
|
289
|
+
return s;
|
290
|
+
}
|
291
|
+
|
292
|
+
/*
|
293
|
+
* call-seq:
|
294
|
+
* top_doc.to_json() -> string
|
295
|
+
*
|
296
|
+
* Returns a json representation of the top_doc.
|
297
|
+
*/
|
298
|
+
static VALUE
|
299
|
+
frt_td_to_json(VALUE self)
|
300
|
+
{
|
301
|
+
int i;
|
302
|
+
VALUE rhits = rb_funcall(self, id_hits, 0);
|
303
|
+
VALUE rhit;
|
304
|
+
LazyDoc *lzd;
|
305
|
+
Searcher *sea = (Searcher *)DATA_PTR(rb_funcall(self, id_searcher, 0));
|
306
|
+
const int num_hits = RARRAY(rhits)->len;
|
307
|
+
int doc_id;
|
308
|
+
int len = 32768;
|
309
|
+
char *str = ALLOC_N(char, len);
|
310
|
+
char *s = str;
|
311
|
+
VALUE rstr;
|
312
|
+
|
313
|
+
*(s++) = '[';
|
314
|
+
for (i = 0; i < num_hits; i++) {
|
315
|
+
if (i) *(s++) = ',';
|
316
|
+
*(s++) = '{';
|
317
|
+
rhit = RARRAY(rhits)->ptr[i];
|
318
|
+
doc_id = FIX2INT(rb_funcall(rhit, id_doc, 0));
|
319
|
+
lzd = sea->get_lazy_doc(sea, doc_id);
|
320
|
+
s = frt_lzd_load_to_json(lzd, &str, s, &len);
|
321
|
+
lazy_doc_close(lzd);
|
322
|
+
*(s++) = '}';
|
323
|
+
}
|
324
|
+
*(s++) = ']';
|
325
|
+
*(s++) = '\0';
|
326
|
+
rstr = rb_str_new2(str);
|
327
|
+
free(str);
|
328
|
+
return rstr;
|
329
|
+
}
|
330
|
+
|
331
|
+
|
226
332
|
/****************************************************************************
|
227
333
|
*
|
228
334
|
* Explanation Methods
|
@@ -1901,6 +2007,7 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
|
|
1901
2007
|
VALUE rval;
|
1902
2008
|
int type = SORT_TYPE_AUTO;
|
1903
2009
|
int is_reverse = false;
|
2010
|
+
char *field;
|
1904
2011
|
|
1905
2012
|
if (rb_scan_args(argc, argv, "11", &rfield, &roptions) == 2) {
|
1906
2013
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_type))) {
|
@@ -1914,11 +2021,11 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
|
|
1914
2021
|
}
|
1915
2022
|
}
|
1916
2023
|
if (NIL_P(rfield)) rb_raise(rb_eArgError, "must pass a valid field name");
|
1917
|
-
|
2024
|
+
field = frt_field(rfield);
|
1918
2025
|
|
1919
|
-
sf = sort_field_new(
|
1920
|
-
if (sf->field == NULL &&
|
1921
|
-
sf->field = estrdup(
|
2026
|
+
sf = sort_field_new(field, type, is_reverse);
|
2027
|
+
if (sf->field == NULL && field) {
|
2028
|
+
sf->field = estrdup(field);
|
1922
2029
|
}
|
1923
2030
|
|
1924
2031
|
Frt_Wrap_Struct(self, NULL, &frt_sf_free, sf);
|
@@ -2017,7 +2124,6 @@ frt_sort_free(void *p)
|
|
2017
2124
|
{
|
2018
2125
|
Sort *sort = (Sort *)p;
|
2019
2126
|
object_del(sort);
|
2020
|
-
object_del(sort->sort_fields);
|
2021
2127
|
sort_destroy(sort);
|
2022
2128
|
}
|
2023
2129
|
|
@@ -2025,7 +2131,10 @@ static void
|
|
2025
2131
|
frt_sort_mark(void *p)
|
2026
2132
|
{
|
2027
2133
|
Sort *sort = (Sort *)p;
|
2028
|
-
|
2134
|
+
int i;
|
2135
|
+
for (i = 0; i < sort->size; i++) {
|
2136
|
+
frt_gc_mark(sort->sort_fields[i]);
|
2137
|
+
}
|
2029
2138
|
}
|
2030
2139
|
|
2031
2140
|
static VALUE
|
@@ -2147,11 +2256,6 @@ frt_sort_init(int argc, VALUE *argv, VALUE self)
|
|
2147
2256
|
sort_add_sort_field(sort, (SortField *)&SORT_FIELD_SCORE);
|
2148
2257
|
sort_add_sort_field(sort, (SortField *)&SORT_FIELD_DOC);
|
2149
2258
|
}
|
2150
|
-
rfields = rb_ary_new2(sort->size);
|
2151
|
-
for (i = 0; i < sort->size; i++) {
|
2152
|
-
rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
|
2153
|
-
}
|
2154
|
-
object_add(sort->sort_fields, rfields);
|
2155
2259
|
|
2156
2260
|
return self;
|
2157
2261
|
}
|
@@ -2166,7 +2270,12 @@ static VALUE
|
|
2166
2270
|
frt_sort_get_fields(VALUE self)
|
2167
2271
|
{
|
2168
2272
|
GET_SORT();
|
2169
|
-
|
2273
|
+
VALUE rfields = rb_ary_new2(sort->size);
|
2274
|
+
int i;
|
2275
|
+
for (i = 0; i < sort->size; i++) {
|
2276
|
+
rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
|
2277
|
+
}
|
2278
|
+
return rfields;
|
2170
2279
|
}
|
2171
2280
|
|
2172
2281
|
|
@@ -2374,9 +2483,9 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
|
|
2374
2483
|
sea->arg = (void *)rval;
|
2375
2484
|
}
|
2376
2485
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_sort))) {
|
2377
|
-
if (TYPE(rval) != T_DATA) {
|
2486
|
+
if (TYPE(rval) != T_DATA || CLASS_OF(rval) == cSortField) {
|
2378
2487
|
rval = frt_sort_init(1, &rval, frt_sort_alloc(cSort));
|
2379
|
-
}
|
2488
|
+
}
|
2380
2489
|
Data_Get_Struct(rval, Sort, sort);
|
2381
2490
|
}
|
2382
2491
|
}
|
@@ -2801,6 +2910,7 @@ Init_TopDocs(void)
|
|
2801
2910
|
rb_set_class_path(cTopDocs, mSearch, td_class);
|
2802
2911
|
rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
|
2803
2912
|
rb_define_method(cTopDocs, "to_s", frt_td_to_s, -1);
|
2913
|
+
rb_define_method(cTopDocs, "to_json", frt_td_to_json, 0);
|
2804
2914
|
id_hits = rb_intern("hits");
|
2805
2915
|
id_total_hits = rb_intern("total_hits");
|
2806
2916
|
id_max_score = rb_intern("max_score");
|