ferret 0.10.11 → 0.10.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -0
- data/Rakefile +1 -1
- data/ext/analysis.c +62 -11
- data/ext/analysis.h +11 -0
- data/ext/bitvector.c +29 -18
- data/ext/{defines.h → config.h} +0 -0
- data/ext/except.h +1 -1
- data/ext/extconf.rb +2 -1
- data/ext/fs_store.c +4 -2
- data/ext/global.h +1 -1
- data/ext/hash.c +15 -12
- data/ext/hash.h +1 -0
- data/ext/helper.c +2 -2
- data/ext/helper.h +1 -1
- data/ext/index.c +4 -2
- data/ext/index.h +2 -2
- data/ext/{mem_pool.c → mempool.c} +1 -1
- data/ext/{mem_pool.h → mempool.h} +0 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/r_analysis.c +200 -22
- data/ext/r_search.c +125 -15
- data/ext/search.c +1 -1
- data/ext/sort.c +1 -1
- data/ext/stopwords.c +2 -3
- data/lib/ferret/index.rb +2 -1
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +62 -0
- data/test/unit/index/tc_index.rb +19 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +7 -0
- metadata +9 -7
data/ext/multimapper.h
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#ifndef FRT_MAPPER_H
#define FRT_MAPPER_H

#include "hash.h"

/*
 * Abstract automaton state: a small table of callbacks that each concrete
 * state type fills in.
 * NOTE(review): the implementations live in multimapper.c, which is not
 * visible from this header; the remarks below are read off the signatures
 * and should be confirmed against that file.
 */
typedef struct State
{
    /* given input character +c+, writes follow-up state ids into +states+;
     * presumably returns how many were written -- TODO confirm */
    int  (*next)(struct State *self, int c, int *states);
    /* releases the state object itself */
    void (*destroy_i)(struct State *self);
    /* presumably non-zero when this state completes a pattern, in which case
     * the replacement is handed back through +mapping+ -- TODO confirm */
    int  (*is_match)(struct State *self, char **mapping);
} State;

/*
 * One state of the compiled automaton.
 */
typedef struct DeterministicState
{
    /* 256 slots, i.e. one per possible byte value */
    struct DeterministicState *next[256];
    /* presumably the length of the longest pattern matched on reaching this
     * state -- TODO confirm */
    int   longest_match;
    /* replacement text associated with this state and its length */
    char *mapping;
    int   mapping_len;
} DeterministicState;

/*
 * A single pattern -> replacement pair.
 */
typedef struct Mapping
{
    char *pattern;
    char *replacement;
} Mapping;

/*
 * A set of string mappings together with the automaton built from them.
 * NOTE(review): the size/capa pairs look like "used / allocated" counters for
 * the array declared just before them -- confirm in multimapper.c.
 */
typedef struct MultiMapper
{
    Mapping **mappings;             /* registered mappings */
    int size;
    int capa;
    DeterministicState **dstates;   /* deterministic states */
    int d_size;
    int d_capa;
    unsigned char alphabet[256];    /* presumably the bytes used by patterns */
    int a_size;
    HashTable *dstates_map;
    State **nstates;
    int nsize;
    int *next_states;
    int ref_cnt;                    /* reference count */
} MultiMapper;

/*
 * Public API. mulmap_new takes no arguments; it is declared with (void) so
 * that the compiler rejects calls that pass any.
 * NOTE(review): for mulmap_map / mulmap_map_len, +to+ is presumably the
 * output buffer of capacity +capa+ and +from+ the input string -- confirm.
 */
extern MultiMapper *mulmap_new(void);
extern void mulmap_add_mapping(MultiMapper *self, const char *p, const char *r);
extern void mulmap_compile(MultiMapper *self);
extern char *mulmap_map(MultiMapper *self, char *to, char *from, int capa);
extern int mulmap_map_len(MultiMapper *self, char *to, char *from, int capa);
extern void mulmap_destroy(MultiMapper *self);

#endif
|
data/ext/r_analysis.c
CHANGED
@@ -20,6 +20,7 @@ static VALUE cRegExpTokenizer;
|
|
20
20
|
static VALUE cAsciiLowerCaseFilter;
|
21
21
|
static VALUE cLowerCaseFilter;
|
22
22
|
static VALUE cStopFilter;
|
23
|
+
static VALUE cMappingFilter;
|
23
24
|
static VALUE cHyphenFilter;
|
24
25
|
static VALUE cStemFilter;
|
25
26
|
|
@@ -48,13 +49,11 @@ static VALUE object_space;
|
|
48
49
|
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
|
49
50
|
int, struct re_registers *);
|
50
51
|
|
51
|
-
|
52
|
-
static int
|
52
|
+
int
|
53
53
|
frt_rb_hash_size(VALUE hash)
|
54
54
|
{
|
55
55
|
return RHASH(hash)->tbl->num_entries;
|
56
56
|
}
|
57
|
-
*/
|
58
57
|
|
59
58
|
/****************************************************************************
|
60
59
|
*
|
@@ -468,8 +467,8 @@ frt_ts_get_text(VALUE self)
|
|
468
467
|
VALUE rtext = Qnil;
|
469
468
|
TokenStream *ts;
|
470
469
|
Data_Get_Struct(self, TokenStream, ts);
|
471
|
-
if (ts->text) {
|
472
|
-
if (
|
470
|
+
if ((rtext = object_get(&ts->text)) == Qnil) {
|
471
|
+
if (ts->text) {
|
473
472
|
rtext = rb_str_new2(ts->text);
|
474
473
|
object_set(&ts->text, rtext);
|
475
474
|
}
|
@@ -539,7 +538,7 @@ typedef struct CWrappedTokenStream {
|
|
539
538
|
static void
|
540
539
|
cwrts_destroy_i(TokenStream *ts)
|
541
540
|
{
|
542
|
-
rb_hash_delete(object_space,
|
541
|
+
rb_hash_delete(object_space, ((long)ts)|1);
|
543
542
|
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
544
543
|
free(ts);
|
545
544
|
}
|
@@ -563,7 +562,8 @@ static TokenStream *
|
|
563
562
|
cwrts_clone_i(TokenStream *orig_ts)
|
564
563
|
{
|
565
564
|
TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
|
566
|
-
CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
|
565
|
+
VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
|
566
|
+
rb_hash_aset(object_space, ((long)new_ts)|1, rts);
|
567
567
|
return new_ts;
|
568
568
|
}
|
569
569
|
|
@@ -583,7 +583,7 @@ frt_get_cwrapped_rts(VALUE rts)
|
|
583
583
|
ts->clone_i = &cwrts_clone_i;
|
584
584
|
ts->destroy_i = &cwrts_destroy_i;
|
585
585
|
/* prevent from being garbage collected */
|
586
|
-
rb_hash_aset(object_space,
|
586
|
+
rb_hash_aset(object_space, ((long)ts)|1, rts);
|
587
587
|
ts->ref_cnt = 1;
|
588
588
|
}
|
589
589
|
return ts;
|
@@ -621,6 +621,8 @@ typedef struct RegExpTokenStream {
|
|
621
621
|
static void
|
622
622
|
rets_destroy_i(TokenStream *ts)
|
623
623
|
{
|
624
|
+
rb_hash_delete(object_space, ((long)ts)|1);
|
625
|
+
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
624
626
|
free(ts);
|
625
627
|
}
|
626
628
|
|
@@ -656,6 +658,7 @@ frt_rets_set_text(VALUE self, VALUE rtext)
|
|
656
658
|
TokenStream *ts;
|
657
659
|
GET_TS(ts, self);
|
658
660
|
|
661
|
+
rb_hash_aset(object_space, ((long)ts)|1, rtext);
|
659
662
|
StringValue(rtext);
|
660
663
|
RETS(ts)->rtext = rtext;
|
661
664
|
RETS(ts)->curr_ind = 0;
|
@@ -723,12 +726,12 @@ rets_clone_i(TokenStream *orig_ts)
|
|
723
726
|
static TokenStream *
|
724
727
|
rets_new(VALUE rtext, VALUE regex, VALUE proc)
|
725
728
|
{
|
726
|
-
TokenStream *ts;
|
729
|
+
TokenStream *ts = ts_new(RegExpTokenStream);
|
727
730
|
|
728
731
|
if (rtext != Qnil) {
|
729
732
|
rtext = StringValue(rtext);
|
733
|
+
rb_hash_aset(object_space, ((long)ts)|1, rtext);
|
730
734
|
}
|
731
|
-
ts = ts_new(RegExpTokenStream);
|
732
735
|
ts->reset = &rets_reset;
|
733
736
|
ts->next = &rets_next;
|
734
737
|
ts->clone_i = &rets_clone_i;
|
@@ -769,9 +772,6 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
|
|
769
772
|
|
770
773
|
Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
|
771
774
|
object_add(ts, self);
|
772
|
-
/* no need to add to object space as it is going to ruby space
|
773
|
-
* rb_hash_aset(object_space, LONG2NUM((long)self), self);
|
774
|
-
*/
|
775
775
|
return self;
|
776
776
|
}
|
777
777
|
|
@@ -973,6 +973,96 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
|
973
973
|
return self;
|
974
974
|
}
|
975
975
|
|
976
|
+
/*
 * Register one "from -> to" pair on the mapping filter +mf+.
 *
 * +from+ may be a Ruby String or Symbol; anything else raises ArgumentError
 * whose message contains the object's string form.
 */
static __inline void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
{
    const int from_type = TYPE(from);

    if (from_type == T_STRING) {
        mapping_filter_add(mf, RSTRING(from)->ptr, to);
        return;
    }
    if (from_type == T_SYMBOL) {
        mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
        return;
    }

    /* neither String nor Symbol: refuse the key */
    rb_raise(rb_eArgError,
             "cannot map from %s with MappingFilter",
             RSTRING(rb_obj_as_string(from))->ptr);
}
|
992
|
+
|
993
|
+
/*
 * rb_hash_foreach callback: adds the mapping(s) described by one hash entry
 * to the mapping filter passed through +arg+ (a TokenStream * cast to VALUE).
 *
 * key::   a String or Symbol, or an Array of them (each element is mapped)
 * value:: the replacement, a String or Symbol
 *
 * Raises ArgumentError when the value (or any key) has another type. Always
 * returns ST_CONTINUE so that iteration carries on.
 */
static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
{
    TokenStream *mf = (TokenStream *)arg;
    char *to = NULL;

    /* entries whose key is Qundef are skipped */
    if (key == Qundef) {
        return ST_CONTINUE;
    }

    switch (TYPE(value)) {
        case T_STRING:
            to = RSTRING(value)->ptr;
            break;
        case T_SYMBOL:
            to = rb_id2name(SYM2ID(value));
            break;
        default:
            /* report the offending *value*: it is the thing we cannot map to */
            rb_raise(rb_eArgError,
                     "cannot map to %s with MappingFilter",
                     RSTRING(rb_obj_as_string(value))->ptr);
            break;
    }

    if (TYPE(key) == T_ARRAY) {
        int i;
        /* walk the array from its last element to its first */
        for (i = RARRAY(key)->len - 1; i >= 0; i--) {
            frt_add_mapping_i(mf, RARRAY(key)->ptr[i], to);
        }
    }
    else {
        frt_add_mapping_i(mf, key, to);
    }
    return ST_CONTINUE;
}
|
1025
|
+
|
1026
|
+
|
1027
|
+
/*
|
1028
|
+
* call-seq:
|
1029
|
+
* MappingFilter.new(token_stream, mapping) -> token_stream
|
1030
|
+
*
|
1031
|
+
* Create an MappingFilter which maps strings in tokens. This is usually used
|
1032
|
+
* to map UTF-8 characters to ascii characters for easier searching and
|
1033
|
+
* better searche recall. The mapping is compiled into a Deterministic Finite
|
1034
|
+
* Automata so it is super fast. This Filter can therefor be used for
|
1035
|
+
* indexing very large datasets. Currently regular expressions are not
|
1036
|
+
* supported. If you are really interested in the feature, please contact me
|
1037
|
+
* at dbalmain@gmail.com.
|
1038
|
+
*
|
1039
|
+
* token_stream:: TokenStream to be filtered
|
1040
|
+
* mapping:: Hash of mappings to apply to tokens. The key can be a
|
1041
|
+
* String or an Array of Strings. The value must be a String
|
1042
|
+
*
|
1043
|
+
* == Example
|
1044
|
+
*
|
1045
|
+
* filt = MappingFilter.new(token_stream,
|
1046
|
+
* {
|
1047
|
+
* ['à','á','â','ã','ä','å'] => 'a',
|
1048
|
+
* ['è','é','ê','ë','ē','ę'] => 'e'
|
1049
|
+
* })
|
1050
|
+
*/
|
1051
|
+
static VALUE
|
1052
|
+
frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
|
1053
|
+
{
|
1054
|
+
TokenStream *ts;
|
1055
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
1056
|
+
ts = mapping_filter_new(ts);
|
1057
|
+
rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
|
1058
|
+
mulmap_compile(((MappingFilter *)ts)->mapper);
|
1059
|
+
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
1060
|
+
|
1061
|
+
Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
|
1062
|
+
object_add(ts, self);
|
1063
|
+
return self;
|
1064
|
+
}
|
1065
|
+
|
976
1066
|
/*
|
977
1067
|
* call-seq:
|
978
1068
|
* StemFilter.new(token_stream) -> token_stream
|
@@ -1031,7 +1121,7 @@ typedef struct CWrappedAnalyzer
|
|
1031
1121
|
static void
|
1032
1122
|
cwa_destroy_i(Analyzer *a)
|
1033
1123
|
{
|
1034
|
-
rb_hash_delete(object_space,
|
1124
|
+
rb_hash_delete(object_space, ((long)a)|1);
|
1035
1125
|
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
1036
1126
|
free(a);
|
1037
1127
|
}
|
@@ -1059,7 +1149,7 @@ frt_get_cwrapped_analyzer(VALUE ranalyzer)
|
|
1059
1149
|
a->ref_cnt = 1;
|
1060
1150
|
((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
|
1061
1151
|
/* prevent from being garbage collected */
|
1062
|
-
rb_hash_aset(object_space,
|
1152
|
+
rb_hash_aset(object_space, ((long)a)|1, ranalyzer);
|
1063
1153
|
}
|
1064
1154
|
return a;
|
1065
1155
|
}
|
@@ -1100,6 +1190,8 @@ frt_get_analyzer(Analyzer *a)
|
|
1100
1190
|
static VALUE
|
1101
1191
|
frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
1102
1192
|
{
|
1193
|
+
/* NOTE: Any changes made to this method may also need to be applied to
|
1194
|
+
* frt_re_analyzer_token_stream */
|
1103
1195
|
TokenStream *ts;
|
1104
1196
|
Analyzer *a;
|
1105
1197
|
GET_A(a, self);
|
@@ -1121,7 +1213,7 @@ lower = (argc ? RTEST(rlower) : dflt)
|
|
1121
1213
|
|
1122
1214
|
/*
|
1123
1215
|
* call-seq:
|
1124
|
-
* AsciiWhiteSpaceAnalyzer.new(lower =
|
1216
|
+
* AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
|
1125
1217
|
*
|
1126
1218
|
* Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
|
1127
1219
|
* but can optionally leave case as is. Lowercasing will only be done to
|
@@ -1142,7 +1234,7 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1142
1234
|
|
1143
1235
|
/*
|
1144
1236
|
* call-seq:
|
1145
|
-
* WhiteSpaceAnalyzer.new(lower =
|
1237
|
+
* WhiteSpaceAnalyzer.new(lower = false) -> analyzer
|
1146
1238
|
*
|
1147
1239
|
* Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
|
1148
1240
|
* optionally leave case as is. Lowercasing will be done based on the current
|
@@ -1220,7 +1312,7 @@ get_rstopwords(const char **stop_words)
|
|
1220
1312
|
|
1221
1313
|
/*
|
1222
1314
|
* call-seq:
|
1223
|
-
* AsciiStandardAnalyzer.new(lower = true, stop_words =
|
1315
|
+
* AsciiStandardAnalyzer.new(lower = true, stop_words = FULL_ENGLISH_STOP_WORDS)
|
1224
1316
|
* -> analyzer
|
1225
1317
|
*
|
1226
1318
|
* Create a new AsciiStandardAnalyzer which downcases tokens by default but
|
@@ -1253,7 +1345,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1253
1345
|
|
1254
1346
|
/*
|
1255
1347
|
* call-seq:
|
1256
|
-
* StandardAnalyzer.new(stop_words=
|
1348
|
+
* StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower=true)
|
1257
1349
|
* -> analyzer
|
1258
1350
|
*
|
1259
1351
|
* Create a new StandardAnalyzer which downcases tokens by default but can
|
@@ -1377,7 +1469,6 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1377
1469
|
|
1378
1470
|
ts = rets_new(Qnil, regex, proc);
|
1379
1471
|
rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
|
1380
|
-
/* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
|
1381
1472
|
object_add(ts, rets);
|
1382
1473
|
|
1383
1474
|
if (lower != Qfalse) {
|
@@ -1392,6 +1483,41 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1392
1483
|
return self;
|
1393
1484
|
}
|
1394
1485
|
|
1486
|
+
/*
|
1487
|
+
* call-seq:
|
1488
|
+
* analyzer.token_stream(field_name, input) -> token_stream
|
1489
|
+
*
|
1490
|
+
* Create a new TokenStream to tokenize +input+. The TokenStream created may
|
1491
|
+
* also depend on the +field_name+. Although this parameter is typically
|
1492
|
+
* ignored.
|
1493
|
+
*
|
1494
|
+
* field_name:: name of the field to be tokenized
|
1495
|
+
* input:: data from the field to be tokenized
|
1496
|
+
*/
|
1497
|
+
static VALUE
|
1498
|
+
frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
|
1499
|
+
{
|
1500
|
+
TokenStream *ts;
|
1501
|
+
Analyzer *a;
|
1502
|
+
GET_A(a, self);
|
1503
|
+
|
1504
|
+
StringValue(rtext);
|
1505
|
+
|
1506
|
+
ts = a_get_ts(a, frt_field(rfield), RSTRING(rtext)->ptr);
|
1507
|
+
|
1508
|
+
/* Make sure that there is no entry already */
|
1509
|
+
object_set(&ts->text, rtext);
|
1510
|
+
if (ts->next == &rets_next) {
|
1511
|
+
RETS(ts)->rtext = rtext;
|
1512
|
+
rb_hash_aset(object_space, ((long)ts)|1, rtext);
|
1513
|
+
}
|
1514
|
+
else {
|
1515
|
+
RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
|
1516
|
+
rb_hash_aset(object_space, ((long)((TokenFilter*)ts)->sub_ts)|1, rtext);
|
1517
|
+
}
|
1518
|
+
return get_rb_token_stream(ts);
|
1519
|
+
}
|
1520
|
+
|
1395
1521
|
/****************************************************************************
|
1396
1522
|
*
|
1397
1523
|
* Locale stuff
|
@@ -1728,6 +1854,55 @@ static void Init_HyphenFilter(void)
|
|
1728
1854
|
rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
|
1729
1855
|
}
|
1730
1856
|
|
1857
|
+
/*
|
1858
|
+
* Document-class: Ferret::Analysis::MappingFilter
|
1859
|
+
*
|
1860
|
+
* A MappingFilter maps strings in tokens. This is usually used to map UTF-8
|
1861
|
+
* characters to ascii characters for easier searching and better search
|
1862
|
+
* recall. The mapping is compiled into a Deterministic Finite Automata so it
|
1863
|
+
* is super fast. This Filter can therefore be used for indexing very large
|
1864
|
+
* datasets. Currently regular expressions are not supported. If you are
|
1865
|
+
* really interested in the feature, please contact me at dbalmain@gmail.com.
|
1866
|
+
*
|
1867
|
+
* == Example
|
1868
|
+
*
|
1869
|
+
* mapping = {
|
1870
|
+
* ['à','á','â','ã','ä','å','ā','ă'] => 'a',
|
1871
|
+
* 'æ' => 'ae',
|
1872
|
+
* ['ď','đ'] => 'd',
|
1873
|
+
* ['ç','ć','č','ĉ','ċ'] => 'c',
|
1874
|
+
* ['è','é','ê','ë','ē','ę','ě','ĕ','ė',] => 'e',
|
1875
|
+
* ['ƒ'] => 'f',
|
1876
|
+
* ['ĝ','ğ','ġ','ģ'] => 'g',
|
1877
|
+
* ['ĥ','ħ'] => 'h',
|
1878
|
+
* ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
|
1879
|
+
* ['į','ı','ij','ĵ'] => 'j',
|
1880
|
+
* ['ķ','ĸ'] => 'k',
|
1881
|
+
* ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
|
1882
|
+
* ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
|
1883
|
+
* ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
|
1884
|
+
* ['œ'] => 'oe',
|
1885
|
+
* ['ą'] => 'q',
|
1886
|
+
* ['ŕ','ř','ŗ'] => 'r',
|
1887
|
+
* ['ś','š','ş','ŝ','ș'] => 's',
|
1888
|
+
* ['ť','ţ','ŧ','ț'] => 't',
|
1889
|
+
* ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
|
1890
|
+
* ['ŵ'] => 'w',
|
1891
|
+
* ['ý','ÿ','ŷ'] => 'y',
|
1892
|
+
* ['ž','ż','ź'] => 'z'
|
1893
|
+
* }
|
1894
|
+
* filt = MappingFilter.new(token_stream, mapping)
|
1895
|
+
*/
|
1896
|
+
static void Init_MappingFilter(void)
|
1897
|
+
{
|
1898
|
+
cMappingFilter =
|
1899
|
+
rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
|
1900
|
+
frt_mark_cclass(cMappingFilter);
|
1901
|
+
rb_define_alloc_func(cMappingFilter, frt_data_alloc);
|
1902
|
+
rb_define_method(cMappingFilter, "initialize",
|
1903
|
+
frt_mapping_filter_init, 2);
|
1904
|
+
}
|
1905
|
+
|
1731
1906
|
/*
|
1732
1907
|
* Document-class: Ferret::Analysis::StopFilter
|
1733
1908
|
*
|
@@ -1999,7 +2174,7 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
1999
2174
|
* ascii-analyzers. If it were implemented in Ruby it would look like this;
|
2000
2175
|
*
|
2001
2176
|
* class AsciiStandardAnalyzer
|
2002
|
-
* def initialize(stop_words =
|
2177
|
+
* def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
|
2003
2178
|
* @lower = lower
|
2004
2179
|
* @stop_words = stop_words
|
2005
2180
|
* end
|
@@ -2036,7 +2211,7 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2036
2211
|
* it were implemented in Ruby it would look like this;
|
2037
2212
|
*
|
2038
2213
|
* class StandardAnalyzer
|
2039
|
-
* def initialize(stop_words =
|
2214
|
+
* def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
|
2040
2215
|
* @lower = lower
|
2041
2216
|
* @stop_words = stop_words
|
2042
2217
|
* end
|
@@ -2131,6 +2306,8 @@ static void Init_RegExpAnalyzer(void)
|
|
2131
2306
|
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
2132
2307
|
rb_define_method(cRegExpAnalyzer, "initialize",
|
2133
2308
|
frt_re_analyzer_init, -1);
|
2309
|
+
rb_define_method(cRegExpAnalyzer, "token_stream",
|
2310
|
+
frt_re_analyzer_token_stream, 2);
|
2134
2311
|
}
|
2135
2312
|
|
2136
2313
|
/* rdoc hack
|
@@ -2244,6 +2421,7 @@ Init_Analysis(void)
|
|
2244
2421
|
Init_LowerCaseFilter();
|
2245
2422
|
Init_HyphenFilter();
|
2246
2423
|
Init_StopFilter();
|
2424
|
+
Init_MappingFilter();
|
2247
2425
|
Init_StemFilter();
|
2248
2426
|
|
2249
2427
|
Init_Analyzer();
|
data/ext/r_search.c
CHANGED
@@ -124,7 +124,6 @@ extern VALUE cIndexReader;
|
|
124
124
|
extern void frt_ir_free(void *p);
|
125
125
|
extern void frt_ir_mark(void *p);
|
126
126
|
|
127
|
-
|
128
127
|
extern void frt_set_term(VALUE rterm, Term *t);
|
129
128
|
extern VALUE frt_get_analyzer(Analyzer *a);
|
130
129
|
extern HashSet *frt_get_fields(VALUE rfields);
|
@@ -223,6 +222,113 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
|
223
222
|
return rstr;
|
224
223
|
}
|
225
224
|
|
225
|
+
/*
|
226
|
+
* Json Exportation - Loading each LazyDoc and formatting them into json
|
227
|
+
* This code is designed to get a VERY FAST json string, the goal was speed,
|
228
|
+
* not sexyness.
|
229
|
+
* Jeremie 'ahFeel' BORDIER
|
230
|
+
* ahFeel@rift.Fr
|
231
|
+
*/
|
232
|
+
/*
 * Append +field+ to the buffer at +s+ as a double-quoted JSON string and
 * return a pointer just past the closing quote. No terminating NUL is
 * written.
 *
 * Double quotes, backslashes and the control characters \b \f \n \r \t are
 * emitted as two-character backslash escapes so the result stays a valid
 * JSON string. Other bytes are copied unchanged (remaining control
 * characters are therefore NOT escaped).
 *
 * The caller must provide room for at most 2 * strlen(field) + 2 bytes.
 *
 * Declared static so that a definition is always emitted in this translation
 * unit; a plain (non-static) inline definition provides no external symbol
 * under C99 inline rules.
 */
static __inline char *
json_concat_string(char *s, char *field)
{
    *(s++) = '"';
    for (; *field; field++) {
        char esc = 0;   /* second character of the escape, 0 if none needed */

        switch (*field) {
            case '"':  esc = '"';  break;
            case '\\': esc = '\\'; break;
            case '\b': esc = 'b';  break;
            case '\f': esc = 'f';  break;
            case '\n': esc = 'n';  break;
            case '\r': esc = 'r';  break;
            case '\t': esc = 't';  break;
            default:               break;
        }

        if (esc) {
            *(s++) = '\\';
            *(s++) = esc;
        }
        else {
            *(s++) = *field;
        }
    }
    *(s++) = '"';
    return s;
}
|
249
|
+
|
250
|
+
inline char *
|
251
|
+
frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
|
252
|
+
{
|
253
|
+
int i, j;
|
254
|
+
int diff = s - *str;
|
255
|
+
int len = diff, l;
|
256
|
+
LazyDocField *f;
|
257
|
+
|
258
|
+
for (i = 0; i < lzd->size; i++) {
|
259
|
+
f = lzd->fields[i];
|
260
|
+
/* 3 times length of field to make space for quoted quotes ('"') and
|
261
|
+
* 4 x field length to make space for '"' around fields and ','
|
262
|
+
* between fields. Add 100 for '[', ']' and good safety.
|
263
|
+
*/
|
264
|
+
len += strlen(f->name) + f->len * 3 + 100 + 4 * f->size;
|
265
|
+
}
|
266
|
+
|
267
|
+
if (len > *slen) {
|
268
|
+
while (len > *slen) *slen = *slen << 1;
|
269
|
+
REALLOC_N(*str, char, *slen);
|
270
|
+
s = *str + diff;
|
271
|
+
}
|
272
|
+
|
273
|
+
for (i = 0; i < lzd->size; i++) {
|
274
|
+
f = lzd->fields[i];
|
275
|
+
if (i) *(s++) = ',';
|
276
|
+
*(s++) = '"';
|
277
|
+
l = strlen(f->name);
|
278
|
+
memcpy(s, f->name, l);
|
279
|
+
s += l;
|
280
|
+
*(s++) = '"';
|
281
|
+
*(s++) = ':';
|
282
|
+
if (f->size > 1) *(s++) = '[';
|
283
|
+
for (j = 0; j < f->size; j++) {
|
284
|
+
if (j) *(s++) = ',';
|
285
|
+
s = json_concat_string(s, lazy_df_get_data(f, j));
|
286
|
+
}
|
287
|
+
if (f->size > 1) *(s++) = ']';
|
288
|
+
}
|
289
|
+
return s;
|
290
|
+
}
|
291
|
+
|
292
|
+
/*
|
293
|
+
* call-seq:
|
294
|
+
* top_doc.to_json() -> string
|
295
|
+
*
|
296
|
+
* Returns a json represention of the top_doc.
|
297
|
+
*/
|
298
|
+
static VALUE
|
299
|
+
frt_td_to_json(VALUE self)
|
300
|
+
{
|
301
|
+
int i;
|
302
|
+
VALUE rhits = rb_funcall(self, id_hits, 0);
|
303
|
+
VALUE rhit;
|
304
|
+
LazyDoc *lzd;
|
305
|
+
Searcher *sea = (Searcher *)DATA_PTR(rb_funcall(self, id_searcher, 0));
|
306
|
+
const int num_hits = RARRAY(rhits)->len;
|
307
|
+
int doc_id;
|
308
|
+
int len = 32768;
|
309
|
+
char *str = ALLOC_N(char, len);
|
310
|
+
char *s = str;
|
311
|
+
VALUE rstr;
|
312
|
+
|
313
|
+
*(s++) = '[';
|
314
|
+
for (i = 0; i < num_hits; i++) {
|
315
|
+
if (i) *(s++) = ',';
|
316
|
+
*(s++) = '{';
|
317
|
+
rhit = RARRAY(rhits)->ptr[i];
|
318
|
+
doc_id = FIX2INT(rb_funcall(rhit, id_doc, 0));
|
319
|
+
lzd = sea->get_lazy_doc(sea, doc_id);
|
320
|
+
s = frt_lzd_load_to_json(lzd, &str, s, &len);
|
321
|
+
lazy_doc_close(lzd);
|
322
|
+
*(s++) = '}';
|
323
|
+
}
|
324
|
+
*(s++) = ']';
|
325
|
+
*(s++) = '\0';
|
326
|
+
rstr = rb_str_new2(str);
|
327
|
+
free(str);
|
328
|
+
return rstr;
|
329
|
+
}
|
330
|
+
|
331
|
+
|
226
332
|
/****************************************************************************
|
227
333
|
*
|
228
334
|
* Explanation Methods
|
@@ -1901,6 +2007,7 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
|
|
1901
2007
|
VALUE rval;
|
1902
2008
|
int type = SORT_TYPE_AUTO;
|
1903
2009
|
int is_reverse = false;
|
2010
|
+
char *field;
|
1904
2011
|
|
1905
2012
|
if (rb_scan_args(argc, argv, "11", &rfield, &roptions) == 2) {
|
1906
2013
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_type))) {
|
@@ -1914,11 +2021,11 @@ frt_sf_init(int argc, VALUE *argv, VALUE self)
|
|
1914
2021
|
}
|
1915
2022
|
}
|
1916
2023
|
if (NIL_P(rfield)) rb_raise(rb_eArgError, "must pass a valid field name");
|
1917
|
-
|
2024
|
+
field = frt_field(rfield);
|
1918
2025
|
|
1919
|
-
sf = sort_field_new(
|
1920
|
-
if (sf->field == NULL &&
|
1921
|
-
sf->field = estrdup(
|
2026
|
+
sf = sort_field_new(field, type, is_reverse);
|
2027
|
+
if (sf->field == NULL && field) {
|
2028
|
+
sf->field = estrdup(field);
|
1922
2029
|
}
|
1923
2030
|
|
1924
2031
|
Frt_Wrap_Struct(self, NULL, &frt_sf_free, sf);
|
@@ -2017,7 +2124,6 @@ frt_sort_free(void *p)
|
|
2017
2124
|
{
|
2018
2125
|
Sort *sort = (Sort *)p;
|
2019
2126
|
object_del(sort);
|
2020
|
-
object_del(sort->sort_fields);
|
2021
2127
|
sort_destroy(sort);
|
2022
2128
|
}
|
2023
2129
|
|
@@ -2025,7 +2131,10 @@ static void
|
|
2025
2131
|
frt_sort_mark(void *p)
|
2026
2132
|
{
|
2027
2133
|
Sort *sort = (Sort *)p;
|
2028
|
-
|
2134
|
+
int i;
|
2135
|
+
for (i = 0; i < sort->size; i++) {
|
2136
|
+
frt_gc_mark(sort->sort_fields[i]);
|
2137
|
+
}
|
2029
2138
|
}
|
2030
2139
|
|
2031
2140
|
static VALUE
|
@@ -2147,11 +2256,6 @@ frt_sort_init(int argc, VALUE *argv, VALUE self)
|
|
2147
2256
|
sort_add_sort_field(sort, (SortField *)&SORT_FIELD_SCORE);
|
2148
2257
|
sort_add_sort_field(sort, (SortField *)&SORT_FIELD_DOC);
|
2149
2258
|
}
|
2150
|
-
rfields = rb_ary_new2(sort->size);
|
2151
|
-
for (i = 0; i < sort->size; i++) {
|
2152
|
-
rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
|
2153
|
-
}
|
2154
|
-
object_add(sort->sort_fields, rfields);
|
2155
2259
|
|
2156
2260
|
return self;
|
2157
2261
|
}
|
@@ -2166,7 +2270,12 @@ static VALUE
|
|
2166
2270
|
frt_sort_get_fields(VALUE self)
|
2167
2271
|
{
|
2168
2272
|
GET_SORT();
|
2169
|
-
|
2273
|
+
VALUE rfields = rb_ary_new2(sort->size);
|
2274
|
+
int i;
|
2275
|
+
for (i = 0; i < sort->size; i++) {
|
2276
|
+
rb_ary_store(rfields, i, object_get(sort->sort_fields[i]));
|
2277
|
+
}
|
2278
|
+
return rfields;
|
2170
2279
|
}
|
2171
2280
|
|
2172
2281
|
|
@@ -2374,9 +2483,9 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
|
|
2374
2483
|
sea->arg = (void *)rval;
|
2375
2484
|
}
|
2376
2485
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_sort))) {
|
2377
|
-
if (TYPE(rval) != T_DATA) {
|
2486
|
+
if (TYPE(rval) != T_DATA || CLASS_OF(rval) == cSortField) {
|
2378
2487
|
rval = frt_sort_init(1, &rval, frt_sort_alloc(cSort));
|
2379
|
-
}
|
2488
|
+
}
|
2380
2489
|
Data_Get_Struct(rval, Sort, sort);
|
2381
2490
|
}
|
2382
2491
|
}
|
@@ -2801,6 +2910,7 @@ Init_TopDocs(void)
|
|
2801
2910
|
rb_set_class_path(cTopDocs, mSearch, td_class);
|
2802
2911
|
rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
|
2803
2912
|
rb_define_method(cTopDocs, "to_s", frt_td_to_s, -1);
|
2913
|
+
rb_define_method(cTopDocs, "to_json", frt_td_to_json, 0);
|
2804
2914
|
id_hits = rb_intern("hits");
|
2805
2915
|
id_total_hits = rb_intern("total_hits");
|
2806
2916
|
id_max_score = rb_intern("max_score");
|