ferret 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +23 -5
- data/TODO +2 -1
- data/ext/analysis.c +838 -177
- data/ext/analysis.h +55 -7
- data/ext/api.c +69 -0
- data/ext/api.h +27 -0
- data/ext/array.c +8 -5
- data/ext/compound_io.c +132 -96
- data/ext/document.c +58 -28
- data/ext/except.c +59 -0
- data/ext/except.h +88 -0
- data/ext/ferret.c +47 -3
- data/ext/ferret.h +3 -0
- data/ext/field.c +15 -9
- data/ext/filter.c +1 -1
- data/ext/fs_store.c +215 -34
- data/ext/global.c +72 -3
- data/ext/global.h +4 -3
- data/ext/hash.c +44 -3
- data/ext/hash.h +9 -0
- data/ext/header.h +58 -0
- data/ext/inc/except.h +88 -0
- data/ext/inc/lang.h +23 -13
- data/ext/ind.c +16 -10
- data/ext/index.h +2 -22
- data/ext/index_io.c +3 -11
- data/ext/index_rw.c +245 -193
- data/ext/lang.h +23 -13
- data/ext/libstemmer.c +92 -0
- data/ext/libstemmer.h +79 -0
- data/ext/modules.h +162 -0
- data/ext/q_boolean.c +34 -21
- data/ext/q_const_score.c +6 -12
- data/ext/q_filtered_query.c +206 -0
- data/ext/q_fuzzy.c +18 -15
- data/ext/q_match_all.c +3 -7
- data/ext/q_multi_phrase.c +10 -14
- data/ext/q_parser.c +29 -2
- data/ext/q_phrase.c +14 -21
- data/ext/q_prefix.c +15 -12
- data/ext/q_range.c +30 -28
- data/ext/q_span.c +13 -21
- data/ext/q_term.c +17 -26
- data/ext/r_analysis.c +693 -21
- data/ext/r_doc.c +11 -12
- data/ext/r_index_io.c +4 -1
- data/ext/r_qparser.c +21 -2
- data/ext/r_search.c +285 -18
- data/ext/ram_store.c +5 -2
- data/ext/search.c +11 -17
- data/ext/search.h +21 -45
- data/ext/similarity.h +67 -0
- data/ext/sort.c +30 -25
- data/ext/stem_ISO_8859_1_danish.c +338 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.c +635 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_english.c +1156 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.c +792 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_french.c +1276 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_german.c +512 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_italian.c +1091 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.c +296 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_porter.c +776 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.c +1119 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_KOI8_R_russian.c +701 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_UTF_8_danish.c +344 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_dutch.c +653 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_english.c +1176 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_finnish.c +808 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_french.c +1296 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_german.c +526 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_italian.c +1113 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_norwegian.c +302 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_porter.c +794 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_portuguese.c +1055 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_russian.c +709 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_spanish.c +1137 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_swedish.c +313 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stopwords.c +325 -0
- data/ext/store.c +34 -2
- data/ext/tags +2953 -0
- data/ext/term.c +21 -15
- data/ext/termdocs.c +5 -3
- data/ext/utilities.c +446 -0
- data/ext/vector.c +27 -13
- data/lib/ferret/document/document.rb +1 -1
- data/lib/ferret/index/index.rb +44 -6
- data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
- data/lib/rferret.rb +2 -1
- data/test/test_helper.rb +2 -2
- data/test/unit/analysis/ctc_analyzer.rb +401 -0
- data/test/unit/analysis/ctc_tokenstream.rb +423 -0
- data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
- data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
- data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
- data/test/unit/analysis/tc_analyzer.rb +1 -2
- data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
- data/test/unit/document/rtc_field.rb +28 -0
- data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
- data/test/unit/document/tc_field.rb +82 -12
- data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
- data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
- data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
- data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
- data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
- data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
- data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
- data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
- data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
- data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
- data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
- data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
- data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
- data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
- data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
- data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
- data/test/unit/query_parser/tc_query_parser.rb +24 -16
- data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
- data/test/unit/search/rtc_sort_field.rb +14 -0
- data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
- data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
- data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
- data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
- data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +20 -7
- data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
- data/test/unit/store/rtc_fs_store.rb +62 -0
- data/test/unit/store/rtc_ram_store.rb +15 -0
- data/test/unit/store/rtm_store.rb +150 -0
- data/test/unit/store/rtm_store_lock.rb +2 -0
- data/test/unit/store/tc_fs_store.rb +54 -40
- data/test/unit/store/tc_ram_store.rb +20 -0
- data/test/unit/store/tm_store.rb +30 -146
- data/test/unit/store/tm_store_lock.rb +66 -0
- data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
- data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
- data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
- data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
- data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
- data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
- data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
- data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
- metadata +360 -289
- data/test/unit/document/c_field.rb +0 -98
- data/test/unit/search/c_sort_field.rb +0 -27
- data/test/unit/store/c_fs_store.rb +0 -76
- data/test/unit/store/c_ram_store.rb +0 -35
- data/test/unit/store/m_store.rb +0 -34
- data/test/unit/store/m_store_lock.rb +0 -68
data/ext/q_prefix.c
CHANGED
@@ -17,7 +17,7 @@ char *prq_to_s(Query *self, char *field)
|
|
17
17
|
|
18
18
|
if (strcmp(term->field, field) != 0) {
|
19
19
|
sprintf(bptr, "%s:", term->field);
|
20
|
-
bptr
|
20
|
+
bptr += strlen(bptr);
|
21
21
|
}
|
22
22
|
sprintf(bptr, "%s*", term->text);
|
23
23
|
if (self->boost != 1.0) {
|
@@ -38,17 +38,20 @@ Query *prq_rewrite(Query *self, IndexReader *ir)
|
|
38
38
|
Query *tq;
|
39
39
|
Query *bq = bq_create(true);
|
40
40
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
41
|
+
TRY
|
42
|
+
do {
|
43
|
+
TermBuffer *tb = te->tb_curr;
|
44
|
+
if (!tb || strcmp(tb->field, prefix_field) != 0 ||
|
45
|
+
strncmp(tb->text, prefix_text, prefix_length) != 0) {
|
46
|
+
break;
|
47
|
+
}
|
48
|
+
tq = tq_create(term_create(tb->field, tb->text)); // found a match
|
49
|
+
tq->boost = self->boost; // set the boost
|
50
|
+
bq_add_query(bq, tq, BC_SHOULD); // add to query
|
51
|
+
} while (te->next(te));
|
52
|
+
XFINALLY
|
53
|
+
te->close(te);
|
54
|
+
XENDTRY
|
52
55
|
|
53
56
|
if (self->rewritten) self->rewritten->destroy(self->rewritten);
|
54
57
|
return self->rewritten = bq;
|
data/ext/q_range.c
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
3
|
|
4
|
+
static char * const NIL_BOUNDS_ERROR_MSG = "At least one value must be non-nil";
|
5
|
+
static char * const LOWER_BOUND_ERROR_MSG = "The lower bound must be non-nil to be inclusive";
|
6
|
+
static char * const UPPER_BOUND_ERROR_MSG = "The upper bound must be non-nil to be inclusive";
|
7
|
+
static char * const BOUND_ORDER_ERROR_MSG = "The lower bound must less than the upper bound";
|
8
|
+
|
4
9
|
/*****************************************************************************
|
5
10
|
*
|
6
11
|
* Range
|
@@ -9,51 +14,50 @@
|
|
9
14
|
|
10
15
|
char *range_to_s(Range *range, char *field, float boost)
|
11
16
|
{
|
12
|
-
char *buffer, *
|
17
|
+
char *buffer, *b;
|
13
18
|
int flen, llen, ulen;
|
14
19
|
|
15
20
|
flen = strlen(range->field);
|
16
21
|
llen = range->lower_term ? strlen(range->lower_term) : 0;
|
17
22
|
ulen = range->upper_term ? strlen(range->upper_term) : 0;
|
18
23
|
buffer = ALLOC_N(char, flen + llen + ulen + 40);
|
19
|
-
|
24
|
+
b = buffer;
|
20
25
|
|
21
26
|
if (strcmp(field, range->field)) {
|
22
27
|
memcpy(buffer, range->field, flen * sizeof(char));
|
23
|
-
|
24
|
-
*
|
25
|
-
|
28
|
+
b += flen;
|
29
|
+
*b = ':';
|
30
|
+
b++;
|
26
31
|
}
|
27
32
|
|
28
33
|
if (range->lower_term) {
|
29
|
-
*
|
30
|
-
|
31
|
-
memcpy(
|
32
|
-
|
34
|
+
*b = range->include_lower ? '[' : '{';
|
35
|
+
b++;
|
36
|
+
memcpy(b, range->lower_term, llen);
|
37
|
+
b += llen;
|
33
38
|
} else {
|
34
|
-
*
|
35
|
-
|
39
|
+
*b = '<';
|
40
|
+
b++;
|
36
41
|
}
|
37
42
|
|
38
43
|
if (range->upper_term && range->lower_term) {
|
39
|
-
*
|
44
|
+
*b = ' '; b++;
|
40
45
|
}
|
41
46
|
|
42
47
|
if (range->upper_term) {
|
43
|
-
memcpy(
|
44
|
-
|
45
|
-
*
|
46
|
-
|
48
|
+
memcpy(b, range->upper_term, ulen);
|
49
|
+
b += ulen;
|
50
|
+
*b = range->include_upper ? ']' : '}';
|
51
|
+
b++;
|
47
52
|
} else {
|
48
|
-
*
|
49
|
-
|
53
|
+
*b = '>';
|
54
|
+
b++;
|
50
55
|
}
|
51
56
|
|
52
|
-
*
|
57
|
+
*b = 0;
|
53
58
|
if (boost != 1.0) {
|
54
|
-
|
55
|
-
dbl_to_s(
|
56
|
-
sprintf(bptr, "^%s", dbuf);
|
59
|
+
*b = '^';
|
60
|
+
dbl_to_s(b + 1, boost);
|
57
61
|
}
|
58
62
|
return buffer;
|
59
63
|
}
|
@@ -73,15 +77,13 @@ Range *range_create(const char *field, char *lower_term, char *upper_term,
|
|
73
77
|
Range *range;
|
74
78
|
|
75
79
|
if (!lower_term && !upper_term)
|
76
|
-
|
80
|
+
RAISE(ARG_ERROR, NIL_BOUNDS_ERROR_MSG);
|
77
81
|
if (include_lower && !lower_term)
|
78
|
-
|
82
|
+
RAISE(ARG_ERROR, LOWER_BOUND_ERROR_MSG);
|
79
83
|
if (include_upper && !upper_term)
|
80
|
-
|
84
|
+
RAISE(ARG_ERROR, UPPER_BOUND_ERROR_MSG);
|
81
85
|
if (upper_term && lower_term && (strcmp(upper_term, lower_term) < 0))
|
82
|
-
|
83
|
-
"The lower bound must less than the upper bound, %s > %s",
|
84
|
-
upper_term, upper_term);
|
86
|
+
RAISE(ARG_ERROR, BOUND_ORDER_ERROR_MSG);
|
85
87
|
|
86
88
|
range = ALLOC(Range);
|
87
89
|
|
data/ext/q_span.c
CHANGED
@@ -39,21 +39,19 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
|
|
39
39
|
}
|
40
40
|
|
41
41
|
Explanation *expl = expl_create(0.0,
|
42
|
-
|
43
|
-
strlen(query_str) + 20,
|
44
|
-
query_str, target));
|
42
|
+
strfmt("weight(%s in %d), product of:", query_str, target));
|
45
43
|
|
46
44
|
/* We need two of these as it's included in both the query explanation
|
47
45
|
* and the field explanation */
|
48
46
|
Explanation *idf_expl1 = expl_create(self->idf,
|
49
|
-
|
47
|
+
strfmt("idf(%s: %s)", field, doc_freqs));
|
50
48
|
Explanation *idf_expl2 = expl_create(self->idf,
|
51
|
-
|
49
|
+
strfmt("idf(%s: %s)", field, doc_freqs));
|
52
50
|
if (terms->size > 0) free(doc_freqs); /* only free if allocated */
|
53
51
|
|
54
52
|
/* explain query weight */
|
55
53
|
Explanation *query_expl = expl_create(0.0,
|
56
|
-
|
54
|
+
strfmt("query_weight(%s), product of:", query_str));
|
57
55
|
|
58
56
|
if (self->query->boost != 1.0) {
|
59
57
|
expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
|
@@ -70,9 +68,7 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
|
|
70
68
|
|
71
69
|
/* explain field weight */
|
72
70
|
Explanation *field_expl = expl_create(0.0,
|
73
|
-
|
74
|
-
strlen(field) + strlen(query_str) + 20,
|
75
|
-
field, query_str, target));
|
71
|
+
strfmt("field_weight(%s:%s in %d), product of:", field, query_str, target));
|
76
72
|
free(query_str);
|
77
73
|
|
78
74
|
Scorer *scorer = self->scorer(self, ir);
|
@@ -84,8 +80,7 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
|
|
84
80
|
uchar *field_norms = ir->get_norms(ir, field);
|
85
81
|
float field_norm = (field_norms ? sim_decode_norm(self->similarity, field_norms[target]) : 0.0);
|
86
82
|
Explanation *field_norm_expl = expl_create(field_norm,
|
87
|
-
|
88
|
-
strlen(field) + 20, field, target));
|
83
|
+
strfmt("field_norm(field=%s, doc=%d)", field, target));
|
89
84
|
expl_add_detail(field_expl, field_norm_expl);
|
90
85
|
|
91
86
|
field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
|
@@ -103,9 +98,7 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
|
|
103
98
|
|
104
99
|
char *spanw_to_s(Weight *self)
|
105
100
|
{
|
106
|
-
|
107
|
-
dbl_to_s(dbuf, self->value);
|
108
|
-
return epstrdup("SpanWeight(%s)", strlen(dbuf), dbuf);
|
101
|
+
return strfmt("SpanWeight(%f)", self->value);
|
109
102
|
}
|
110
103
|
|
111
104
|
void spanw_destroy(void *p)
|
@@ -317,7 +310,7 @@ char *spanfe_to_s(SpanEnum *self)
|
|
317
310
|
{
|
318
311
|
char *field = ((SpanQuery *)self->query->data)->field;
|
319
312
|
char *query_str = self->query->to_s(self->query, field);
|
320
|
-
char *res =
|
313
|
+
char *res = strfmt("SpanFirstEnum(%s)", query_str);
|
321
314
|
free(query_str);
|
322
315
|
return res;
|
323
316
|
}
|
@@ -888,7 +881,7 @@ char *spanxe_to_s(SpanEnum *self)
|
|
888
881
|
{
|
889
882
|
char *field = ((SpanQuery *)self->query->data)->field;
|
890
883
|
char *query_str = self->query->to_s(self->query, field);
|
891
|
-
char *res =
|
884
|
+
char *res = strfmt("SpanNotEnum(%s)", query_str);
|
892
885
|
free(query_str);
|
893
886
|
return res;
|
894
887
|
}
|
@@ -954,7 +947,7 @@ char *spantq_to_s(Query *self, char *field)
|
|
954
947
|
} else {
|
955
948
|
term_str = term_to_s(term);
|
956
949
|
}
|
957
|
-
res =
|
950
|
+
res = strfmt("span_term(%s)", term_str);
|
958
951
|
free(term_str);
|
959
952
|
return res;
|
960
953
|
}
|
@@ -1015,7 +1008,7 @@ char *spanfq_to_s(Query *self, char *field)
|
|
1015
1008
|
SpanFirstQuery *sfq = (SpanFirstQuery *)((SpanQuery *)self->data)->data;
|
1016
1009
|
Query *match = sfq->match;
|
1017
1010
|
char *q_str = match->to_s(match, field);
|
1018
|
-
char *res =
|
1011
|
+
char *res = strfmt("span_first(%s, %d)", q_str, sfq->end);
|
1019
1012
|
free(q_str);
|
1020
1013
|
return res;
|
1021
1014
|
}
|
@@ -1372,8 +1365,7 @@ char *spanxq_to_s(Query *self, char *field)
|
|
1372
1365
|
SpanNotQuery *sxq = (SpanNotQuery *)((SpanQuery *)self->data)->data;
|
1373
1366
|
char *inc_s = sxq->inc->to_s(sxq->inc, field);
|
1374
1367
|
char *exc_s = sxq->exc->to_s(sxq->exc, field);
|
1375
|
-
char *res =
|
1376
|
-
strlen(inc_s) + strlen(exc_s), inc_s, exc_s);
|
1368
|
+
char *res = strfmt("span_not(inc:<%s>, exc:<%s>)", inc_s, exc_s);
|
1377
1369
|
|
1378
1370
|
free(inc_s);
|
1379
1371
|
free(exc_s);
|
@@ -1526,7 +1518,7 @@ Explanation *spansc_explain(Scorer *self, int target)
|
|
1526
1518
|
phrase_freq = (self->doc == target) ? spansc->freq : 0.0;
|
1527
1519
|
|
1528
1520
|
Explanation *tf_explanation = expl_create(sim_tf(self->similarity, phrase_freq),
|
1529
|
-
|
1521
|
+
strfmt("tf(phrase_freq(%f)", phrase_freq));
|
1530
1522
|
|
1531
1523
|
return tf_explanation;
|
1532
1524
|
}
|
data/ext/q_term.c
CHANGED
@@ -24,20 +24,18 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
24
24
|
char *field_name = term->field;
|
25
25
|
|
26
26
|
Explanation *expl = expl_create(0.0,
|
27
|
-
|
28
|
-
strlen(query_str) + 20,
|
29
|
-
query_str, doc_num));
|
27
|
+
strfmt("weight(%s in %d), product of:", query_str, doc_num));
|
30
28
|
|
31
29
|
// We need two of these as it's included in both the query explanation
|
32
30
|
// and the field explanation
|
33
31
|
Explanation *idf_expl1 = expl_create(self->idf,
|
34
|
-
|
32
|
+
strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
|
35
33
|
Explanation *idf_expl2 = expl_create(self->idf,
|
36
|
-
|
34
|
+
strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
|
37
35
|
|
38
36
|
// explain query weight
|
39
37
|
Explanation *query_expl = expl_create(0.0,
|
40
|
-
|
38
|
+
strfmt("query_weight(%s), product of:", query_str));
|
41
39
|
free(query_str);
|
42
40
|
|
43
41
|
if (self->query->boost != 1.0) {
|
@@ -55,8 +53,7 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
55
53
|
|
56
54
|
// explain field weight
|
57
55
|
Explanation *field_expl = expl_create(0.0,
|
58
|
-
|
59
|
-
strlen(field_name) + strlen(term->text) + 20,
|
56
|
+
strfmt("field_weight(%s:%s in %d), product of:",
|
60
57
|
field_name, term->text, doc_num));
|
61
58
|
|
62
59
|
Scorer *scorer = self->scorer(self, ir);
|
@@ -68,8 +65,7 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
68
65
|
uchar *field_norms = ir->get_norms(ir, field_name);
|
69
66
|
float field_norm = (field_norms ? sim_decode_norm(self->similarity, field_norms[doc_num]) : 0.0);
|
70
67
|
Explanation *field_norm_expl = expl_create(field_norm,
|
71
|
-
|
72
|
-
strlen(field_name) + 20, field_name, doc_num));
|
68
|
+
strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
|
73
69
|
expl_add_detail(field_expl, field_norm_expl);
|
74
70
|
|
75
71
|
field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
|
@@ -87,9 +83,7 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
87
83
|
|
88
84
|
char *tw_to_s(Weight *self)
|
89
85
|
{
|
90
|
-
|
91
|
-
dbl_to_s(dbuf, self->value);
|
92
|
-
return epstrdup("TermWeight(%#.5g)", strlen(dbuf), dbuf);
|
86
|
+
return strfmt("TermWeight(%f)", self->value);
|
93
87
|
}
|
94
88
|
|
95
89
|
void tw_destroy(void *p)
|
@@ -141,19 +135,18 @@ char *tq_to_s(Query *self, char *field)
|
|
141
135
|
int flen = strlen(term->field);
|
142
136
|
int tlen = strlen(term->text);
|
143
137
|
char *buffer = ALLOC_N(char, 34 + flen + tlen);
|
144
|
-
char *
|
138
|
+
char *b = buffer;
|
145
139
|
if (strcmp(field, term->field) != 0) {
|
146
|
-
memcpy(
|
147
|
-
|
148
|
-
|
140
|
+
memcpy(b, term->field, sizeof(char) * flen);
|
141
|
+
b[flen] = ':';
|
142
|
+
b += flen + 1;
|
149
143
|
}
|
150
|
-
memcpy(
|
151
|
-
|
152
|
-
*
|
144
|
+
memcpy(b, term->text, tlen);
|
145
|
+
b += tlen;
|
146
|
+
*b = 0;
|
153
147
|
if (self->boost != 1.0) {
|
154
|
-
|
155
|
-
dbl_to_s(
|
156
|
-
sprintf(bp, "^%s", dbuf);
|
148
|
+
*b = '^';
|
149
|
+
dbl_to_s(b+1, self->boost);
|
157
150
|
}
|
158
151
|
return buffer;
|
159
152
|
}
|
@@ -268,9 +261,7 @@ Explanation *tsc_explain(Scorer *self, int doc_num)
|
|
268
261
|
tde->close(tde);
|
269
262
|
ts->tde = NULL;
|
270
263
|
Explanation *tf_explanation = expl_create(sim_tf(self->similarity, tf),
|
271
|
-
|
272
|
-
strlen(term->field) + strlen(term->text) + 20,
|
273
|
-
term->field, term->text, tf));
|
264
|
+
strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
|
274
265
|
|
275
266
|
return tf_explanation;
|
276
267
|
}
|
data/ext/r_analysis.c
CHANGED
@@ -1,17 +1,62 @@
|
|
1
1
|
#include "ferret.h"
|
2
2
|
#include "analysis.h"
|
3
|
+
#include "locale.h"
|
3
4
|
|
4
5
|
static VALUE cToken;
|
6
|
+
static VALUE cAsciiLetterTokenizer;
|
5
7
|
static VALUE cLetterTokenizer;
|
8
|
+
static VALUE cAsciiWhiteSpaceTokenizer;
|
9
|
+
static VALUE cWhiteSpaceTokenizer;
|
10
|
+
static VALUE cAsciiStandardTokenizer;
|
11
|
+
static VALUE cStandardTokenizer;
|
12
|
+
|
13
|
+
static VALUE cAsciiLowerCaseFilter;
|
14
|
+
static VALUE cLowerCaseFilter;
|
15
|
+
static VALUE cStopFilter;
|
16
|
+
static VALUE cStemFilter;
|
6
17
|
|
7
18
|
static VALUE cAnalyzer;
|
19
|
+
static VALUE cAsciiLetterAnalyzer;
|
8
20
|
static VALUE cLetterAnalyzer;
|
21
|
+
static VALUE cAsciiWhiteSpaceAnalyzer;
|
9
22
|
static VALUE cWhiteSpaceAnalyzer;
|
23
|
+
static VALUE cAsciiStandardAnalyzer;
|
10
24
|
static VALUE cStandardAnalyzer;
|
25
|
+
static VALUE cPerFieldAnalyzer;
|
26
|
+
|
27
|
+
//static VALUE cRegexAnalyzer;
|
28
|
+
static VALUE cTokenStream;
|
29
|
+
|
30
|
+
static ID id_next;
|
31
|
+
static ID id_reset;
|
32
|
+
static ID id_clone;
|
11
33
|
|
12
34
|
/****************************************************************************
|
13
35
|
*
|
14
|
-
*
|
36
|
+
* Utility Methods
|
37
|
+
*
|
38
|
+
****************************************************************************/
|
39
|
+
|
40
|
+
static char **
|
41
|
+
get_stopwords(VALUE rstop_words)
|
42
|
+
{
|
43
|
+
char **stop_words;
|
44
|
+
int i, len;
|
45
|
+
VALUE rstr;
|
46
|
+
Check_Type(rstop_words, T_ARRAY);
|
47
|
+
len = RARRAY(rstop_words)->len;
|
48
|
+
stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
|
49
|
+
stop_words[len] = NULL;
|
50
|
+
for (i = 0; i < len; i++) {
|
51
|
+
rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
|
52
|
+
stop_words[i] = RSTRING(rstr)->ptr;
|
53
|
+
}
|
54
|
+
return stop_words;
|
55
|
+
}
|
56
|
+
|
57
|
+
/****************************************************************************
|
58
|
+
*
|
59
|
+
* token methods
|
15
60
|
*
|
16
61
|
****************************************************************************/
|
17
62
|
|
@@ -41,6 +86,31 @@ frt_token_alloc(VALUE klass)
|
|
41
86
|
return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free, ALLOC(RToken));
|
42
87
|
}
|
43
88
|
|
89
|
+
static VALUE
|
90
|
+
get_token(Token *tk)
|
91
|
+
{
|
92
|
+
RToken *token = ALLOC(RToken);
|
93
|
+
|
94
|
+
token->text = rb_str_new2(tk->text);
|
95
|
+
token->start = tk->start;
|
96
|
+
token->end = tk->end;
|
97
|
+
token->pos_inc = tk->pos_inc;
|
98
|
+
return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
|
99
|
+
}
|
100
|
+
|
101
|
+
Token *
|
102
|
+
frt_set_token(Token *tk, VALUE rt)
|
103
|
+
{
|
104
|
+
RToken *rtk;
|
105
|
+
|
106
|
+
if (rt == Qnil) return NULL;
|
107
|
+
|
108
|
+
Data_Get_Struct(rt, RToken, rtk);
|
109
|
+
tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
|
110
|
+
rtk->start, rtk->end, rtk->pos_inc);
|
111
|
+
return tk;
|
112
|
+
}
|
113
|
+
|
44
114
|
#define GET_TK RToken *token; Data_Get_Struct(self, RToken, token);
|
45
115
|
static VALUE
|
46
116
|
frt_token_init(int argc, VALUE *argv, VALUE self)
|
@@ -129,23 +199,270 @@ frt_token_to_s(VALUE self)
|
|
129
199
|
|
130
200
|
/****************************************************************************
|
131
201
|
*
|
132
|
-
*
|
202
|
+
* TokenStream Methods
|
133
203
|
*
|
134
204
|
****************************************************************************/
|
135
205
|
|
136
206
|
static void
|
137
|
-
|
207
|
+
frt_ts_mark(void *p)
|
138
208
|
{
|
139
209
|
TokenStream *ts = (TokenStream *)p;
|
140
|
-
|
210
|
+
if (ts->text) frt_gc_mark(&ts->text);
|
211
|
+
if (ts->sub_ts) frt_gc_mark(&ts->sub_ts);
|
212
|
+
}
|
213
|
+
|
214
|
+
static void
|
215
|
+
frt_ts_free(void *p)
|
216
|
+
{
|
217
|
+
TokenStream *ts = (TokenStream *)p;
|
218
|
+
if (object_get(&ts->text) != Qnil) object_del(&ts->text);
|
219
|
+
if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
|
220
|
+
object_del(ts);
|
141
221
|
ts->destroy(ts);
|
142
222
|
}
|
143
223
|
|
144
224
|
static VALUE
|
145
|
-
|
225
|
+
get_token_stream(TokenStream *ts)
|
226
|
+
{
|
227
|
+
VALUE rts = object_get(ts);
|
228
|
+
if (rts == Qnil) {
|
229
|
+
rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark, &frt_ts_free, ts);
|
230
|
+
object_add(ts, rts);
|
231
|
+
}
|
232
|
+
return rts;
|
233
|
+
}
|
234
|
+
|
235
|
+
static inline VALUE
|
236
|
+
get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
|
146
237
|
{
|
147
|
-
|
148
|
-
|
238
|
+
rstr = rb_obj_as_string(rstr);
|
239
|
+
ts->reset(ts, RSTRING(rstr)->ptr);
|
240
|
+
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
241
|
+
object_add(&ts->text, rstr);
|
242
|
+
object_add(ts, self);
|
243
|
+
return self;
|
244
|
+
}
|
245
|
+
|
246
|
+
static VALUE
|
247
|
+
frt_ts_set_text(VALUE self, VALUE rtext)
|
248
|
+
{
|
249
|
+
TokenStream *ts;
|
250
|
+
Data_Get_Struct(self, TokenStream, ts);
|
251
|
+
rtext = rb_obj_as_string(rtext);
|
252
|
+
ts->reset(ts, RSTRING(rtext)->ptr);
|
253
|
+
object_set(&ts->text, rtext);
|
254
|
+
|
255
|
+
return rtext;
|
256
|
+
}
|
257
|
+
|
258
|
+
static VALUE
|
259
|
+
frt_ts_get_text(VALUE self)
|
260
|
+
{
|
261
|
+
VALUE rtext = Qnil;
|
262
|
+
TokenStream *ts;
|
263
|
+
Data_Get_Struct(self, TokenStream, ts);
|
264
|
+
if (ts->text) {
|
265
|
+
if ((rtext = object_get(&ts->text)) == Qnil) {
|
266
|
+
rtext = rb_str_new2(ts->text);
|
267
|
+
object_set(&ts->text, rtext);
|
268
|
+
}
|
269
|
+
}
|
270
|
+
return rtext;
|
271
|
+
}
|
272
|
+
|
273
|
+
static VALUE
|
274
|
+
frt_ts_next(VALUE self)
|
275
|
+
{
|
276
|
+
TokenStream *ts;
|
277
|
+
Data_Get_Struct(self, TokenStream, ts);
|
278
|
+
Token *next = ts->next(ts);
|
279
|
+
if (next == NULL) {
|
280
|
+
return Qnil;
|
281
|
+
}
|
282
|
+
|
283
|
+
return get_token(next);
|
284
|
+
}
|
285
|
+
|
286
|
+
/****************************************************************************
|
287
|
+
* CWrappedTokenStream
|
288
|
+
****************************************************************************/
|
289
|
+
|
290
|
+
void cwrts_destroy(void *p)
|
291
|
+
{
|
292
|
+
TokenStream *ts = (TokenStream *)p;
|
293
|
+
free(ts->token);
|
294
|
+
free(ts);
|
295
|
+
}
|
296
|
+
|
297
|
+
Token *cwrts_next(TokenStream *ts)
|
298
|
+
{
|
299
|
+
VALUE rts = (VALUE)ts->data;
|
300
|
+
VALUE rtoken = rb_funcall(rts, id_next, 0);
|
301
|
+
return frt_set_token(ts->token, rtoken);
|
302
|
+
}
|
303
|
+
|
304
|
+
void cwrts_reset(TokenStream *ts, char *text)
|
305
|
+
{
|
306
|
+
VALUE rts = (VALUE)ts->data;
|
307
|
+
ts->t = ts->text = text;
|
308
|
+
rb_funcall(rts, id_reset, 1, rb_str_new2(text));
|
309
|
+
}
|
310
|
+
|
311
|
+
void cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
312
|
+
{
|
313
|
+
VALUE rorig_ts = (VALUE)orig_ts->data;
|
314
|
+
new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
|
315
|
+
}
|
316
|
+
|
317
|
+
static TokenStream *
|
318
|
+
get_cwrapped_rts(VALUE rts, bool *self_destroy)
|
319
|
+
{
|
320
|
+
TokenStream *ts;
|
321
|
+
switch (TYPE(rts)) {
|
322
|
+
case T_DATA:
|
323
|
+
Data_Get_Struct(rts, TokenStream, ts);
|
324
|
+
*self_destroy = true;
|
325
|
+
break;
|
326
|
+
default:
|
327
|
+
ts = ALLOC(TokenStream);
|
328
|
+
ts->token = ALLOC(Token);
|
329
|
+
ts->data = (void *)rts;
|
330
|
+
ts->next = &cwrts_next;
|
331
|
+
ts->reset = &cwrts_reset;
|
332
|
+
ts->clone_i = &cwrts_clone_i;
|
333
|
+
ts->destroy = &cwrts_destroy;
|
334
|
+
ts->sub_ts = NULL;
|
335
|
+
*self_destroy = false;
|
336
|
+
break;
|
337
|
+
}
|
338
|
+
return ts;
|
339
|
+
}
|
340
|
+
|
341
|
+
/****************************************************************************
|
342
|
+
* Tokenizers
|
343
|
+
****************************************************************************/
|
344
|
+
|
345
|
+
#define TS_ARGS(dflt) \
|
346
|
+
bool lower;\
|
347
|
+
VALUE rlower, rstr;\
|
348
|
+
rb_scan_args(argc, argv, "11", &rstr, &rlower);\
|
349
|
+
lower = (argc ? RTEST(rlower) : dflt)
|
350
|
+
|
351
|
+
static VALUE
|
352
|
+
frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
|
353
|
+
{
|
354
|
+
return get_wrapped_ts(self, rstr, letter_tokenizer_create());
|
355
|
+
}
|
356
|
+
|
357
|
+
static VALUE
|
358
|
+
frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
359
|
+
{
|
360
|
+
TS_ARGS(false);
|
361
|
+
return get_wrapped_ts(self, rstr, mb_letter_tokenizer_create(lower));
|
362
|
+
}
|
363
|
+
|
364
|
+
static VALUE
|
365
|
+
frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
|
366
|
+
{
|
367
|
+
return get_wrapped_ts(self, rstr, whitespace_tokenizer_create());
|
368
|
+
}
|
369
|
+
|
370
|
+
static VALUE
|
371
|
+
frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
372
|
+
{
|
373
|
+
TS_ARGS(false);
|
374
|
+
return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_create(lower));
|
375
|
+
}
|
376
|
+
|
377
|
+
static VALUE
|
378
|
+
frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
|
379
|
+
{
|
380
|
+
return get_wrapped_ts(self, rstr, standard_tokenizer_create());
|
381
|
+
}
|
382
|
+
|
383
|
+
static VALUE
|
384
|
+
frt_standard_tokenizer_init(VALUE self, VALUE rstr)
|
385
|
+
{
|
386
|
+
return get_wrapped_ts(self, rstr, mb_standard_tokenizer_create());
|
387
|
+
}
|
388
|
+
|
389
|
+
/****************************************************************************
|
390
|
+
* Filters
|
391
|
+
****************************************************************************/
|
392
|
+
|
393
|
+
|
394
|
+
static VALUE
|
395
|
+
frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
396
|
+
{
|
397
|
+
bool self_destroy;
|
398
|
+
TokenStream *ts = lowercase_filter_create(
|
399
|
+
get_cwrapped_rts(rsub_ts, &self_destroy));
|
400
|
+
ts->destroy_sub = !self_destroy;
|
401
|
+
object_add(&ts->sub_ts, rsub_ts);
|
402
|
+
|
403
|
+
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
404
|
+
object_add(ts, self);
|
405
|
+
return self;
|
406
|
+
}
|
407
|
+
|
408
|
+
static VALUE
|
409
|
+
frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
410
|
+
{
|
411
|
+
bool self_destroy;
|
412
|
+
TokenStream *ts = mb_lowercase_filter_create(
|
413
|
+
get_cwrapped_rts(rsub_ts, &self_destroy));
|
414
|
+
ts->destroy_sub = !self_destroy;
|
415
|
+
object_add(&ts->sub_ts, rsub_ts);
|
416
|
+
|
417
|
+
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
418
|
+
object_add(ts, self);
|
419
|
+
return self;
|
420
|
+
}
|
421
|
+
|
422
|
+
static VALUE
|
423
|
+
frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
424
|
+
{
|
425
|
+
VALUE rsub_ts, rstop_words;
|
426
|
+
bool self_destroy;
|
427
|
+
TokenStream *ts;
|
428
|
+
rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
|
429
|
+
if (rstop_words != Qnil) {
|
430
|
+
char **stop_words = get_stopwords(rstop_words);
|
431
|
+
ts = stop_filter_create_with_words(
|
432
|
+
get_cwrapped_rts(rsub_ts, &self_destroy), (const char **)stop_words);
|
433
|
+
free(stop_words);
|
434
|
+
} else {
|
435
|
+
ts = stop_filter_create(
|
436
|
+
get_cwrapped_rts(rsub_ts, &self_destroy));
|
437
|
+
}
|
438
|
+
ts->destroy_sub = !self_destroy;
|
439
|
+
object_add(&ts->sub_ts, rsub_ts);
|
440
|
+
|
441
|
+
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
442
|
+
object_add(ts, self);
|
443
|
+
return self;
|
444
|
+
}
|
445
|
+
|
446
|
+
static VALUE
|
447
|
+
frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
448
|
+
{
|
449
|
+
VALUE rsub_ts, ralgorithm, rcharenc;
|
450
|
+
char *algorithm = "english";
|
451
|
+
char *charenc = NULL;
|
452
|
+
bool self_destroy;
|
453
|
+
TokenStream *ts;
|
454
|
+
rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
|
455
|
+
switch (argc) {
|
456
|
+
case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
|
457
|
+
case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
|
458
|
+
}
|
459
|
+
ts = stem_filter_create(
|
460
|
+
get_cwrapped_rts(rsub_ts, &self_destroy), algorithm, charenc);
|
461
|
+
ts->destroy_sub = !self_destroy;
|
462
|
+
object_add(&ts->sub_ts, rsub_ts);
|
463
|
+
|
464
|
+
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
465
|
+
object_add(ts, self);
|
149
466
|
return self;
|
150
467
|
}
|
151
468
|
|
@@ -155,6 +472,28 @@ frt_letter_tokenizer_init(VALUE self, VALUE rstr)
|
|
155
472
|
*
|
156
473
|
****************************************************************************/
|
157
474
|
|
475
|
+
Analyzer *get_cwrapped_analyzer(ranalyzer)
|
476
|
+
{
|
477
|
+
Analyzer *a = NULL;
|
478
|
+
switch (TYPE(ranalyzer)) {
|
479
|
+
case T_DATA:
|
480
|
+
Data_Get_Struct(ranalyzer, Analyzer, a);
|
481
|
+
break;
|
482
|
+
default:
|
483
|
+
printf("Oh RFuck\n");
|
484
|
+
//ts = ALLOC(TokenStream);
|
485
|
+
//ts->token = ALLOC(Token);
|
486
|
+
//ts->data = (void *)rts;
|
487
|
+
//ts->next = &cwrts_next;
|
488
|
+
//ts->reset = &cwrts_reset;
|
489
|
+
//ts->clone_i = &cwrts_clone_i;
|
490
|
+
//ts->destroy = &cwrts_destroy;
|
491
|
+
//ts->sub_ts = NULL;
|
492
|
+
break;
|
493
|
+
}
|
494
|
+
return a;
|
495
|
+
}
|
496
|
+
|
158
497
|
static void
|
159
498
|
frt_analyzer_free(void *p)
|
160
499
|
{
|
@@ -171,11 +510,53 @@ frt_get_analyzer(Analyzer *a)
|
|
171
510
|
return self;
|
172
511
|
}
|
173
512
|
|
513
|
+
static VALUE
|
514
|
+
frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
515
|
+
{
|
516
|
+
Analyzer *a = ((struct RData *)(self))->data;
|
517
|
+
rfield = rb_obj_as_string(rfield);
|
518
|
+
rstring = rb_obj_as_string(rstring);
|
519
|
+
|
520
|
+
TokenStream *ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
|
521
|
+
|
522
|
+
object_set(&ts->text, rstring); // Make sure that there is no entry already
|
523
|
+
return get_token_stream(ts);
|
524
|
+
}
|
525
|
+
|
526
|
+
/*
 * GET_LOWER(dflt)
 *
 * For use at the top of a variable-arity initializer that has +argc+ and
 * +argv+ in scope. Declares the locals +lower+ (bool) and +rlower+ (VALUE),
 * scans zero or one optional argument into +rlower+, and sets +lower+ to
 * RTEST(rlower) when any argument was passed, or to +dflt+ otherwise.
 *
 * The macro expands to declarations plus statements, so it cannot be wrapped
 * in do { } while (0); invoke it as `GET_LOWER(x);`.
 */
#define GET_LOWER(dflt) \
    bool lower;\
    VALUE rlower;\
    rb_scan_args(argc, argv, "01", &rlower);\
    lower = (argc ? RTEST(rlower) : (dflt))
|
531
|
+
|
532
|
+
/*** AsciiWhiteSpaceAnalyzer ***/
|
533
|
+
static VALUE
|
534
|
+
frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
535
|
+
{
|
536
|
+
GET_LOWER(false);
|
537
|
+
Analyzer *a = whitespace_analyzer_create(lower);
|
538
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
539
|
+
object_add(a, self);
|
540
|
+
return self;
|
541
|
+
}
|
542
|
+
|
174
543
|
/*** WhiteSpaceAnalyzer ***/
|
175
544
|
static VALUE
|
176
|
-
frt_white_space_analyzer_init(VALUE self)
|
545
|
+
frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
546
|
+
{
|
547
|
+
GET_LOWER(false);
|
548
|
+
Analyzer *a = mb_whitespace_analyzer_create(lower);
|
549
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
550
|
+
object_add(a, self);
|
551
|
+
return self;
|
552
|
+
}
|
553
|
+
|
554
|
+
/*** AsciiLetterAnalyzer ***/
|
555
|
+
static VALUE
|
556
|
+
frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
177
557
|
{
|
178
|
-
|
558
|
+
GET_LOWER(true);
|
559
|
+
Analyzer *a = letter_analyzer_create(lower);
|
179
560
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
180
561
|
object_add(a, self);
|
181
562
|
return self;
|
@@ -183,9 +564,44 @@ frt_white_space_analyzer_init(VALUE self)
|
|
183
564
|
|
184
565
|
/*** LetterAnalyzer ***/
|
185
566
|
static VALUE
|
186
|
-
frt_letter_analyzer_init(VALUE self)
|
567
|
+
frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
187
568
|
{
|
188
|
-
|
569
|
+
GET_LOWER(true);
|
570
|
+
Analyzer *a = mb_letter_analyzer_create(lower);
|
571
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
572
|
+
object_add(a, self);
|
573
|
+
return self;
|
574
|
+
}
|
575
|
+
|
576
|
+
static VALUE
|
577
|
+
get_rstopwords(const char **stop_words)
|
578
|
+
{
|
579
|
+
char **w = (char **)stop_words;
|
580
|
+
VALUE rstopwords = rb_ary_new();
|
581
|
+
|
582
|
+
while (*w) {
|
583
|
+
rb_ary_push(rstopwords, rb_str_new2(*w));
|
584
|
+
w++;
|
585
|
+
}
|
586
|
+
return rstopwords;
|
587
|
+
}
|
588
|
+
|
589
|
+
/*** AsciiStandardAnalyzer ***/
|
590
|
+
static VALUE
|
591
|
+
frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
592
|
+
{
|
593
|
+
bool lower;
|
594
|
+
VALUE rlower, rstop_words;
|
595
|
+
Analyzer *a;
|
596
|
+
rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
|
597
|
+
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
598
|
+
if (rstop_words != Qnil) {
|
599
|
+
char **stop_words = get_stopwords(rstop_words);
|
600
|
+
a = standard_analyzer_create_with_words((const char **)stop_words, lower);
|
601
|
+
free(stop_words);
|
602
|
+
} else {
|
603
|
+
a = standard_analyzer_create(lower);
|
604
|
+
}
|
189
605
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
190
606
|
object_add(a, self);
|
191
607
|
return self;
|
@@ -193,14 +609,98 @@ frt_letter_analyzer_init(VALUE self)
|
|
193
609
|
|
194
610
|
/*** StandardAnalyzer ***/
|
195
611
|
static VALUE
|
196
|
-
frt_standard_analyzer_init(VALUE self)
|
612
|
+
frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
197
613
|
{
|
198
|
-
|
614
|
+
bool lower;
|
615
|
+
VALUE rlower, rstop_words;
|
616
|
+
Analyzer *a;
|
617
|
+
rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
|
618
|
+
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
619
|
+
if (rstop_words != Qnil) {
|
620
|
+
char **stop_words = get_stopwords(rstop_words);
|
621
|
+
a = mb_standard_analyzer_create_with_words((const char **)stop_words, lower);
|
622
|
+
free(stop_words);
|
623
|
+
} else {
|
624
|
+
a = mb_standard_analyzer_create(lower);
|
625
|
+
}
|
199
626
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
200
627
|
object_add(a, self);
|
201
628
|
return self;
|
202
629
|
}
|
203
630
|
|
631
|
+
/*** PerFieldAnalyzer ***/
|
632
|
+
static VALUE
|
633
|
+
frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
|
634
|
+
{
|
635
|
+
Analyzer *def = get_cwrapped_analyzer(ranalyzer);
|
636
|
+
Analyzer *a = per_field_analyzer_create(def, false);
|
637
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
638
|
+
object_add(a, self);
|
639
|
+
return self;
|
640
|
+
}
|
641
|
+
|
642
|
+
static VALUE
|
643
|
+
frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
644
|
+
{
|
645
|
+
Analyzer *pfa, *a;
|
646
|
+
Data_Get_Struct(self, Analyzer, pfa);
|
647
|
+
Data_Get_Struct(ranalyzer, Analyzer, a);
|
648
|
+
|
649
|
+
pfa_add_field(pfa, StringValuePtr(rfield), a);
|
650
|
+
return self;
|
651
|
+
}
|
652
|
+
|
653
|
+
|
654
|
+
/** RegexAnalyzer **/
|
655
|
+
/*
|
656
|
+
static VALUE
|
657
|
+
frt_regex_analyzer_init(VALUE self)
|
658
|
+
{
|
659
|
+
Analyzer *a = regex_analyzer_create();
|
660
|
+
// no idea why the macro is used here instead of Data_Wrap_Struct:
|
661
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
662
|
+
// what is this for?:
|
663
|
+
object_add(a, self);
|
664
|
+
return self;
|
665
|
+
}
|
666
|
+
|
667
|
+
// convenience method
|
668
|
+
// XXX this sets the locale for the entire program
|
669
|
+
static VALUE
|
670
|
+
frt_regex_analyzer_token_stream(VALUE self, VALUE field, VALUE string)
|
671
|
+
{
|
672
|
+
Analyzer *a =((struct RData *)(self))->data;
|
673
|
+
TokenStream *ts = a->get_ts( a, StringValuePtr(field), StringValuePtr(string) );
|
674
|
+
// already freed via analyzer's free()
|
675
|
+
VALUE token_stream = Data_Wrap_Struct(cTokenStream, NULL, NULL, ts);
|
676
|
+
return token_stream;
|
677
|
+
}
|
678
|
+
*/
|
679
|
+
/** /RegexAnalyzer **/
|
680
|
+
|
681
|
+
/** TokenStream **/
|
682
|
+
/** /TokenStream **/
|
683
|
+
|
684
|
+
/****************************************************************************
|
685
|
+
*
|
686
|
+
* Locale stuff
|
687
|
+
*
|
688
|
+
****************************************************************************/
|
689
|
+
|
690
|
+
static char *frt_locale = NULL;
|
691
|
+
|
692
|
+
/*
 * Ferret.locale
 *
 * Returns the locale string last stored in frt_locale as a new Ruby String,
 * or nil when frt_locale is NULL.
 *
 * The method is registered with zero arguments, so the function takes only
 * the receiver.
 */
static VALUE frt_getlocale(VALUE self)
{
    return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
}
|
696
|
+
|
697
|
+
/*
 * Ferret.locale = locale
 *
 * Calls setlocale(LC_ALL, ...) with the string form of +locale+, or with
 * NULL when +locale+ is nil, and stores the result in frt_locale.
 *
 * Returns the resulting locale name as a new Ruby String, or nil when
 * setlocale() returned NULL.
 */
static VALUE frt_setlocale(VALUE self, VALUE locale)
{
    char *name = NULL;

    if (locale != Qnil) {
        name = RSTRING(rb_obj_as_string(locale))->ptr;
    }

    frt_locale = setlocale(LC_ALL, name);
    if (frt_locale == NULL) {
        return Qnil;
    }
    return rb_str_new2(frt_locale);
}
|
703
|
+
|
204
704
|
/****************************************************************************
|
205
705
|
*
|
206
706
|
* Init Function
|
@@ -210,6 +710,18 @@ frt_standard_analyzer_init(VALUE self)
|
|
210
710
|
void
|
211
711
|
Init_analysis(void)
|
212
712
|
{
|
713
|
+
id_next = rb_intern("next");
|
714
|
+
id_reset = rb_intern("text=");
|
715
|
+
id_clone = rb_intern("clone");
|
716
|
+
|
717
|
+
/*** * * Locale stuff * * ***/
|
718
|
+
frt_locale = setlocale(LC_ALL, "");
|
719
|
+
rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
|
720
|
+
rb_define_singleton_method(mFerret, "locale", frt_getlocale, 0);
|
721
|
+
|
722
|
+
/*********************/
|
723
|
+
/*** * * Token * * ***/
|
724
|
+
/*********************/
|
213
725
|
cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
|
214
726
|
rb_define_alloc_func(cToken, frt_token_alloc);
|
215
727
|
rb_include_module(cToken, rb_mComparable);
|
@@ -223,33 +735,193 @@ Init_analysis(void)
|
|
223
735
|
rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
|
224
736
|
rb_define_method(cToken, "to_s", frt_token_to_s, 0);
|
225
737
|
|
738
|
+
/****************************/
|
739
|
+
/*** * * TokenStreams * * ***/
|
740
|
+
/****************************/
|
741
|
+
|
742
|
+
cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
|
743
|
+
rb_define_method(cTokenStream, "next", frt_ts_next, 0);
|
744
|
+
rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
|
745
|
+
rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
|
746
|
+
|
747
|
+
/******************/
|
748
|
+
/*** Tokenizers ***/
|
749
|
+
/******************/
|
750
|
+
|
751
|
+
/*** * * AsciiLetterTokenizer * * ***/
|
752
|
+
cAsciiLetterTokenizer =
|
753
|
+
rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
|
754
|
+
rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
|
755
|
+
rb_define_method(cAsciiLetterTokenizer, "initialize",
|
756
|
+
frt_a_letter_tokenizer_init, 1);
|
757
|
+
|
758
|
+
/*** * * LetterTokenizer * * ***/
|
226
759
|
cLetterTokenizer =
|
227
|
-
rb_define_class_under(mAnalysis, "LetterTokenizer",
|
760
|
+
rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
|
228
761
|
rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
|
229
762
|
rb_define_method(cLetterTokenizer, "initialize",
|
230
|
-
frt_letter_tokenizer_init, 1);
|
763
|
+
frt_letter_tokenizer_init, -1);
|
764
|
+
|
765
|
+
/*** * * AsciiWhiteSpaceTokenizer * * ***/
|
766
|
+
cAsciiWhiteSpaceTokenizer =
|
767
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer", cTokenStream);
|
768
|
+
rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
|
769
|
+
rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
|
770
|
+
frt_a_whitespace_tokenizer_init, 1);
|
771
|
+
|
772
|
+
/*** * * WhiteSpaceTokenizer * * ***/
|
773
|
+
cWhiteSpaceTokenizer =
|
774
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
|
775
|
+
rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
|
776
|
+
rb_define_method(cWhiteSpaceTokenizer, "initialize",
|
777
|
+
frt_whitespace_tokenizer_init, -1);
|
778
|
+
|
779
|
+
/*** * * AsciiStandardTokenizer * * ***/
|
780
|
+
cAsciiStandardTokenizer =
|
781
|
+
rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
|
782
|
+
rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
|
783
|
+
rb_define_method(cAsciiStandardTokenizer, "initialize",
|
784
|
+
frt_a_standard_tokenizer_init, 1);
|
231
785
|
|
786
|
+
/*** * * StandardTokenizer * * ***/
|
787
|
+
cStandardTokenizer =
|
788
|
+
rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
|
789
|
+
rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
|
790
|
+
rb_define_method(cStandardTokenizer, "initialize",
|
791
|
+
frt_standard_tokenizer_init, 1);
|
792
|
+
|
793
|
+
/***************/
|
794
|
+
/*** Filters ***/
|
795
|
+
/***************/
|
796
|
+
rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
|
797
|
+
get_rstopwords(ENGLISH_STOP_WORDS));
|
798
|
+
rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
|
799
|
+
get_rstopwords(FULL_ENGLISH_STOP_WORDS));
|
800
|
+
rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
|
801
|
+
get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
|
802
|
+
rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
|
803
|
+
get_rstopwords(FULL_FRENCH_STOP_WORDS));
|
804
|
+
rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
|
805
|
+
get_rstopwords(FULL_SPANISH_STOP_WORDS));
|
806
|
+
rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
|
807
|
+
get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
|
808
|
+
rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
|
809
|
+
get_rstopwords(FULL_ITALIAN_STOP_WORDS));
|
810
|
+
rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
|
811
|
+
get_rstopwords(FULL_GERMAN_STOP_WORDS));
|
812
|
+
rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
|
813
|
+
get_rstopwords(FULL_DUTCH_STOP_WORDS));
|
814
|
+
rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
|
815
|
+
get_rstopwords(FULL_SWEDISH_STOP_WORDS));
|
816
|
+
rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
|
817
|
+
get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
|
818
|
+
rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
|
819
|
+
get_rstopwords(FULL_DANISH_STOP_WORDS));
|
820
|
+
rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
|
821
|
+
get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
|
822
|
+
rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
|
823
|
+
get_rstopwords(FULL_FINNISH_STOP_WORDS));
|
824
|
+
|
825
|
+
cAsciiLowerCaseFilter =
|
826
|
+
rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
|
827
|
+
rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
|
828
|
+
rb_define_method(cAsciiLowerCaseFilter, "initialize",
|
829
|
+
frt_a_lowercase_filter_init, 1);
|
830
|
+
|
831
|
+
cLowerCaseFilter =
|
832
|
+
rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
|
833
|
+
rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
|
834
|
+
rb_define_method(cLowerCaseFilter, "initialize",
|
835
|
+
frt_lowercase_filter_init, 1);
|
836
|
+
|
837
|
+
cStopFilter =
|
838
|
+
rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
|
839
|
+
rb_define_alloc_func(cStopFilter, frt_data_alloc);
|
840
|
+
rb_define_method(cStopFilter, "initialize",
|
841
|
+
frt_stop_filter_init, -1);
|
842
|
+
|
843
|
+
cStemFilter =
|
844
|
+
rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
|
845
|
+
rb_define_alloc_func(cStemFilter, frt_data_alloc);
|
846
|
+
rb_define_method(cStemFilter, "initialize",
|
847
|
+
frt_stem_filter_init, -1);
|
848
|
+
|
849
|
+
|
850
|
+
/*************************/
|
851
|
+
/*** * * Analyzers * * ***/
|
852
|
+
/*************************/
|
853
|
+
|
854
|
+
/*** * * Analyzer * * ***/
|
232
855
|
cAnalyzer =
|
233
856
|
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
234
857
|
rb_define_alloc_func(cAnalyzer, frt_data_alloc);
|
235
|
-
rb_define_method(cAnalyzer, "initialize",
|
236
|
-
|
858
|
+
rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
|
859
|
+
rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
|
860
|
+
|
861
|
+
/*** * * AsciiLetterAnalyzer * * ***/
|
862
|
+
cAsciiLetterAnalyzer =
|
863
|
+
rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
|
864
|
+
rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
|
865
|
+
rb_define_method(cAsciiLetterAnalyzer, "initialize",
|
866
|
+
frt_a_letter_analyzer_init, -1);
|
237
867
|
|
868
|
+
/*** * * LetterAnalyzer * * ***/
|
238
869
|
cLetterAnalyzer =
|
239
870
|
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
240
871
|
rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
|
241
|
-
rb_define_method(
|
242
|
-
|
872
|
+
rb_define_method(cLetterAnalyzer, "initialize",
|
873
|
+
frt_letter_analyzer_init, -1);
|
243
874
|
|
875
|
+
/*** * * AsciiWhiteSpaceAnalyzer * * ***/
|
876
|
+
cAsciiWhiteSpaceAnalyzer =
|
877
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
|
878
|
+
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
|
879
|
+
rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
|
880
|
+
frt_a_white_space_analyzer_init, -1);
|
881
|
+
|
882
|
+
/*** * * WhiteSpaceAnalyzer * * ***/
|
244
883
|
cWhiteSpaceAnalyzer =
|
245
884
|
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
246
885
|
rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
|
247
886
|
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
248
|
-
frt_white_space_analyzer_init,
|
887
|
+
frt_white_space_analyzer_init, -1);
|
888
|
+
|
889
|
+
/*** * * AsciiStandardAnalyzer * * ***/
|
890
|
+
cAsciiStandardAnalyzer =
|
891
|
+
rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
|
892
|
+
rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
|
893
|
+
rb_define_method(cAsciiStandardAnalyzer, "initialize",
|
894
|
+
frt_a_standard_analyzer_init, -1);
|
249
895
|
|
896
|
+
/*** * * StandardAnalyzer * * ***/
|
250
897
|
cStandardAnalyzer =
|
251
898
|
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
252
899
|
rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
|
253
900
|
rb_define_method(cStandardAnalyzer, "initialize",
|
254
|
-
frt_standard_analyzer_init,
|
901
|
+
frt_standard_analyzer_init, -1);
|
902
|
+
|
903
|
+
/*** * * PerFieldAnalyzer * * ***/
|
904
|
+
cPerFieldAnalyzer =
|
905
|
+
rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
|
906
|
+
rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
|
907
|
+
rb_define_method(cPerFieldAnalyzer, "initialize",
|
908
|
+
frt_per_field_analyzer_init, 1);
|
909
|
+
rb_define_method(cPerFieldAnalyzer, "add_field",
|
910
|
+
frt_per_field_analyzer_add_field, 2);
|
911
|
+
rb_define_method(cPerFieldAnalyzer, "[]=",
|
912
|
+
frt_per_field_analyzer_add_field, 2);
|
913
|
+
|
914
|
+
/** RegexAnalyzer **/
|
915
|
+
/*
|
916
|
+
cRegexAnalyzer =
|
917
|
+
rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
|
918
|
+
rb_define_alloc_func(cRegexAnalyzer, frt_data_alloc);
|
919
|
+
rb_define_method(cRegexAnalyzer, "initialize",
|
920
|
+
frt_regex_analyzer_init, 0);
|
921
|
+
rb_define_method(cRegexAnalyzer, "token_stream",
|
922
|
+
frt_regex_analyzer_token_stream, 2);
|
923
|
+
rb_define_method(cRegexAnalyzer, "setlocale",
|
924
|
+
frt_regex_analyzer_setlocale, 1);
|
925
|
+
*/
|
926
|
+
|
255
927
|
}
|