ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_wildcard.c
CHANGED
@@ -7,152 +7,165 @@
|
|
7
7
|
*
|
8
8
|
****************************************************************************/
|
9
9
|
|
10
|
-
|
11
|
-
{
|
12
|
-
char *buffer, *bptr;
|
13
|
-
Term *term = (Term *)self->data;
|
14
|
-
size_t tlen = strlen(term->text);
|
15
|
-
size_t flen = strlen(term->field);
|
16
|
-
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
17
|
-
|
18
|
-
if (strcmp(term->field, field) != 0) {
|
19
|
-
sprintf(bptr, "%s:", term->field);
|
20
|
-
bptr += strlen(term->field) + 1;
|
21
|
-
}
|
22
|
-
sprintf(bptr, "%s", term->text);
|
23
|
-
bptr = buffer + strlen(buffer);
|
24
|
-
if (self->boost != 1.0) {
|
25
|
-
*bptr = '^';
|
26
|
-
dbl_to_s(++bptr, self->boost);
|
27
|
-
}
|
28
|
-
|
29
|
-
return buffer;
|
30
|
-
}
|
10
|
+
#define WCQ(query) ((WildCardQuery *)(query))
|
31
11
|
|
32
|
-
|
12
|
+
static char *wcq_to_s(Query *self, const char *current_field)
|
33
13
|
{
|
34
|
-
|
14
|
+
char *buffer, *bptr;
|
15
|
+
const char *field = WCQ(self)->field;
|
16
|
+
const char *pattern = WCQ(self)->pattern;
|
17
|
+
size_t flen = strlen(field);
|
18
|
+
size_t plen = strlen(pattern);
|
19
|
+
bptr = buffer = ALLOC_N(char, plen + flen + 35);
|
20
|
+
|
21
|
+
if (strcmp(field, current_field) != 0) {
|
22
|
+
sprintf(bptr, "%s:", field);
|
23
|
+
bptr += flen + 1;
|
24
|
+
}
|
25
|
+
sprintf(bptr, "%s", pattern);
|
26
|
+
bptr += plen;
|
35
27
|
|
36
|
-
|
37
|
-
|
28
|
+
if (self->boost != 1.0) {
|
29
|
+
*bptr = '^';
|
30
|
+
dbl_to_s(++bptr, self->boost);
|
31
|
+
}
|
38
32
|
|
39
|
-
|
33
|
+
return buffer;
|
34
|
+
}
|
40
35
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
36
|
+
bool wc_match(const char *pattern, const char *text)
|
37
|
+
{
|
38
|
+
const char *p = pattern, *t = text, *xt;
|
39
|
+
|
40
|
+
/* include '\0' as we need to match empty string */
|
41
|
+
const char *text_last = t + strlen(t);
|
42
|
+
|
43
|
+
for (;; p++, t++) {
|
44
|
+
|
45
|
+
/* end of text so make sure end of pattern doesn't matter */
|
46
|
+
if (*t == '\0') {
|
47
|
+
while (*p) {
|
48
|
+
if (*p != WILD_STRING) {
|
49
|
+
return false;
|
50
|
+
}
|
51
|
+
p++;
|
52
|
+
}
|
53
|
+
return true;
|
54
|
+
}
|
49
55
|
|
50
|
-
|
51
|
-
|
56
|
+
/* If we've gone past the end of the pattern, return false. */
|
57
|
+
if (*p == '\0') {
|
58
|
+
return false;
|
59
|
+
}
|
52
60
|
|
53
|
-
|
54
|
-
|
61
|
+
/* Match a single character, so continue. */
|
62
|
+
if (*p == WILD_CHAR) {
|
63
|
+
continue;
|
64
|
+
}
|
55
65
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
66
|
+
if (*p == WILD_STRING) {
|
67
|
+
/* Look at the character beyond the '*'. */
|
68
|
+
p++;
|
69
|
+
/* Examine the string, starting at the last character. */
|
70
|
+
for (xt = text_last; xt >= t; xt--) {
|
71
|
+
if (wc_match(p, xt)) return true;
|
72
|
+
}
|
73
|
+
return false;
|
74
|
+
}
|
75
|
+
if (*p != *t) {
|
76
|
+
return false;
|
77
|
+
}
|
64
78
|
}
|
65
|
-
if (*p != *t)
|
66
|
-
return false;
|
67
|
-
}
|
68
79
|
|
69
|
-
|
80
|
+
return false;
|
70
81
|
}
|
71
82
|
|
72
|
-
Query *wcq_rewrite(Query *self, IndexReader *ir)
|
83
|
+
static Query *wcq_rewrite(Query *self, IndexReader *ir)
|
73
84
|
{
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
if (first_star == NULL && first_ques == NULL) {
|
84
|
-
q = tq_create(term_clone(term));
|
85
|
-
} else {
|
86
|
-
TermEnum *te;
|
87
|
-
Term prefix_term;
|
88
|
-
char *prefix = NULL;
|
89
|
-
|
90
|
-
char *pattern = (first_ques && (!first_star || (first_star > first_ques)))
|
91
|
-
? first_ques : first_star;
|
92
|
-
|
93
|
-
int prefix_len = (int)(pattern - text);
|
94
|
-
|
95
|
-
prefix_term.field = field;
|
96
|
-
prefix_term.text = (char *)EMPTY_STRING;
|
97
|
-
if (prefix_len > 0) {
|
98
|
-
prefix = ALLOC_N(char, prefix_len + 1);
|
99
|
-
strncpy(prefix, text, prefix_len);
|
100
|
-
prefix_term.text = prefix;
|
101
|
-
prefix_term.text[prefix_len] = '\0';
|
85
|
+
Query *q;
|
86
|
+
const char *field = WCQ(self)->field;
|
87
|
+
const char *pattern = WCQ(self)->pattern;
|
88
|
+
const char *first_star = strchr(pattern, WILD_STRING);
|
89
|
+
const char *first_ques = strchr(pattern, WILD_CHAR);
|
90
|
+
|
91
|
+
if (NULL == first_star && NULL == first_ques) {
|
92
|
+
q = tq_new(field, pattern);
|
93
|
+
q->boost = self->boost;
|
102
94
|
}
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
95
|
+
else {
|
96
|
+
const int field_num = fis_get_field_num(ir->fis, field);
|
97
|
+
q = multi_tq_new_conf(field, MTQMaxTerms(self), 0.0);
|
98
|
+
|
99
|
+
if (field_num >= 0) {
|
100
|
+
TermEnum *te;
|
101
|
+
char prefix[MAX_WORD_SIZE] = "";
|
102
|
+
int prefix_len;
|
103
|
+
|
104
|
+
pattern = (first_ques && (!first_star || first_star > first_ques))
|
105
|
+
? first_ques : first_star;
|
106
|
+
|
107
|
+
prefix_len = (int)(pattern - WCQ(self)->pattern);
|
108
|
+
|
109
|
+
if (prefix_len > 0) {
|
110
|
+
memcpy(prefix, WCQ(self)->pattern, prefix_len);
|
111
|
+
prefix[prefix_len] = '\0';
|
112
|
+
}
|
113
|
+
|
114
|
+
te = ir->terms_from(ir, field_num, prefix);
|
115
|
+
|
116
|
+
if (te != NULL) {
|
117
|
+
const char *term = te->curr_term;
|
118
|
+
const char *pat_term = term + prefix_len;
|
119
|
+
do {
|
120
|
+
if (prefix && strncmp(term, prefix, prefix_len) != 0) {
|
121
|
+
break;
|
122
|
+
}
|
123
|
+
|
124
|
+
if (wc_match(pattern, pat_term)) {
|
125
|
+
multi_tq_add_term(q, term);
|
126
|
+
}
|
127
|
+
} while (te->next(te) != NULL);
|
128
|
+
te->close(te);
|
129
|
+
}
|
117
130
|
}
|
118
|
-
} while ((tb = te->next(te)) != NULL);
|
119
|
-
te->close(te);
|
120
131
|
}
|
121
|
-
free(prefix);
|
122
|
-
}
|
123
132
|
|
124
|
-
|
133
|
+
return q;
|
125
134
|
}
|
126
135
|
|
127
136
|
static void wcq_destroy(Query *self)
|
128
137
|
{
|
129
|
-
|
130
|
-
|
138
|
+
free(WCQ(self)->field);
|
139
|
+
free(WCQ(self)->pattern);
|
140
|
+
q_destroy_i(self);
|
131
141
|
}
|
132
142
|
|
133
|
-
static
|
143
|
+
static ulong wcq_hash(Query *self)
|
134
144
|
{
|
135
|
-
|
145
|
+
return str_hash(WCQ(self)->field) ^ str_hash(WCQ(self)->pattern);
|
136
146
|
}
|
137
147
|
|
138
148
|
static int wcq_eq(Query *self, Query *o)
|
139
149
|
{
|
140
|
-
|
150
|
+
return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
|
151
|
+
&& (strcmp(WCQ(self)->field, WCQ(o)->field) == 0);
|
141
152
|
}
|
142
153
|
|
143
|
-
Query *
|
154
|
+
Query *wcq_new(const char *field, const char *pattern)
|
144
155
|
{
|
145
|
-
|
156
|
+
Query *self = q_new(WildCardQuery);
|
146
157
|
|
147
|
-
|
158
|
+
WCQ(self)->field = estrdup(field);
|
159
|
+
WCQ(self)->pattern = estrdup(pattern);
|
160
|
+
MTQMaxTerms(self) = WILD_CARD_QUERY_MAX_TERMS;
|
148
161
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
162
|
+
self->type = WILD_CARD_QUERY;
|
163
|
+
self->rewrite = &wcq_rewrite;
|
164
|
+
self->to_s = &wcq_to_s;
|
165
|
+
self->hash = &wcq_hash;
|
166
|
+
self->eq = &wcq_eq;
|
167
|
+
self->destroy_i = &wcq_destroy;
|
168
|
+
self->create_weight_i = &q_create_weight_unsup;
|
156
169
|
|
157
|
-
|
170
|
+
return self;
|
158
171
|
}
|
data/ext/r_analysis.c
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
#include <regex.h>
|
2
|
+
#include <locale.h>
|
3
|
+
#include <st.h>
|
2
4
|
#include "ferret.h"
|
3
5
|
#include "analysis.h"
|
4
|
-
|
6
|
+
|
7
|
+
static VALUE mAnalysis;
|
5
8
|
|
6
9
|
static VALUE cToken;
|
7
10
|
static VALUE cAsciiLetterTokenizer;
|
@@ -27,7 +30,6 @@ static VALUE cStandardAnalyzer;
|
|
27
30
|
static VALUE cPerFieldAnalyzer;
|
28
31
|
static VALUE cRegExpAnalyzer;
|
29
32
|
|
30
|
-
//static VALUE cRegexAnalyzer;
|
31
33
|
static VALUE cTokenStream;
|
32
34
|
|
33
35
|
/* TokenStream Methods */
|
@@ -40,9 +42,16 @@ static ID id_token_stream;
|
|
40
42
|
|
41
43
|
static VALUE object_space;
|
42
44
|
|
43
|
-
extern
|
44
|
-
|
45
|
-
|
45
|
+
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
|
46
|
+
int, struct re_registers *);
|
47
|
+
|
48
|
+
/*
|
49
|
+
static int
|
50
|
+
frt_rb_hash_size(VALUE hash)
|
51
|
+
{
|
52
|
+
return RHASH(hash)->tbl->num_entries;
|
53
|
+
}
|
54
|
+
*/
|
46
55
|
|
47
56
|
/****************************************************************************
|
48
57
|
*
|
@@ -53,18 +62,18 @@ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, in
|
|
53
62
|
static char **
|
54
63
|
get_stopwords(VALUE rstop_words)
|
55
64
|
{
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
65
|
+
char **stop_words;
|
66
|
+
int i, len;
|
67
|
+
VALUE rstr;
|
68
|
+
Check_Type(rstop_words, T_ARRAY);
|
69
|
+
len = RARRAY(rstop_words)->len;
|
70
|
+
stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
|
71
|
+
stop_words[len] = NULL;
|
72
|
+
for (i = 0; i < len; i++) {
|
73
|
+
rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
|
74
|
+
stop_words[i] = RSTRING(rstr)->ptr;
|
75
|
+
}
|
76
|
+
return stop_words;
|
68
77
|
}
|
69
78
|
|
70
79
|
/****************************************************************************
|
@@ -74,140 +83,295 @@ get_stopwords(VALUE rstop_words)
|
|
74
83
|
****************************************************************************/
|
75
84
|
|
76
85
|
typedef struct RToken {
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
86
|
+
VALUE text;
|
87
|
+
int start;
|
88
|
+
int end;
|
89
|
+
int pos_inc;
|
81
90
|
} RToken;
|
82
91
|
|
83
92
|
static void
|
84
93
|
frt_token_free(void *p)
|
85
94
|
{
|
86
|
-
|
95
|
+
free(p);
|
87
96
|
}
|
88
|
-
|
97
|
+
|
89
98
|
static void
|
90
99
|
frt_token_mark(void *p)
|
91
100
|
{
|
92
|
-
|
93
|
-
|
101
|
+
RToken *token = (RToken *)p;
|
102
|
+
rb_gc_mark(token->text);
|
94
103
|
}
|
95
104
|
|
96
105
|
static VALUE
|
97
106
|
frt_token_alloc(VALUE klass)
|
98
107
|
{
|
99
|
-
|
108
|
+
return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
|
109
|
+
ALLOC(RToken));
|
100
110
|
}
|
101
111
|
|
102
112
|
static VALUE
|
103
113
|
get_token(Token *tk)
|
104
114
|
{
|
105
|
-
|
115
|
+
RToken *token = ALLOC(RToken);
|
106
116
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
117
|
+
token->text = rb_str_new2(tk->text);
|
118
|
+
token->start = tk->start;
|
119
|
+
token->end = tk->end;
|
120
|
+
token->pos_inc = tk->pos_inc;
|
121
|
+
return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
|
112
122
|
}
|
113
123
|
|
114
124
|
Token *
|
115
125
|
frt_set_token(Token *tk, VALUE rt)
|
116
126
|
{
|
117
|
-
|
127
|
+
RToken *rtk;
|
118
128
|
|
119
|
-
|
129
|
+
if (rt == Qnil) return NULL;
|
120
130
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
131
|
+
Data_Get_Struct(rt, RToken, rtk);
|
132
|
+
tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
|
133
|
+
rtk->start, rtk->end, rtk->pos_inc);
|
134
|
+
return tk;
|
125
135
|
}
|
126
136
|
|
127
|
-
#define GET_TK
|
137
|
+
#define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
|
138
|
+
|
139
|
+
/*
|
140
|
+
* call-seq:
|
141
|
+
* Token.new(text, start, end, pos_inc = 1) -> new Token
|
142
|
+
*
|
143
|
+
* Creates a new token setting the text, start and end offsets of the token
|
144
|
+
* and the position increment for the token.
|
145
|
+
*
|
146
|
+
* The position increment is usually set to 1 but you can set it to other
|
147
|
+
* values as needed. For example, if you have a stop word filter you will be
|
148
|
+
* skipping tokens. Let's say you have the stop words "the" and "and" and you
|
149
|
+
* parse the title "The Old Man and the Sea". The terms "Old", "Man" and
|
150
|
+
* "Sea" will have the position increments 2, 1 and 3 respectively.
|
151
|
+
*
|
152
|
+
* Another reason you might want to vary the position increment is if you are
|
153
|
+
* adding synonyms to the index. For example let's say you have the synonym
|
154
|
+
* group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
|
155
|
+
* speedy delivery", you'll add "speedy" first with a position increment of 1
|
156
|
+
* and then "fast" and "quick" with position increments of 0 since they are
|
157
|
+
* represented in the same position.
|
158
|
+
*
|
159
|
+
* The offset values +start+ and +end+ should be byte offsets, not
|
160
|
+
* character offsets. This makes it easy to use those offsets to quickly
|
161
|
+
* access the token in the input string and also to insert highlighting tags
|
162
|
+
* when necessary.
|
163
|
+
*
|
164
|
+
* text:: the main text for the token.
|
165
|
+
* start:: the start offset of the token in bytes.
|
166
|
+
* end:: the end offset of the token in bytes.
|
167
|
+
* pos_inc:: the position increment of a token. See above.
|
168
|
+
* return:: a newly created and assigned Token object
|
169
|
+
*/
|
128
170
|
static VALUE
|
129
171
|
frt_token_init(int argc, VALUE *argv, VALUE self)
|
130
172
|
{
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
173
|
+
RToken *token;
|
174
|
+
VALUE rtext, rstart, rend, rpos_inc, rtype;
|
175
|
+
GET_TK(token, self);
|
176
|
+
token->pos_inc = 1;
|
177
|
+
switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
|
178
|
+
&rend, &rpos_inc, &rtype)) {
|
179
|
+
case 5: /* type gets ignored at this stage */
|
180
|
+
case 4: token->pos_inc = FIX2INT(rpos_inc);
|
181
|
+
}
|
182
|
+
token->text = rb_obj_as_string(rtext);
|
183
|
+
token->start = FIX2INT(rstart);
|
184
|
+
token->end = FIX2INT(rend);
|
185
|
+
return self;
|
142
186
|
}
|
143
187
|
|
188
|
+
/*
|
189
|
+
* call-seq:
|
190
|
+
* token.cmp(other_token) -> integer
|
191
|
+
*
|
192
|
+
* Used to compare two tokens. Token is extended by Comparable so you can
|
193
|
+
* also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
|
194
|
+
*
|
195
|
+
* Tokens are sorted by the position in the text at which they occur, ie
|
196
|
+
* the start offset. If two tokens have the same start offset (see
|
197
|
+
* pos_inc=), then they are sorted by the end offset and then
|
198
|
+
* lexically by the token text.
|
199
|
+
*/
|
144
200
|
static VALUE
|
145
201
|
frt_token_cmp(VALUE self, VALUE rother)
|
146
202
|
{
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
} else {
|
156
|
-
if (token->end > other->end) {
|
157
|
-
cmp = 1;
|
158
|
-
} else if (token->end < other->end) {
|
159
|
-
cmp = -1;
|
203
|
+
RToken *token, *other;
|
204
|
+
int cmp;
|
205
|
+
GET_TK(token, self);
|
206
|
+
GET_TK(other, rother);
|
207
|
+
if (token->start > other->start) {
|
208
|
+
cmp = 1;
|
209
|
+
} else if (token->start < other->start) {
|
210
|
+
cmp = -1;
|
160
211
|
} else {
|
161
|
-
|
212
|
+
if (token->end > other->end) {
|
213
|
+
cmp = 1;
|
214
|
+
} else if (token->end < other->end) {
|
215
|
+
cmp = -1;
|
216
|
+
} else {
|
217
|
+
cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
|
218
|
+
}
|
162
219
|
}
|
163
|
-
|
164
|
-
return INT2FIX(cmp);
|
220
|
+
return INT2FIX(cmp);
|
165
221
|
}
|
166
222
|
|
223
|
+
/*
|
224
|
+
* call-seq:
|
225
|
+
* token.text -> text
|
226
|
+
*
|
227
|
+
* Returns the text that this token represents
|
228
|
+
*/
|
167
229
|
static VALUE
|
168
230
|
frt_token_get_text(VALUE self)
|
169
231
|
{
|
170
|
-
|
171
|
-
|
232
|
+
RToken *token;
|
233
|
+
GET_TK(token, self);
|
234
|
+
return token->text;
|
172
235
|
}
|
173
236
|
|
237
|
+
/*
|
238
|
+
* call-seq:
|
239
|
+
* token.text = text -> text
|
240
|
+
*
|
241
|
+
* Set the text for this token.
|
242
|
+
*/
|
174
243
|
static VALUE
|
175
244
|
frt_token_set_text(VALUE self, VALUE rtext)
|
176
245
|
{
|
177
|
-
|
178
|
-
|
179
|
-
|
246
|
+
RToken *token;
|
247
|
+
GET_TK(token, self);
|
248
|
+
token->text = rtext;
|
249
|
+
return rtext;
|
180
250
|
}
|
181
251
|
|
252
|
+
/*
|
253
|
+
* call-seq:
|
254
|
+
* token.start -> integer
|
255
|
+
*
|
256
|
+
* Start byte-position of this token
|
257
|
+
*/
|
182
258
|
static VALUE
|
183
259
|
frt_token_get_start_offset(VALUE self)
|
184
260
|
{
|
185
|
-
|
186
|
-
|
261
|
+
RToken *token;
|
262
|
+
GET_TK(token, self);
|
263
|
+
return INT2FIX(token->start);
|
187
264
|
}
|
188
265
|
|
266
|
+
/*
|
267
|
+
* call-seq:
|
268
|
+
* token.end -> integer
|
269
|
+
*
|
270
|
+
* End byte-position of this token
|
271
|
+
*/
|
189
272
|
static VALUE
|
190
273
|
frt_token_get_end_offset(VALUE self)
|
191
274
|
{
|
192
|
-
|
193
|
-
|
275
|
+
RToken *token;
|
276
|
+
GET_TK(token, self);
|
277
|
+
return INT2FIX(token->end);
|
194
278
|
}
|
195
279
|
|
280
|
+
/*
|
281
|
+
* call-seq:
|
282
|
+
* token.pos_inc -> integer
|
283
|
+
*
|
284
|
+
* Position Increment for this token
|
285
|
+
*/
|
196
286
|
static VALUE
|
197
287
|
frt_token_get_pos_inc(VALUE self)
|
198
288
|
{
|
199
|
-
|
200
|
-
|
289
|
+
RToken *token;
|
290
|
+
GET_TK(token, self);
|
291
|
+
return INT2FIX(token->pos_inc);
|
201
292
|
}
|
202
293
|
|
294
|
+
/*
|
295
|
+
* call-seq:
|
296
|
+
* token.start = start -> integer
|
297
|
+
*
|
298
|
+
* Set start byte-position of this token
|
299
|
+
*/
|
300
|
+
static VALUE
|
301
|
+
frt_token_set_start_offset(VALUE self, VALUE rstart)
|
302
|
+
{
|
303
|
+
RToken *token;
|
304
|
+
GET_TK(token, self);
|
305
|
+
token->start = FIX2INT(rstart);
|
306
|
+
return rstart;
|
307
|
+
}
|
308
|
+
|
309
|
+
/*
|
310
|
+
* call-seq:
|
311
|
+
* token.end = end -> integer
|
312
|
+
*
|
313
|
+
* Set end byte-position of this token
|
314
|
+
*/
|
315
|
+
static VALUE
|
316
|
+
frt_token_set_end_offset(VALUE self, VALUE rend)
|
317
|
+
{
|
318
|
+
RToken *token;
|
319
|
+
GET_TK(token, self);
|
320
|
+
token->end = FIX2INT(rend);
|
321
|
+
return rend;
|
322
|
+
}
|
323
|
+
|
324
|
+
/*
|
325
|
+
* call-seq:
|
326
|
+
* token.pos_inc = pos_inc -> integer
|
327
|
+
*
|
328
|
+
* Set the position increment. This determines the position of this token
|
329
|
+
* relative to the previous Token in a TokenStream, used in phrase
|
330
|
+
* searching.
|
331
|
+
*
|
332
|
+
* The default value is 1.
|
333
|
+
*
|
334
|
+
* Some common uses for this are:
|
335
|
+
*
|
336
|
+
* * Set it to zero to put multiple terms in the same position. This is
|
337
|
+
* useful if, e.g., a word has multiple stems. Searches for phrases
|
338
|
+
* including either stem will match. In this case, all but the first
|
339
|
+
* stem's increment should be set to zero: the increment of the first
|
340
|
+
* instance should be one. Repeating a token with an increment of zero
|
341
|
+
* can also be used to boost the scores of matches on that token.
|
342
|
+
*
|
343
|
+
* * Set it to values greater than one to inhibit exact phrase matches.
|
344
|
+
* If, for example, one does not want phrases to match across removed
|
345
|
+
* stop words, then one could build a stop word filter that removes stop
|
346
|
+
* words and also sets the increment to the number of stop words removed
|
347
|
+
* before each non-stop word. Then exact phrase queries will only match
|
348
|
+
* when the terms occur with no intervening stop words.
|
349
|
+
*
|
350
|
+
*/
|
351
|
+
static VALUE
|
352
|
+
frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
|
353
|
+
{
|
354
|
+
RToken *token;
|
355
|
+
GET_TK(token, self);
|
356
|
+
token->pos_inc = FIX2INT(rpos_inc);
|
357
|
+
return rpos_inc;
|
358
|
+
}
|
359
|
+
|
360
|
+
/*
|
361
|
+
* call-seq:
|
362
|
+
* token.to_s -> token_str
|
363
|
+
*
|
364
|
+
* Return a string representation of the token
|
365
|
+
*/
|
203
366
|
static VALUE
|
204
367
|
frt_token_to_s(VALUE self)
|
205
368
|
{
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
369
|
+
RToken *token;
|
370
|
+
GET_TK(token, self);
|
371
|
+
char *buf = alloca(RSTRING(token->text)->len + 80);
|
372
|
+
sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
|
373
|
+
token->end, token->pos_inc);
|
374
|
+
return rb_str_new2(buf);
|
211
375
|
}
|
212
376
|
|
213
377
|
/****************************************************************************
|
@@ -216,143 +380,210 @@ frt_token_to_s(VALUE self)
|
|
216
380
|
*
|
217
381
|
****************************************************************************/
|
218
382
|
|
383
|
+
#define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
|
384
|
+
|
219
385
|
static void
|
220
386
|
frt_ts_mark(void *p)
|
221
387
|
{
|
222
|
-
|
223
|
-
|
224
|
-
if (ts->sub_ts) frt_gc_mark(&ts->sub_ts);
|
388
|
+
TokenStream *ts = (TokenStream *)p;
|
389
|
+
if (ts->text) frt_gc_mark(&ts->text);
|
225
390
|
}
|
226
391
|
|
227
392
|
static void
|
228
393
|
frt_ts_free(TokenStream *ts)
|
229
394
|
{
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
395
|
+
if (object_get(&ts->text) != Qnil) {
|
396
|
+
object_del(&ts->text);
|
397
|
+
}
|
398
|
+
object_del(ts);
|
399
|
+
ts_deref(ts);
|
234
400
|
}
|
235
401
|
|
402
|
+
static void frt_rets_free(TokenStream *ts);
|
403
|
+
static void frt_rets_mark(TokenStream *ts);
|
404
|
+
static Token *rets_next(TokenStream *ts);
|
405
|
+
|
236
406
|
static VALUE
|
237
|
-
|
238
|
-
{
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
407
|
+
get_rb_token_stream(TokenStream *ts)
|
408
|
+
{
|
409
|
+
VALUE rts = object_get(ts);
|
410
|
+
if (rts == Qnil) {
|
411
|
+
if (ts->next == &rets_next) {
|
412
|
+
rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
|
413
|
+
&frt_rets_free, ts);
|
414
|
+
} else {
|
415
|
+
rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
|
416
|
+
&frt_ts_free, ts);
|
417
|
+
}
|
418
|
+
object_add(ts, rts);
|
419
|
+
}
|
420
|
+
return rts;
|
245
421
|
}
|
246
422
|
|
247
423
|
static inline VALUE
|
248
424
|
get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
|
249
425
|
{
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
426
|
+
StringValue(rstr);
|
427
|
+
ts->reset(ts, RSTRING(rstr)->ptr);
|
428
|
+
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
429
|
+
object_add(&ts->text, rstr);
|
430
|
+
object_add(ts, self);
|
431
|
+
return self;
|
256
432
|
}
|
257
433
|
|
434
|
+
/*
|
435
|
+
* call-seq:
|
436
|
+
* token_stream.text = text -> text
|
437
|
+
*
|
438
|
+
* Set the text attribute of the TokenStream to the text you wish to be
|
439
|
+
* tokenized. For example, you may do this;
|
440
|
+
*
|
441
|
+
* token_stream.text = File.read(file_name)
|
442
|
+
*/
|
258
443
|
static VALUE
|
259
444
|
frt_ts_set_text(VALUE self, VALUE rtext)
|
260
445
|
{
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
446
|
+
TokenStream *ts;
|
447
|
+
Data_Get_Struct(self, TokenStream, ts);
|
448
|
+
StringValue(rtext);
|
449
|
+
ts->reset(ts, RSTRING(rtext)->ptr);
|
450
|
+
object_set(&ts->text, rtext);
|
266
451
|
|
267
|
-
|
452
|
+
return rtext;
|
268
453
|
}
|
269
454
|
|
455
|
+
/*
|
456
|
+
* call-seq:
|
457
|
+
* token_stream.text -> text
|
458
|
+
*
|
459
|
+
* Return the text that the TokenStream is tokenizing
|
460
|
+
*/
|
270
461
|
static VALUE
|
271
462
|
frt_ts_get_text(VALUE self)
|
272
463
|
{
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
464
|
+
VALUE rtext = Qnil;
|
465
|
+
TokenStream *ts;
|
466
|
+
Data_Get_Struct(self, TokenStream, ts);
|
467
|
+
if (ts->text) {
|
468
|
+
if ((rtext = object_get(&ts->text)) == Qnil) {
|
469
|
+
rtext = rb_str_new2(ts->text);
|
470
|
+
object_set(&ts->text, rtext);
|
471
|
+
}
|
472
|
+
}
|
473
|
+
return rtext;
|
283
474
|
}
|
284
475
|
|
476
|
+
/*
|
477
|
+
* call-seq:
|
478
|
+
* token_stream.next -> token
|
479
|
+
*
|
480
|
+
* Return the next token from the TokenStream or nil if there are no more
|
481
|
+
* tokens.
|
482
|
+
*/
|
285
483
|
static VALUE
|
286
484
|
frt_ts_next(VALUE self)
|
287
485
|
{
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
486
|
+
TokenStream *ts;
|
487
|
+
GET_TS(ts, self);
|
488
|
+
Token *next = ts->next(ts);
|
489
|
+
if (next == NULL) {
|
490
|
+
return Qnil;
|
491
|
+
}
|
293
492
|
|
294
|
-
|
493
|
+
return get_token(next);
|
295
494
|
}
|
296
495
|
|
496
|
+
/****************************************************************************
|
497
|
+
* TokenFilter
|
498
|
+
****************************************************************************/
|
499
|
+
|
500
|
+
#define TkFilt(filter) ((TokenFilter *)(filter))
|
501
|
+
|
502
|
+
static void
|
503
|
+
frt_tf_mark(void *p)
|
504
|
+
{
|
505
|
+
TokenStream *ts = (TokenStream *)p;
|
506
|
+
if (TkFilt(ts)->sub_ts) {
|
507
|
+
frt_gc_mark(&TkFilt(ts)->sub_ts);
|
508
|
+
}
|
509
|
+
}
|
510
|
+
|
511
|
+
static void
|
512
|
+
frt_tf_free(TokenStream *ts)
|
513
|
+
{
|
514
|
+
if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
|
515
|
+
object_del(&TkFilt(ts)->sub_ts);
|
516
|
+
}
|
517
|
+
object_del(ts);
|
518
|
+
ts_deref(ts);
|
519
|
+
}
|
520
|
+
|
521
|
+
|
297
522
|
/****************************************************************************
|
298
523
|
* CWrappedTokenStream
|
299
524
|
****************************************************************************/
|
300
525
|
|
526
|
+
#define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
|
527
|
+
#define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
|
528
|
+
|
529
|
+
typedef struct CWrappedTokenStream {
|
530
|
+
CachedTokenStream super;
|
531
|
+
VALUE rts;
|
532
|
+
} CWrappedTokenStream;
|
533
|
+
|
301
534
|
static void
|
302
|
-
|
535
|
+
cwrts_destroy_i(TokenStream *ts)
|
303
536
|
{
|
304
|
-
|
305
|
-
|
306
|
-
|
537
|
+
rb_hash_delete(object_space, LONG2NUM(CWTS(ts)->rts));
|
538
|
+
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
539
|
+
free(ts);
|
307
540
|
}
|
308
541
|
|
309
542
|
static Token *
|
310
543
|
cwrts_next(TokenStream *ts)
|
311
544
|
{
|
312
|
-
|
313
|
-
|
314
|
-
return frt_set_token(ts->token, rtoken);
|
545
|
+
VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
|
546
|
+
return frt_set_token(&(CachedTS(ts)->token), rtoken);
|
315
547
|
}
|
316
548
|
|
317
|
-
static
|
549
|
+
static TokenStream *
|
318
550
|
cwrts_reset(TokenStream *ts, char *text)
|
319
551
|
{
|
320
|
-
|
321
|
-
|
322
|
-
|
552
|
+
ts->t = ts->text = text;
|
553
|
+
rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
|
554
|
+
return ts;
|
323
555
|
}
|
324
556
|
|
325
|
-
static
|
326
|
-
cwrts_clone_i(TokenStream *orig_ts
|
557
|
+
static TokenStream *
|
558
|
+
cwrts_clone_i(TokenStream *orig_ts)
|
327
559
|
{
|
328
|
-
|
329
|
-
|
560
|
+
TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
|
561
|
+
CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
|
562
|
+
return new_ts;
|
330
563
|
}
|
331
564
|
|
332
565
|
static TokenStream *
|
333
566
|
frt_get_cwrapped_rts(VALUE rts)
|
334
567
|
{
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
}
|
355
|
-
return ts;
|
568
|
+
TokenStream *ts;
|
569
|
+
switch (TYPE(rts)) {
|
570
|
+
case T_DATA:
|
571
|
+
GET_TS(ts, rts);
|
572
|
+
REF(ts);
|
573
|
+
break;
|
574
|
+
default:
|
575
|
+
ts = ts_new(CWrappedTokenStream);
|
576
|
+
CWTS(ts)->rts = rts;
|
577
|
+
ts->next = &cwrts_next;
|
578
|
+
ts->reset = &cwrts_reset;
|
579
|
+
ts->clone_i = &cwrts_clone_i;
|
580
|
+
ts->destroy_i = &cwrts_destroy_i;
|
581
|
+
/* prevent from being garbage collected */
|
582
|
+
rb_hash_aset(object_space, LONG2NUM(rts), rts);
|
583
|
+
ts->ref_cnt = 1;
|
584
|
+
break;
|
585
|
+
}
|
586
|
+
return ts;
|
356
587
|
}
|
357
588
|
|
358
589
|
/****************************************************************************
|
@@ -364,165 +595,181 @@ frt_get_cwrapped_rts(VALUE rts)
|
|
364
595
|
#define ALPHA "[-_[:alpha:]]"
|
365
596
|
#define ALNUM "[-_[:alnum:]]"
|
366
597
|
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
598
|
+
#define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
|
599
|
+
|
600
|
+
static const char *TOKEN_RE =
|
601
|
+
ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
|
602
|
+
"(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
|
603
|
+
"|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
|
371
604
|
"|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
|
372
605
|
"|(\\.\\w+)+"
|
373
606
|
"|"
|
374
|
-
|
607
|
+
")";
|
375
608
|
static VALUE rtoken_re;
|
376
609
|
|
377
610
|
typedef struct RegExpTokenStream {
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
611
|
+
CachedTokenStream super;
|
612
|
+
VALUE rtext;
|
613
|
+
VALUE regex;
|
614
|
+
VALUE proc;
|
615
|
+
int curr_ind;
|
382
616
|
} RegExpTokenStream;
|
383
617
|
|
384
618
|
static void
|
385
|
-
|
619
|
+
rets_destroy_i(TokenStream *ts)
|
386
620
|
{
|
387
|
-
|
388
|
-
free(ts->data);
|
389
|
-
free(ts->token);
|
390
|
-
free(ts);
|
621
|
+
free(ts);
|
391
622
|
}
|
392
623
|
|
393
624
|
static void
|
394
625
|
frt_rets_free(TokenStream *ts)
|
395
626
|
{
|
396
|
-
|
397
|
-
|
627
|
+
if (object_get(&ts->text) != Qnil) {
|
628
|
+
object_del(&ts->text);
|
629
|
+
}
|
630
|
+
object_del(ts);
|
631
|
+
ts_deref(ts);
|
398
632
|
}
|
399
633
|
|
400
634
|
static void
|
401
635
|
frt_rets_mark(TokenStream *ts)
|
402
636
|
{
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
637
|
+
if (ts->text) frt_gc_mark(&ts->text);
|
638
|
+
rb_gc_mark(RETS(ts)->rtext);
|
639
|
+
rb_gc_mark(RETS(ts)->regex);
|
640
|
+
rb_gc_mark(RETS(ts)->proc);
|
407
641
|
}
|
408
642
|
|
643
|
+
/*
|
644
|
+
* call-seq:
|
645
|
+
* tokenizer.text = text -> text
|
646
|
+
*
|
647
|
+
* Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
|
648
|
+
* tokenize the text from the beginning.
|
649
|
+
*/
|
409
650
|
static VALUE
|
410
651
|
frt_rets_set_text(VALUE self, VALUE rtext)
|
411
652
|
{
|
412
|
-
|
413
|
-
|
414
|
-
|
653
|
+
TokenStream *ts;
|
654
|
+
GET_TS(ts, self);
|
655
|
+
|
656
|
+
StringValue(rtext);
|
657
|
+
RETS(ts)->rtext = rtext;
|
658
|
+
RETS(ts)->curr_ind = 0;
|
415
659
|
|
416
|
-
|
417
|
-
rets = (RegExpTokenStream *)ts->data;
|
418
|
-
rets->rtext = rtext;
|
419
|
-
rets->curr_ind = 0;
|
420
|
-
|
421
|
-
return rtext;
|
660
|
+
return rtext;
|
422
661
|
}
|
423
662
|
|
663
|
+
/*
|
664
|
+
* call-seq:
|
665
|
+
* tokenizer.text -> text
|
666
|
+
*
|
667
|
+
* Get the text being tokenized by the tokenizer.
|
668
|
+
*/
|
424
669
|
static VALUE
|
425
670
|
frt_rets_get_text(VALUE self)
|
426
671
|
{
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
rets = (RegExpTokenStream *)ts->data;
|
431
|
-
return rets->rtext;
|
672
|
+
TokenStream *ts;
|
673
|
+
GET_TS(ts, self);
|
674
|
+
return RETS(ts)->rtext;
|
432
675
|
}
|
433
676
|
|
434
677
|
static Token *
|
435
678
|
rets_next(TokenStream *ts)
|
436
679
|
{
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
680
|
+
static struct re_registers regs;
|
681
|
+
int ret, beg, end;
|
682
|
+
struct RString *rtext = RSTRING(RETS(ts)->rtext);
|
683
|
+
Check_Type(RETS(ts)->regex, T_REGEXP);
|
684
|
+
ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
|
685
|
+
rtext->ptr, rtext->len,
|
686
|
+
RETS(ts)->curr_ind, rtext->len - RETS(ts)->curr_ind,
|
687
|
+
&regs);
|
688
|
+
|
689
|
+
if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
|
690
|
+
if (ret < 0) return NULL; /* not matched */
|
691
|
+
|
692
|
+
beg = regs.beg[0];
|
693
|
+
RETS(ts)->curr_ind = end = regs.end[0];
|
694
|
+
if (NIL_P(RETS(ts)->proc)) {
|
695
|
+
return tk_set(&(CachedTS(ts)->token), rtext->ptr + beg, end - beg,
|
696
|
+
beg, end, 1);
|
697
|
+
} else {
|
698
|
+
VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
|
699
|
+
rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
|
700
|
+
return tk_set(&(CachedTS(ts)->token), RSTRING(rtok)->ptr,
|
701
|
+
RSTRING(rtok)->len, beg, end, 1);
|
702
|
+
}
|
459
703
|
}
|
460
704
|
|
461
|
-
static
|
705
|
+
static TokenStream *
|
462
706
|
rets_reset(TokenStream *ts, char *text)
|
463
707
|
{
|
464
|
-
|
465
|
-
|
466
|
-
|
708
|
+
RETS(ts)->rtext = rb_str_new2(text);
|
709
|
+
RETS(ts)->curr_ind = 0;
|
710
|
+
return ts;
|
467
711
|
}
|
468
712
|
|
469
|
-
|
470
|
-
rets_clone_i(TokenStream *orig_ts
|
713
|
+
static TokenStream *
|
714
|
+
rets_clone_i(TokenStream *orig_ts)
|
471
715
|
{
|
472
|
-
|
473
|
-
|
474
|
-
memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
|
475
|
-
new_ts->data = new_rets;
|
716
|
+
TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
|
717
|
+
return ts;
|
476
718
|
}
|
477
719
|
|
478
720
|
static TokenStream *
|
479
|
-
|
721
|
+
rets_new(VALUE rtext, VALUE regex, VALUE proc)
|
480
722
|
{
|
481
|
-
|
482
|
-
TokenStream *ts;
|
723
|
+
TokenStream *ts;
|
483
724
|
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
}
|
504
|
-
|
505
|
-
ts->data = rets;
|
725
|
+
if (rtext != Qnil) {
|
726
|
+
rtext = StringValue(rtext);
|
727
|
+
}
|
728
|
+
ts = ts_new(RegExpTokenStream);
|
729
|
+
ts->reset = &rets_reset;
|
730
|
+
ts->next = &rets_next;
|
731
|
+
ts->clone_i = &rets_clone_i;
|
732
|
+
ts->destroy_i = &rets_destroy_i;
|
733
|
+
|
734
|
+
RETS(ts)->curr_ind = 0;
|
735
|
+
RETS(ts)->rtext = rtext;
|
736
|
+
RETS(ts)->proc = proc;
|
737
|
+
|
738
|
+
if (NIL_P(regex)) {
|
739
|
+
RETS(ts)->regex = rtoken_re;
|
740
|
+
} else {
|
741
|
+
Check_Type(regex, T_REGEXP);
|
742
|
+
RETS(ts)->regex = regex;
|
743
|
+
}
|
506
744
|
|
507
|
-
|
745
|
+
return ts;
|
508
746
|
}
|
509
747
|
|
748
|
+
/*
|
749
|
+
* call-seq:
|
750
|
+
* RegExpTokenizer.new(input, /[[:alpha:]]+/)
|
751
|
+
*
|
752
|
+
* Create a new tokenizer based on a regular expression
|
753
|
+
*
|
754
|
+
* input:: text to tokenize
|
755
|
+
* regexp:: regular expression used to recognize tokens in the input
|
756
|
+
*/
|
510
757
|
static VALUE
|
511
758
|
frt_rets_init(int argc, VALUE *argv, VALUE self)
|
512
759
|
{
|
513
|
-
|
514
|
-
|
760
|
+
VALUE rtext, regex, proc;
|
761
|
+
TokenStream *ts;
|
515
762
|
|
516
|
-
|
763
|
+
rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
|
517
764
|
|
518
|
-
|
765
|
+
ts = rets_new(rtext, regex, proc);
|
519
766
|
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
767
|
+
Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
|
768
|
+
object_add(ts, self);
|
769
|
+
/* no need to add to object space as it is going to ruby space
|
770
|
+
* rb_hash_aset(object_space, LONG2NUM((long)self), self);
|
771
|
+
*/
|
772
|
+
return self;
|
526
773
|
}
|
527
774
|
|
528
775
|
/****************************************************************************
|
@@ -530,47 +777,92 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
|
|
530
777
|
****************************************************************************/
|
531
778
|
|
532
779
|
#define TS_ARGS(dflt) \
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
780
|
+
bool lower;\
|
781
|
+
VALUE rlower, rstr;\
|
782
|
+
rb_scan_args(argc, argv, "11", &rstr, &rlower);\
|
783
|
+
lower = (argc ? RTEST(rlower) : dflt)
|
784
|
+
|
785
|
+
/*
|
786
|
+
* call-seq:
|
787
|
+
* AsciiLetterTokenizer.new(str) -> tokenizer
|
788
|
+
*
|
789
|
+
* Create a new AsciiLetterTokenizer
|
790
|
+
*/
|
538
791
|
static VALUE
|
539
792
|
frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
|
540
793
|
{
|
541
|
-
|
794
|
+
return get_wrapped_ts(self, rstr, letter_tokenizer_new());
|
542
795
|
}
|
543
796
|
|
797
|
+
/*
|
798
|
+
* call-seq:
|
799
|
+
* LetterTokenizer.new(lower = true) -> tokenizer
|
800
|
+
*
|
801
|
+
* Create a new LetterTokenizer which optionally downcases tokens. Downcasing
|
802
|
+
* is done according to the current locale.
|
803
|
+
*
|
804
|
+
* lower:: set to false if you don't wish to downcase tokens
|
805
|
+
*/
|
544
806
|
static VALUE
|
545
807
|
frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
546
808
|
{
|
547
|
-
|
548
|
-
|
809
|
+
TS_ARGS(false);
|
810
|
+
return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
|
549
811
|
}
|
550
812
|
|
813
|
+
/*
|
814
|
+
* call-seq:
|
815
|
+
* AsciiWhiteSpaceTokenizer.new(str) -> tokenizer
|
816
|
+
*
|
817
|
+
* Create a new AsciiWhiteSpaceTokenizer
|
818
|
+
*/
|
551
819
|
static VALUE
|
552
820
|
frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
|
553
821
|
{
|
554
|
-
|
822
|
+
return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
|
555
823
|
}
|
556
824
|
|
825
|
+
/*
|
826
|
+
* call-seq:
|
827
|
+
* WhiteSpaceTokenizer.new(lower = true) -> tokenizer
|
828
|
+
*
|
829
|
+
* Create a new WhiteSpaceTokenizer which optionally downcases tokens.
|
830
|
+
* Downcasing is done according to the current locale.
|
831
|
+
*
|
832
|
+
* lower:: set to false if you don't wish to downcase tokens
|
833
|
+
*/
|
557
834
|
static VALUE
|
558
835
|
frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
559
836
|
{
|
560
|
-
|
561
|
-
|
837
|
+
TS_ARGS(false);
|
838
|
+
return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
|
562
839
|
}
|
563
840
|
|
841
|
+
/*
|
842
|
+
* call-seq:
|
843
|
+
* AsciiStandardTokenizer.new(str) -> tokenizer
|
844
|
+
*
|
845
|
+
* Create a new AsciiStandardTokenizer
|
846
|
+
*/
|
564
847
|
static VALUE
|
565
848
|
frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
|
566
849
|
{
|
567
|
-
|
850
|
+
return get_wrapped_ts(self, rstr, standard_tokenizer_new());
|
568
851
|
}
|
569
852
|
|
853
|
+
/*
|
854
|
+
* call-seq:
|
855
|
+
* StandardTokenizer.new(lower = true) -> tokenizer
|
856
|
+
*
|
857
|
+
* Create a new StandardTokenizer which optionally downcases tokens.
|
858
|
+
* Downcasing is done according to the current locale.
|
859
|
+
*
|
860
|
+
* lower:: set to false if you don't wish to downcase tokens
|
861
|
+
*/
|
570
862
|
static VALUE
|
571
863
|
frt_standard_tokenizer_init(VALUE self, VALUE rstr)
|
572
864
|
{
|
573
|
-
|
865
|
+
return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
|
574
866
|
}
|
575
867
|
|
576
868
|
/****************************************************************************
|
@@ -578,71 +870,114 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
|
|
578
870
|
****************************************************************************/
|
579
871
|
|
580
872
|
|
873
|
+
/*
|
874
|
+
* call-seq:
|
875
|
+
* AsciiLowerCaseFilter.new(token_stream) -> token_stream
|
876
|
+
*
|
877
|
+
* Create an AsciiLowerCaseFilter which normalizes a token's text to
|
878
|
+
* lowercase but only for Ascii characters. For other characters use
|
879
|
+
* LowerCaseFilter.
|
880
|
+
*/
|
581
881
|
static VALUE
|
582
882
|
frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
583
883
|
{
|
584
|
-
|
585
|
-
|
586
|
-
|
884
|
+
TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
|
885
|
+
ts = lowercase_filter_new(ts);
|
886
|
+
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
587
887
|
|
588
|
-
|
589
|
-
|
590
|
-
|
888
|
+
Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
|
889
|
+
object_add(ts, self);
|
890
|
+
return self;
|
591
891
|
}
|
592
892
|
|
893
|
+
/*
|
894
|
+
* call-seq:
|
895
|
+
* LowerCaseFilter.new(token_stream) -> token_stream
|
896
|
+
*
|
897
|
+
* Create a LowerCaseFilter which normalizes a token's text to
|
898
|
+
* lowercase based on the current locale.
|
899
|
+
*/
|
593
900
|
static VALUE
|
594
901
|
frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
595
902
|
{
|
596
|
-
|
597
|
-
|
598
|
-
|
903
|
+
TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
|
904
|
+
ts = mb_lowercase_filter_new(ts);
|
905
|
+
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
599
906
|
|
600
|
-
|
601
|
-
|
602
|
-
|
907
|
+
Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
|
908
|
+
object_add(ts, self);
|
909
|
+
return self;
|
603
910
|
}
|
604
911
|
|
912
|
+
/*
|
913
|
+
* call-seq:
|
914
|
+
* StopFilter.new(token_stream) -> token_stream
|
915
|
+
* StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
|
916
|
+
*
|
917
|
+
* Create a StopFilter which removes *stop-words* from a TokenStream. You can
|
918
|
+
* optionally specify the stopwords you wish to have removed.
|
919
|
+
*
|
920
|
+
* token_stream:: TokenStream to be filtered
|
921
|
+
* stop_words:: Array of *stop-words* you wish to be filtered out. This
|
922
|
+
* defaults to a list of English stop-words. The
|
923
|
+
* Ferret::Analysis contains a number of stop-word lists.
|
924
|
+
*/
|
605
925
|
static VALUE
|
606
926
|
frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
607
927
|
{
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
928
|
+
VALUE rsub_ts, rstop_words;
|
929
|
+
TokenStream *ts;
|
930
|
+
rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
|
931
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
932
|
+
if (rstop_words != Qnil) {
|
933
|
+
char **stop_words = get_stopwords(rstop_words);
|
934
|
+
ts = stop_filter_new_with_words(ts, (const char **)stop_words);
|
615
935
|
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
936
|
+
free(stop_words);
|
937
|
+
} else {
|
938
|
+
ts = stop_filter_new(ts);
|
939
|
+
}
|
940
|
+
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
621
941
|
|
622
|
-
|
623
|
-
|
624
|
-
|
942
|
+
Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
|
943
|
+
object_add(ts, self);
|
944
|
+
return self;
|
625
945
|
}
|
626
946
|
|
947
|
+
/*
|
948
|
+
* call-seq:
|
949
|
+
* StemFilter.new(token_stream) -> token_stream
|
950
|
+
* StemFilter.new(token_stream,
|
951
|
+
* algorithm="english",
|
952
|
+
* encoding=locale-specific) -> token_stream
|
953
|
+
*
|
954
|
+
* Create a StemFilter which uses a snowball stemmer (thank you, Martin
|
955
|
+
* Porter) to stem words. You can optionally specify the algorithm (default:
|
956
|
+
* "english") and encoding (default: "UTF-8").
|
957
|
+
*
|
958
|
+
* token_stream:: TokenStream to be filtered
|
959
|
+
* algorithm:: The algorithm (or language) to use
|
960
|
+
* encoding:: The encoding of the data (default: "UTF-8")
|
961
|
+
*/
|
627
962
|
static VALUE
|
628
963
|
frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
629
964
|
{
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
965
|
+
VALUE rsub_ts, ralgorithm, rcharenc;
|
966
|
+
char *algorithm = "english";
|
967
|
+
char *charenc = NULL;
|
968
|
+
TokenStream *ts;
|
969
|
+
rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
|
970
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
971
|
+
switch (argc) {
|
972
|
+
case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
|
973
|
+
case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
|
974
|
+
}
|
975
|
+
ts = stem_filter_new(ts, algorithm, charenc);
|
976
|
+
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
642
977
|
|
643
|
-
|
644
|
-
|
645
|
-
|
978
|
+
Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
|
979
|
+
object_add(ts, self);
|
980
|
+
return self;
|
646
981
|
}
|
647
982
|
|
648
983
|
/****************************************************************************
|
@@ -655,216 +990,327 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
|
655
990
|
* CWrappedAnalyzer Methods
|
656
991
|
****************************************************************************/
|
657
992
|
|
993
|
+
#define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
|
994
|
+
|
995
|
+
#define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
|
996
|
+
typedef struct CWrappedAnalyzer
|
997
|
+
{
|
998
|
+
Analyzer super;
|
999
|
+
VALUE ranalyzer;
|
1000
|
+
} CWrappedAnalyzer;
|
1001
|
+
|
658
1002
|
static void
|
659
|
-
|
1003
|
+
cwa_destroy_i(Analyzer *a)
|
660
1004
|
{
|
661
|
-
|
662
|
-
|
1005
|
+
rb_hash_delete(object_space, LONG2NUM(CWA(a)->ranalyzer));
|
1006
|
+
/*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
|
1007
|
+
free(a);
|
663
1008
|
}
|
664
1009
|
|
665
1010
|
static TokenStream *
|
666
1011
|
cwa_get_ts(Analyzer *a, char *field, char *text)
|
667
1012
|
{
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
return frt_get_cwrapped_rts(rts);
|
1013
|
+
VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
|
1014
|
+
rb_str_new2(field), rb_str_new2(text));
|
1015
|
+
return frt_get_cwrapped_rts(rts);
|
672
1016
|
}
|
673
1017
|
|
674
1018
|
Analyzer *
|
675
|
-
frt_get_cwrapped_analyzer(ranalyzer)
|
676
|
-
{
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
1019
|
+
frt_get_cwrapped_analyzer(VALUE ranalyzer)
|
1020
|
+
{
|
1021
|
+
Analyzer *a = NULL;
|
1022
|
+
switch (TYPE(ranalyzer)) {
|
1023
|
+
case T_DATA:
|
1024
|
+
Data_Get_Struct(ranalyzer, Analyzer, a);
|
1025
|
+
REF(a);
|
1026
|
+
break;
|
1027
|
+
default:
|
1028
|
+
a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
|
1029
|
+
a->destroy_i = &cwa_destroy_i;
|
1030
|
+
a->get_ts = &cwa_get_ts;
|
1031
|
+
a->ref_cnt = 1;
|
1032
|
+
((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
|
1033
|
+
/* prevent from being garbage collected */
|
1034
|
+
rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
|
1035
|
+
break;
|
1036
|
+
}
|
1037
|
+
return a;
|
690
1038
|
}
|
691
1039
|
|
692
1040
|
static void
|
693
1041
|
frt_analyzer_free(Analyzer *a)
|
694
1042
|
{
|
695
|
-
|
696
|
-
|
1043
|
+
object_del(a);
|
1044
|
+
a_deref(a);
|
697
1045
|
}
|
698
1046
|
|
699
1047
|
VALUE
|
700
1048
|
frt_get_analyzer(Analyzer *a)
|
701
1049
|
{
|
702
|
-
|
703
|
-
|
704
|
-
|
1050
|
+
VALUE self = Qnil;
|
1051
|
+
if (a) {
|
1052
|
+
self = object_get(a);
|
1053
|
+
if (self == Qnil) {
|
1054
|
+
self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
|
1055
|
+
REF(a);
|
1056
|
+
object_add(a, self);
|
1057
|
+
}
|
1058
|
+
}
|
1059
|
+
return self;
|
705
1060
|
}
|
706
1061
|
|
1062
|
+
/*
|
1063
|
+
* call-seq:
|
1064
|
+
* analyzer.token_stream(field_name, input) -> token_stream
|
1065
|
+
*
|
1066
|
+
* Create a new TokenStream to tokenize +input+. The TokenStream created may
|
1067
|
+
* also depend on the +field_name+. Although this parameter is typically
|
1068
|
+
* ignored.
|
1069
|
+
*
|
1070
|
+
* field_name:: name of the field to be tokenized
|
1071
|
+
* input:: data from the field to be tokenized
|
1072
|
+
*/
|
707
1073
|
static VALUE
|
708
1074
|
frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
709
1075
|
{
|
710
|
-
|
711
|
-
|
1076
|
+
TokenStream *ts;
|
1077
|
+
Analyzer *a;
|
1078
|
+
GET_A(a, self);
|
1079
|
+
|
1080
|
+
StringValue(rfield);
|
1081
|
+
StringValue(rstring);
|
712
1082
|
|
713
|
-
|
714
|
-
rstring = rb_obj_as_string(rstring);
|
715
|
-
|
716
|
-
ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
|
1083
|
+
ts = a_get_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
|
717
1084
|
|
718
|
-
|
719
|
-
|
720
|
-
|
1085
|
+
/* Make sure that there is no entry already */
|
1086
|
+
object_set(&ts->text, rstring);
|
1087
|
+
return get_rb_token_stream(ts);
|
721
1088
|
}
|
722
1089
|
|
723
1090
|
#define GET_LOWER(dflt) \
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
1091
|
+
bool lower;\
|
1092
|
+
VALUE rlower;\
|
1093
|
+
rb_scan_args(argc, argv, "01", &rlower);\
|
1094
|
+
lower = (argc ? RTEST(rlower) : dflt)
|
1095
|
+
|
1096
|
+
/*
|
1097
|
+
* call-seq:
|
1098
|
+
* AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
|
1099
|
+
*
|
1100
|
+
* Create a new AsciiWhiteSpaceAnalyzer which leaves the case of tokens as is
|
1101
|
+
* by default but can optionally downcase them. Lowercasing will only be done to
|
1102
|
+
* ascii characters.
|
1103
|
+
*
|
1104
|
+
* lower:: set to true if you want the field's tokens to be downcased
|
1105
|
+
*/
|
730
1106
|
static VALUE
|
731
1107
|
frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
732
1108
|
{
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
1109
|
+
Analyzer *a;
|
1110
|
+
GET_LOWER(false);
|
1111
|
+
a = whitespace_analyzer_new(lower);
|
1112
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
1113
|
+
object_add(a, self);
|
1114
|
+
return self;
|
739
1115
|
}
|
740
1116
|
|
741
|
-
|
1117
|
+
/*
|
1118
|
+
* call-seq:
|
1119
|
+
* WhiteSpaceAnalyzer.new(lower = false) -> analyzer
|
1120
|
+
*
|
1121
|
+
* Create a new WhiteSpaceAnalyzer which leaves the case of tokens as is by
|
1122
|
+
* default but can optionally downcase them. Lowercasing will be done based on the current
|
1123
|
+
* locale.
|
1124
|
+
*
|
1125
|
+
* lower:: set to true if you want the field's tokens to be downcased
|
1126
|
+
*/
|
742
1127
|
static VALUE
|
743
1128
|
frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
744
1129
|
{
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
1130
|
+
Analyzer *a;
|
1131
|
+
GET_LOWER(false);
|
1132
|
+
a = mb_whitespace_analyzer_new(lower);
|
1133
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
1134
|
+
object_add(a, self);
|
1135
|
+
return self;
|
751
1136
|
}
|
752
1137
|
|
753
|
-
|
1138
|
+
/*
|
1139
|
+
* call-seq:
|
1140
|
+
* AsciiLetterAnalyzer.new(lower = true) -> analyzer
|
1141
|
+
*
|
1142
|
+
* Create a new AsciiLetterAnalyzer which downcases tokens by default
|
1143
|
+
* but can optionally leave case as is. Lowercasing will only be done to
|
1144
|
+
* ascii characters.
|
1145
|
+
*
|
1146
|
+
* lower:: set to false if you don't want the field's tokens to be downcased
|
1147
|
+
*/
|
754
1148
|
static VALUE
|
755
1149
|
frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
756
1150
|
{
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
1151
|
+
Analyzer *a;
|
1152
|
+
GET_LOWER(true);
|
1153
|
+
a = letter_analyzer_new(lower);
|
1154
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
1155
|
+
object_add(a, self);
|
1156
|
+
return self;
|
763
1157
|
}
|
764
1158
|
|
765
|
-
|
1159
|
+
/*
|
1160
|
+
* call-seq:
|
1161
|
+
* LetterAnalyzer.new(lower = true) -> analyzer
|
1162
|
+
*
|
1163
|
+
* Create a new LetterAnalyzer which downcases tokens by default but can
|
1164
|
+
* optionally leave case as is. Lowercasing will be done based on the current
|
1165
|
+
* locale.
|
1166
|
+
*
|
1167
|
+
* lower:: set to false if you don't want the field's tokens to be downcased
|
1168
|
+
*/
|
766
1169
|
static VALUE
|
767
1170
|
frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
768
1171
|
{
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
1172
|
+
Analyzer *a;
|
1173
|
+
GET_LOWER(true);
|
1174
|
+
a = mb_letter_analyzer_new(lower);
|
1175
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
1176
|
+
object_add(a, self);
|
1177
|
+
return self;
|
775
1178
|
}
|
776
1179
|
|
777
1180
|
static VALUE
|
778
1181
|
get_rstopwords(const char **stop_words)
|
779
1182
|
{
|
780
|
-
|
781
|
-
|
1183
|
+
char **w = (char **)stop_words;
|
1184
|
+
VALUE rstopwords = rb_ary_new();
|
782
1185
|
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
1186
|
+
while (*w) {
|
1187
|
+
rb_ary_push(rstopwords, rb_str_new2(*w));
|
1188
|
+
w++;
|
1189
|
+
}
|
1190
|
+
return rstopwords;
|
788
1191
|
}
|
789
1192
|
|
790
|
-
|
1193
|
+
/*
|
1194
|
+
* call-seq:
|
1195
|
+
* AsciiStandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
|
1196
|
+
* -> analyzer
|
1197
|
+
*
|
1198
|
+
* Create a new AsciiStandardAnalyzer which downcases tokens by default but
|
1199
|
+
* can optionally leave case as is. Lowercasing will only be done to ascii
|
1200
|
+
* characters. You can also set the list of stop-words to be used by the
|
1201
|
+
* StopFilter.
|
1202
|
+
*
|
1203
|
+
* lower:: set to false if you don't want the field's tokens to be downcased
|
1204
|
+
* stop_words:: list of stop-words to pass to the StopFilter
|
1205
|
+
*/
|
791
1206
|
static VALUE
|
792
1207
|
frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
793
1208
|
{
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
}
|
810
|
-
|
811
|
-
|
1209
|
+
bool lower;
|
1210
|
+
VALUE rlower, rstop_words;
|
1211
|
+
Analyzer *a;
|
1212
|
+
rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
|
1213
|
+
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
1214
|
+
if (rstop_words != Qnil) {
|
1215
|
+
char **stop_words = get_stopwords(rstop_words);
|
1216
|
+
a = standard_analyzer_new_with_words((const char **)stop_words, lower);
|
1217
|
+
free(stop_words);
|
1218
|
+
} else {
|
1219
|
+
a = standard_analyzer_new(lower);
|
1220
|
+
}
|
1221
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
1222
|
+
object_add(a, self);
|
1223
|
+
return self;
|
1224
|
+
}
|
1225
|
+
|
1226
|
+
/*
|
1227
|
+
* call-seq:
|
1228
|
+
* StandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
|
1229
|
+
* -> analyzer
|
1230
|
+
*
|
1231
|
+
* Create a new StandardAnalyzer which downcases tokens by default but can
|
1232
|
+
* optionally leave case as is. Lowercasing will be done based on the current
|
1233
|
+
* locale. You can also set the list of stop-words to be used by the
|
1234
|
+
* StopFilter.
|
1235
|
+
*
|
1236
|
+
* lower:: set to false if you don't want the field's tokens to be downcased
|
1237
|
+
* stop_words:: list of stop-words to pass to the StopFilter
|
1238
|
+
*/
|
812
1239
|
static VALUE
|
813
1240
|
frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
814
1241
|
{
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
1242
|
+
bool lower;
|
1243
|
+
VALUE rlower, rstop_words;
|
1244
|
+
Analyzer *a;
|
1245
|
+
rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
|
1246
|
+
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
1247
|
+
if (rstop_words != Qnil) {
|
1248
|
+
char **stop_words = get_stopwords(rstop_words);
|
1249
|
+
a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
|
1250
|
+
free(stop_words);
|
1251
|
+
} else {
|
1252
|
+
a = mb_standard_analyzer_new(lower);
|
1253
|
+
}
|
1254
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
1255
|
+
object_add(a, self);
|
1256
|
+
return self;
|
830
1257
|
}
|
831
1258
|
|
832
|
-
void
|
1259
|
+
static void
|
833
1260
|
frt_h_mark_values_i(void *key, void *value, void *arg)
|
834
1261
|
{
|
835
|
-
|
1262
|
+
frt_gc_mark(value);
|
836
1263
|
}
|
837
1264
|
|
838
|
-
void
|
1265
|
+
static void
|
839
1266
|
frt_pfa_mark(void *p)
|
840
1267
|
{
|
841
|
-
|
842
|
-
|
843
|
-
frt_gc_mark(pfa->def);
|
844
|
-
h_each(pfa->dict, &frt_h_mark_values_i, NULL);
|
1268
|
+
frt_gc_mark(PFA(p)->default_a);
|
1269
|
+
h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
|
845
1270
|
}
|
846
1271
|
|
847
1272
|
/*** PerFieldAnalyzer ***/
|
848
1273
|
|
1274
|
+
/*
|
1275
|
+
* call-seq:
|
1276
|
+
* PerFieldAnalyzer.new(default_analyzer) -> analyzer
|
1277
|
+
*
|
1278
|
+
* Create a new PerFieldAnalyzer specifying the default analyzer to use on
|
1279
|
+
* all fields that aren't set specifically.
|
1280
|
+
*
|
1281
|
+
* default_analyzer:: analyzer to be used on fields that aren't otherwise
|
1282
|
+
* specified
|
1283
|
+
*/
|
849
1284
|
static VALUE
|
850
1285
|
frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
|
851
1286
|
{
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
1287
|
+
Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
|
1288
|
+
Analyzer *a = per_field_analyzer_new(def);
|
1289
|
+
Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
|
1290
|
+
object_add(a, self);
|
1291
|
+
return self;
|
857
1292
|
}
|
858
1293
|
|
1294
|
+
/*
|
1295
|
+
* call-seq:
|
1296
|
+
* per_field_analyzer.add_field(field_name, analyzer) -> self
|
1297
|
+
* per_field_analyzer[field_name] = analyzer -> self
|
1298
|
+
*
|
1299
|
+
* Set the analyzer to be used on field +field_name+. Note that field_name
|
1300
|
+
* should be a symbol.
|
1301
|
+
*
|
1302
|
+
* field_name:: field we wish to set the analyzer for
|
1303
|
+
* analyzer:: analyzer to be used on +field_name+
|
1304
|
+
*/
|
859
1305
|
static VALUE
|
860
1306
|
frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
861
1307
|
{
|
862
|
-
|
863
|
-
|
864
|
-
|
1308
|
+
Analyzer *pfa, *a;
|
1309
|
+
Data_Get_Struct(self, Analyzer, pfa);
|
1310
|
+
a = frt_get_cwrapped_analyzer(ranalyzer);
|
865
1311
|
|
866
|
-
|
867
|
-
|
1312
|
+
pfa_add_field(pfa, StringValuePtr(rfield), a);
|
1313
|
+
return self;
|
868
1314
|
}
|
869
1315
|
|
870
1316
|
/*** RegExpAnalyzer ***/
|
@@ -872,36 +1318,46 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
|
872
1318
|
static void
|
873
1319
|
frt_re_analyzer_mark(Analyzer *a)
|
874
1320
|
{
|
875
|
-
|
1321
|
+
frt_gc_mark(a->current_ts);
|
876
1322
|
}
|
877
1323
|
|
878
1324
|
static void
|
879
|
-
|
1325
|
+
re_analyzer_destroy_i(Analyzer *a)
|
880
1326
|
{
|
881
|
-
|
882
|
-
|
1327
|
+
ts_deref(a->current_ts);
|
1328
|
+
free(a);
|
883
1329
|
}
|
884
1330
|
|
1331
|
+
/*
|
1332
|
+
* call-seq:
|
1333
|
+
* RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
|
1334
|
+
*
|
1335
|
+
* Create a new RegExpAnalyzer which will create tokenizers based on the
|
1336
|
+
* regular expression and lowercasing if required.
|
1337
|
+
*
|
1338
|
+
* reg_exp:: the token matcher for the tokenizer to use
|
1339
|
+
* lower:: set to false if you don't want to downcase the tokens
|
1340
|
+
*/
|
885
1341
|
static VALUE
|
886
1342
|
frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
887
1343
|
{
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
1344
|
+
VALUE lower, rets, regex, proc;
|
1345
|
+
Analyzer *a;
|
1346
|
+
TokenStream *ts;
|
1347
|
+
rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
|
892
1348
|
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
1349
|
+
ts = rets_new(Qnil, regex, proc);
|
1350
|
+
rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
|
1351
|
+
REF(ts);
|
1352
|
+
/* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
|
1353
|
+
object_add(ts, rets);
|
898
1354
|
|
899
|
-
|
1355
|
+
if (lower != Qfalse) ts = mb_lowercase_filter_new(ts);
|
900
1356
|
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
1357
|
+
a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
|
1358
|
+
Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
|
1359
|
+
object_add(a, self);
|
1360
|
+
return self;
|
905
1361
|
}
|
906
1362
|
|
907
1363
|
/****************************************************************************
|
@@ -912,265 +1368,818 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
912
1368
|
|
913
1369
|
static char *frt_locale = NULL;
|
914
1370
|
|
915
|
-
|
1371
|
+
/*
|
1372
|
+
* call-seq:
|
1373
|
+
* Ferret.locale -> locale_str
|
1374
|
+
*
|
1375
|
+
* Returns a string corresponding to the locale set. For example;
|
1376
|
+
*
|
1377
|
+
* puts Ferret.locale #=> "en_US.UTF-8"
|
1378
|
+
*/
|
1379
|
+
static VALUE frt_get_locale(VALUE self, VALUE locale)
|
916
1380
|
{
|
917
|
-
|
1381
|
+
return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
|
918
1382
|
}
|
919
1383
|
|
920
|
-
|
1384
|
+
/*
|
1385
|
+
* call-seq:
|
1386
|
+
* Ferret.locale = "en_US.UTF-8"
|
1387
|
+
*
|
1388
|
+
* Set the global locale. You should use this method to set different locales
|
1389
|
+
* when indexing documents with different encodings.
|
1390
|
+
*/
|
1391
|
+
static VALUE frt_set_locale(VALUE self, VALUE locale)
|
921
1392
|
{
|
922
|
-
|
923
|
-
|
924
|
-
|
1393
|
+
char *l = ((locale == Qnil) ? NULL : RSTRING(rb_obj_as_string(locale))->ptr);
|
1394
|
+
frt_locale = setlocale(LC_ALL, l);
|
1395
|
+
return frt_locale ? rb_str_new2(frt_locale) : Qnil;
|
925
1396
|
}
|
926
1397
|
|
927
1398
|
/****************************************************************************
|
928
1399
|
*
|
929
|
-
* Init
|
1400
|
+
* Init Functions
|
930
1401
|
*
|
931
1402
|
****************************************************************************/
|
932
1403
|
|
1404
|
+
/*
|
1405
|
+
* Document-class: Ferret::Analysis::Token
|
1406
|
+
*
|
1407
|
+
* == Summary
|
1408
|
+
*
|
1409
|
+
* A Token is an occurrence of a term from the text of a field. It consists
|
1410
|
+
* of a term's text and the start and end offset of the term in the text of
|
1411
|
+
* the field;
|
1412
|
+
*
|
1413
|
+
* The start and end offsets permit applications to re-associate a token with
|
1414
|
+
* its source text, e.g., to display highlighted query terms in a document
|
1415
|
+
* browser, or to show matching text fragments in a KWIC (KeyWord In Context)
|
1416
|
+
* display, etc.
|
1417
|
+
*
|
1418
|
+
* === Attributes
|
1419
|
+
*
|
1420
|
+
* text:: the terms text which may have been modified by a Token Filter or
|
1421
|
+
* Tokenizer from the text originally found in the document
|
1422
|
+
* start:: is the position of the first character corresponding to
|
1423
|
+
* this token in the source text
|
1424
|
+
* end:: is equal to one greater than the position of the last
|
1425
|
+
* character corresponding to this token. Note that the
|
1426
|
+
* difference between @end_offset and @start_offset may not be
|
1427
|
+
* equal to @text.length(), as the term text may have been
|
1428
|
+
* altered by a stemmer or some other filter.
|
1429
|
+
*/
|
1430
|
+
static void Init_Token(void)
|
1431
|
+
{
|
1432
|
+
cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
|
1433
|
+
rb_define_alloc_func(cToken, frt_token_alloc);
|
1434
|
+
rb_include_module(cToken, rb_mComparable);
|
1435
|
+
|
1436
|
+
rb_define_method(cToken, "initialize", frt_token_init, -1);
|
1437
|
+
rb_define_method(cToken, "<=>", frt_token_cmp, 1);
|
1438
|
+
rb_define_method(cToken, "text", frt_token_get_text, 0);
|
1439
|
+
rb_define_method(cToken, "text=", frt_token_set_text, 1);
|
1440
|
+
rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
|
1441
|
+
rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
|
1442
|
+
rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
|
1443
|
+
rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
|
1444
|
+
rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
|
1445
|
+
rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
|
1446
|
+
rb_define_method(cToken, "to_s", frt_token_to_s, 0);
|
1447
|
+
}
|
1448
|
+
|
1449
|
+
/*
|
1450
|
+
* Document-class: Ferret::Analysis::TokenStream
|
1451
|
+
*
|
1452
|
+
* A TokenStream enumerates the sequence of tokens, either from
|
1453
|
+
* fields of a document or from query text.
|
1454
|
+
*
|
1455
|
+
* This is an abstract class. Concrete subclasses are:
|
1456
|
+
*
|
1457
|
+
* Tokenizer:: a TokenStream whose input is a string
|
1458
|
+
* TokenFilter:: a TokenStream whose input is another TokenStream
|
1459
|
+
*/
|
1460
|
+
static void Init_TokenStream(void)
|
1461
|
+
{
|
1462
|
+
cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
|
1463
|
+
rb_define_method(cTokenStream, "next", frt_ts_next, 0);
|
1464
|
+
rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
|
1465
|
+
rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
|
1466
|
+
}
|
1467
|
+
|
1468
|
+
/*
|
1469
|
+
* Document-class: Ferret::Analysis::AsciiLetterTokenizer
|
1470
|
+
*
|
1471
|
+
* An AsciiLetterTokenizer is a tokenizer that divides text at any character
* that is not an ascii letter.
|
1472
|
+
* That is to say, it defines tokens as maximal strings of adjacent letters,
|
1473
|
+
* as defined by the regular expression _/[A-Za-z]+/_.
|
1474
|
+
*
|
1475
|
+
* === Example
|
1476
|
+
*
|
1477
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1478
|
+
* => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
|
1479
|
+
*/
|
1480
|
+
static void Init_AsciiLetterTokenizer(void)
|
1481
|
+
{
|
1482
|
+
cAsciiLetterTokenizer =
|
1483
|
+
rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
|
1484
|
+
rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
|
1485
|
+
rb_define_method(cAsciiLetterTokenizer, "initialize",
|
1486
|
+
frt_a_letter_tokenizer_init, 1);
|
1487
|
+
}
|
1488
|
+
|
1489
|
+
/*
|
1490
|
+
* Document-class: Ferret::Analysis::LetterTokenizer
|
1491
|
+
*
|
1492
|
+
* A LetterTokenizer is a tokenizer that divides text at non-letters. That is
|
1493
|
+
* to say, it defines tokens as maximal strings of adjacent letters, as
|
1494
|
+
* defined by the regular expression _/[[:alpha:]]+/_ where [:alpha:] matches
|
1495
|
+
* all characters in your local locale.
|
1496
|
+
*
|
1497
|
+
* === Example
|
1498
|
+
*
|
1499
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1500
|
+
* => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
|
1501
|
+
*/
|
1502
|
+
static void Init_LetterTokenizer(void)
|
1503
|
+
{
|
1504
|
+
cLetterTokenizer =
|
1505
|
+
rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
|
1506
|
+
rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
|
1507
|
+
rb_define_method(cLetterTokenizer, "initialize",
|
1508
|
+
frt_letter_tokenizer_init, -1);
|
1509
|
+
}
|
1510
|
+
|
1511
|
+
/*
|
1512
|
+
* Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
|
1513
|
+
*
|
1514
|
+
* An AsciiWhiteSpaceTokenizer is a tokenizer that divides text at white-space.
|
1515
|
+
* Adjacent sequences of non-WhiteSpace characters form tokens.
|
1516
|
+
*
|
1517
|
+
* === Example
|
1518
|
+
*
|
1519
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1520
|
+
* => ["Dave's", "résumé,", "at", "http://www.davebalmain.com/", "1234"]
|
1521
|
+
*/
|
1522
|
+
static void Init_AsciiWhiteSpaceTokenizer(void)
|
1523
|
+
{
|
1524
|
+
cAsciiWhiteSpaceTokenizer =
|
1525
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
|
1526
|
+
cTokenStream);
|
1527
|
+
rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
|
1528
|
+
rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
|
1529
|
+
frt_a_whitespace_tokenizer_init, 1);
|
1530
|
+
}
|
1531
|
+
|
1532
|
+
/*
|
1533
|
+
* Document-class: Ferret::Analysis::WhiteSpaceTokenizer
|
1534
|
+
*
|
1535
|
+
* A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
|
1536
|
+
* Adjacent sequences of non-WhiteSpace characters form tokens.
|
1537
|
+
*
|
1538
|
+
* === Example
|
1539
|
+
*
|
1540
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1541
|
+
* => ["Dave's", "résumé,", "at", "http://www.davebalmain.com/", "1234"]
|
1542
|
+
*/
|
1543
|
+
static void Init_WhiteSpaceTokenizer(void)
|
1544
|
+
{
|
1545
|
+
cWhiteSpaceTokenizer =
|
1546
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
|
1547
|
+
rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
|
1548
|
+
rb_define_method(cWhiteSpaceTokenizer, "initialize",
|
1549
|
+
frt_whitespace_tokenizer_init, -1);
|
1550
|
+
}
|
1551
|
+
|
1552
|
+
/*
|
1553
|
+
* Document-class: Ferret::Analysis::AsciiStandardTokenizer
|
1554
|
+
*
|
1555
|
+
* The standard tokenizer is an advanced tokenizer which tokenizes most
|
1556
|
+
* words correctly as well as tokenizing things like email addresses, web
|
1557
|
+
* addresses, phone numbers, etc.
|
1558
|
+
*
|
1559
|
+
* === Example
|
1560
|
+
*
|
1561
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1562
|
+
* => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
|
1563
|
+
*/
|
1564
|
+
static void Init_AsciiStandardTokenizer(void)
|
1565
|
+
{
|
1566
|
+
cAsciiStandardTokenizer =
|
1567
|
+
rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
|
1568
|
+
rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
|
1569
|
+
rb_define_method(cAsciiStandardTokenizer, "initialize",
|
1570
|
+
frt_a_standard_tokenizer_init, 1);
|
1571
|
+
}
|
1572
|
+
|
1573
|
+
/*
|
1574
|
+
* Document-class: Ferret::Analysis::StandardTokenizer
|
1575
|
+
*
|
1576
|
+
* The standard tokenizer is an advanced tokenizer which tokenizes most
|
1577
|
+
* words correctly as well as tokenizing things like email addresses, web
|
1578
|
+
* addresses, phone numbers, etc.
|
1579
|
+
*
|
1580
|
+
* === Example
|
1581
|
+
*
|
1582
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1583
|
+
* => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
|
1584
|
+
*/
|
1585
|
+
static void Init_StandardTokenizer(void)
|
1586
|
+
{
|
1587
|
+
cStandardTokenizer =
|
1588
|
+
rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
|
1589
|
+
rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
|
1590
|
+
rb_define_method(cStandardTokenizer, "initialize",
|
1591
|
+
frt_standard_tokenizer_init, 1);
|
1592
|
+
}
|
1593
|
+
|
1594
|
+
/*
|
1595
|
+
* Document-class: Ferret::Analysis::RegExpTokenizer
|
1596
|
+
*
|
1597
|
+
* A tokenizer that recognizes tokens based on a regular expression passed to
|
1598
|
+
* the constructor. Most possible tokenizers can be created using this class.
|
1599
|
+
*
|
1600
|
+
* === Example
|
1601
|
+
*
|
1602
|
+
* Below is an example of a simple implementation of a LetterTokenizer using
|
1603
|
+
* a RegExpTokenizer. Basically, a token is a sequence of alphabetic
|
1604
|
+
* characters separated by one or more non-alphabetic characters.
|
1605
|
+
*
|
1606
|
+
* # of course you would add more than just é
|
1607
|
+
* RegExpTokenizer.new(input, /[[:alpha:]é]+/)
|
1608
|
+
*
|
1609
|
+
* "Dave's résumé, at http://www.davebalmain.com/ 1234"
|
1610
|
+
* => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
|
1611
|
+
*/
|
1612
|
+
static void Init_RegExpTokenizer(void)
|
1613
|
+
{
|
1614
|
+
cRegExpTokenizer =
|
1615
|
+
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1616
|
+
rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
|
1617
|
+
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1618
|
+
rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
|
1619
|
+
rb_define_method(cRegExpTokenizer, "initialize",
|
1620
|
+
frt_rets_init, -1);
|
1621
|
+
rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
|
1622
|
+
rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
|
1623
|
+
}
|
1624
|
+
|
1625
|
+
/***************/
|
1626
|
+
/*** Filters ***/
|
1627
|
+
/***************/
|
1628
|
+
|
1629
|
+
/*
|
1630
|
+
* Document-class: Ferret::Analysis::AsciiLowerCaseFilter
|
1631
|
+
*
|
1632
|
+
* AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
|
1633
|
+
* Ascii characters. For other characters use LowerCaseFilter.
|
1634
|
+
*
|
1635
|
+
* === Example
|
1636
|
+
*
|
1637
|
+
* ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
|
1638
|
+
*
|
1639
|
+
*/
|
1640
|
+
static void Init_AsciiLowerCaseFilter(void)
|
1641
|
+
{
|
1642
|
+
cAsciiLowerCaseFilter =
|
1643
|
+
rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
|
1644
|
+
rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
|
1645
|
+
rb_define_method(cAsciiLowerCaseFilter, "initialize",
|
1646
|
+
frt_a_lowercase_filter_init, 1);
|
1647
|
+
}
|
1648
|
+
|
1649
|
+
/*
|
1650
|
+
* Document-class: Ferret::Analysis::LowerCaseFilter
|
1651
|
+
*
|
1652
|
+
* LowerCaseFilter normalizes a token's text to lowercase based on the
|
1653
|
+
* current locale.
|
1654
|
+
*
|
1655
|
+
* === Example
|
1656
|
+
*
|
1657
|
+
* ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
|
1658
|
+
*
|
1659
|
+
*/
|
1660
|
+
static void Init_LowerCaseFilter(void)
|
1661
|
+
{
|
1662
|
+
cLowerCaseFilter =
|
1663
|
+
rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
|
1664
|
+
rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
|
1665
|
+
rb_define_method(cLowerCaseFilter, "initialize",
|
1666
|
+
frt_lowercase_filter_init, 1);
|
1667
|
+
}
|
1668
|
+
|
1669
|
+
/*
|
1670
|
+
* Document-class: Ferret::Analysis::StopFilter
|
1671
|
+
*
|
1672
|
+
* A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
|
1673
|
+
* that you don't wish to be indexed. Usually they will be common words like
|
1674
|
+
* "the" and "and" although you can specify whichever words you want.
|
1675
|
+
*
|
1676
|
+
* === Example
|
1677
|
+
*
|
1678
|
+
* ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
|
1679
|
+
*/
|
1680
|
+
static void Init_StopFilter(void)
|
1681
|
+
{
|
1682
|
+
cStopFilter =
|
1683
|
+
rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
|
1684
|
+
rb_define_alloc_func(cStopFilter, frt_data_alloc);
|
1685
|
+
rb_define_method(cStopFilter, "initialize",
|
1686
|
+
frt_stop_filter_init, -1);
|
1687
|
+
}
|
1688
|
+
|
1689
|
+
/*
|
1690
|
+
* Document-class: Ferret::Analysis::StemFilter
|
1691
|
+
*
|
1692
|
+
* == Summary
|
1693
|
+
*
|
1694
|
+
* A StemFilter takes a term and transforms the term as per the SnowBall
|
1695
|
+
* stemming algorithm. Note: the input to the stemming filter must already
|
1696
|
+
* be in lower case, so you will need to use LowerCaseFilter or
|
1697
|
+
* LowerCaseTokenizer further down the Tokenizer chain in order for this to
|
1698
|
+
* work properly!
|
1699
|
+
*
|
1700
|
+
* To use this filter with other analyzers, you'll want to write an Analyzer
|
1701
|
+
* class that sets up the TokenStream chain as you want it. To use this with
|
1702
|
+
* LowerCaseTokenizer, for example, you'd write an analyzer like this:
|
1703
|
+
*
|
1704
|
+
* === Available algorithms and encodings
|
1705
|
+
*
|
1706
|
+
* Algorithm Algorithm Pseudonyms Encoding
|
1707
|
+
* ----------------------------------------------------------------
|
1708
|
+
* "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
|
1709
|
+
* "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
|
1710
|
+
* "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
|
1711
|
+
* "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
|
1712
|
+
* "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
|
1713
|
+
* "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
|
1714
|
+
* "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
|
1715
|
+
* "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
|
1716
|
+
* "porter", | | "ISO_8859_1", "UTF_8"
|
1717
|
+
* "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
|
1718
|
+
* "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
|
1719
|
+
* "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
|
1720
|
+
* "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
|
1721
|
+
*
|
1722
|
+
* === Example
|
1723
|
+
*
|
1724
|
+
* class MyAnalyzer < Analyzer
|
1725
|
+
* def token_stream(field, str)
|
1726
|
+
* return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
|
1727
|
+
* end
|
1728
|
+
* end
|
1729
|
+
*
|
1730
|
+
* "debate debates debated debating debater"
|
1731
|
+
* => ["debat", "debat", "debat", "debat", "debat"]
|
1732
|
+
*
|
1733
|
+
* === Attributes
|
1734
|
+
*
|
1735
|
+
* token_stream:: TokenStream to be filtered
|
1736
|
+
* algorithm:: The algorithm (or language) to use (default: "english")
|
1737
|
+
* encoding:: The encoding of the data (default: "UTF-8")
|
1738
|
+
*/
|
1739
|
+
static void Init_StemFilter(void)
|
1740
|
+
{
|
1741
|
+
cStemFilter =
|
1742
|
+
rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
|
1743
|
+
rb_define_alloc_func(cStemFilter, frt_data_alloc);
|
1744
|
+
rb_define_method(cStemFilter, "initialize",
|
1745
|
+
frt_stem_filter_init, -1);
|
1746
|
+
}
|
1747
|
+
|
1748
|
+
/*************************/
|
1749
|
+
/*** * * Analyzers * * ***/
|
1750
|
+
/*************************/
|
1751
|
+
|
1752
|
+
/*
|
1753
|
+
* Document-class: Ferret::Analysis::Analyzer
|
1754
|
+
*
|
1755
|
+
* == Summary
|
1756
|
+
*
|
1757
|
+
* An Analyzer builds TokenStreams, which analyze text. It thus represents
|
1758
|
+
* a policy for extracting index terms from text.
|
1759
|
+
*
|
1760
|
+
* Typical implementations first build a Tokenizer, which breaks the stream
|
1761
|
+
* of characters from the Reader into raw Tokens. One or more TokenFilters
|
1762
|
+
* may then be applied to the output of the Tokenizer.
|
1763
|
+
*
|
1764
|
+
* The default Analyzer just creates a LowerCaseTokenizer which converts
|
1765
|
+
* all text to lowercase tokens. See LowerCaseTokenizer for more details.
|
1766
|
+
*
|
1767
|
+
* === Example
|
1768
|
+
*
|
1769
|
+
* To create your own custom Analyzer you simply need to implement a
|
1770
|
+
* token_stream method which takes the field name and the data to be
|
1771
|
+
* tokenized as parameters and returns a TokenStream. Most analyzers
|
1772
|
+
* typically ignore the field name.
|
1773
|
+
*
|
1774
|
+
* Here we'll create a StemmingAnalyzer;
|
1775
|
+
*
|
1776
|
+
* class MyAnalyzer < Analyzer
|
1777
|
+
* def token_stream(field, str)
|
1778
|
+
* return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
|
1779
|
+
* end
|
1780
|
+
* end
|
1781
|
+
*/
|
1782
|
+
static void Init_Analyzer(void)
|
1783
|
+
{
|
1784
|
+
cAnalyzer =
|
1785
|
+
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
1786
|
+
rb_define_alloc_func(cAnalyzer, frt_data_alloc);
|
1787
|
+
rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
|
1788
|
+
rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
|
1789
|
+
}
|
1790
|
+
|
1791
|
+
/*
|
1792
|
+
* Document-class: Ferret::Analysis::AsciiLetterAnalyzer
|
1793
|
+
*
|
1794
|
+
* == Summary
|
1795
|
+
*
|
1796
|
+
* An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
|
1797
|
+
* maximal strings of Ascii letters. If implemented in Ruby it would look
|
1798
|
+
* like;
|
1799
|
+
*
|
1800
|
+
* class AsciiLetterAnalyzer
|
1801
|
+
* def initialize(lower = true)
|
1802
|
+
* @lower = lower
|
1803
|
+
* end
|
1804
|
+
*
|
1805
|
+
* def token_stream(field, str)
|
1806
|
+
* if @lower
|
1807
|
+
* return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
|
1808
|
+
* else
|
1809
|
+
* return AsciiLetterTokenizer.new(str)
|
1810
|
+
* end
|
1811
|
+
* end
|
1812
|
+
* end
|
1813
|
+
*
|
1814
|
+
* As you can see it makes use of the AsciiLetterTokenizer and
|
1815
|
+
* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ascii
|
1816
|
+
* characters so you should use the LetterAnalyzer if you want to analyze
|
1817
|
+
* multi-byte data like "UTF-8".
|
1818
|
+
*/
|
1819
|
+
static void Init_AsciiLetterAnalyzer(void)
|
1820
|
+
{
|
1821
|
+
cAsciiLetterAnalyzer =
|
1822
|
+
rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
|
1823
|
+
rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
|
1824
|
+
rb_define_method(cAsciiLetterAnalyzer, "initialize",
|
1825
|
+
frt_a_letter_analyzer_init, -1);
|
1826
|
+
}
|
1827
|
+
|
1828
|
+
/*
|
1829
|
+
* Document-class: Ferret::Analysis::LetterAnalyzer
|
1830
|
+
*
|
1831
|
+
* == Summary
|
1832
|
+
*
|
1833
|
+
* A LetterAnalyzer creates a TokenStream that splits the input up into
|
1834
|
+
* maximal strings of characters as recognized by the current locale. If
|
1835
|
+
* implemented in Ruby it would look like;
|
1836
|
+
*
|
1837
|
+
* class LetterAnalyzer
|
1838
|
+
* def initialize(lower = true)
|
1839
|
+
* @lower = lower
|
1840
|
+
* end
|
1841
|
+
*
|
1842
|
+
* def token_stream(field, str)
|
1843
|
+
* return LetterTokenizer.new(str, @lower)
|
1844
|
+
* end
|
1845
|
+
* end
|
1846
|
+
*
|
1847
|
+
* As you can see it makes use of the LetterTokenizer.
|
1848
|
+
*/
|
1849
|
+
static void Init_LetterAnalyzer(void)
|
1850
|
+
{
|
1851
|
+
cLetterAnalyzer =
|
1852
|
+
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
1853
|
+
rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
|
1854
|
+
rb_define_method(cLetterAnalyzer, "initialize",
|
1855
|
+
frt_letter_analyzer_init, -1);
|
1856
|
+
}
|
1857
|
+
|
1858
|
+
/*
|
1859
|
+
* Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
|
1860
|
+
*
|
1861
|
+
* == Summary
|
1862
|
+
*
|
1863
|
+
* The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
|
1864
|
+
* non-whitespace characters. If implemented in Ruby the
|
1865
|
+
* AsciiWhiteSpaceAnalyzer would look like;
|
1866
|
+
*
|
1867
|
+
* class AsciiWhiteSpaceAnalyzer
|
1868
|
+
* def initialize(lower = true)
|
1869
|
+
* @lower = lower
|
1870
|
+
* end
|
1871
|
+
*
|
1872
|
+
* def token_stream(field, str)
|
1873
|
+
* if @lower
|
1874
|
+
* return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
|
1875
|
+
* else
|
1876
|
+
* return AsciiWhiteSpaceTokenizer.new(str)
|
1877
|
+
* end
|
1878
|
+
* end
|
1879
|
+
* end
|
1880
|
+
*
|
1881
|
+
* As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
|
1882
|
+
* use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
|
1883
|
+
* as "UTF-8".
|
1884
|
+
*/
|
1885
|
+
static void Init_AsciiWhiteSpaceAnalyzer(void)
|
1886
|
+
{
|
1887
|
+
cAsciiWhiteSpaceAnalyzer =
|
1888
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
|
1889
|
+
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
|
1890
|
+
rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
|
1891
|
+
frt_a_white_space_analyzer_init, -1);
|
1892
|
+
}
|
1893
|
+
|
1894
|
+
/*
|
1895
|
+
* Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
|
1896
|
+
*
|
1897
|
+
* == Summary
|
1898
|
+
*
|
1899
|
+
* The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
|
1900
|
+
* non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
|
1901
|
+
* would look like;
|
1902
|
+
*
|
1903
|
+
* class WhiteSpaceAnalyzer
|
1904
|
+
* def initialize(lower = true)
|
1905
|
+
* @lower = lower
|
1906
|
+
* end
|
1907
|
+
*
|
1908
|
+
* def token_stream(field, str)
|
1909
|
+
* return WhiteSpaceTokenizer.new(str, @lower)
|
1910
|
+
* end
|
1911
|
+
* end
|
1912
|
+
*
|
1913
|
+
* As you can see it makes use of the WhiteSpaceTokenizer.
|
1914
|
+
*/
|
1915
|
+
static void Init_WhiteSpaceAnalyzer(void)
|
1916
|
+
{
|
1917
|
+
cWhiteSpaceAnalyzer =
|
1918
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
1919
|
+
rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
|
1920
|
+
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
1921
|
+
frt_white_space_analyzer_init, -1);
|
1922
|
+
}
|
1923
|
+
|
1924
|
+
/*
|
1925
|
+
* Document-class: Ferret::Analysis::AsciiStandardAnalyzer
|
1926
|
+
*
|
1927
|
+
* == Summary
|
1928
|
+
*
|
1929
|
+
* The AsciiStandardAnalyzer is the most advanced of the available
|
1930
|
+
* ascii-analyzers. If it were implemented in Ruby it would look like this;
|
1931
|
+
*
|
1932
|
+
* class AsciiStandardAnalyzer
|
1933
|
+
* def initialize(lower = true, stop_words = ENGLISH_STOP_WORDS)
|
1934
|
+
* @lower = lower
|
1935
|
+
* @stop_words = stop_words
|
1936
|
+
* end
|
1937
|
+
*
|
1938
|
+
* def token_stream(field, str)
|
1939
|
+
* if @lower
|
1940
|
+
* return StopFilter.new(AsciiLowerCaseFilter.new(
|
1941
|
+
* AsciiStandardTokenizer.new(str)), @stop_words)
|
1942
|
+
* else
|
1943
|
+
* return StopFilter.new(AsciiStandardTokenizer.new(str), @stop_words)
|
1944
|
+
* end
|
1945
|
+
* end
|
1946
|
+
* end
|
1947
|
+
*
|
1948
|
+
* As you can see it makes use of the AsciiStandardTokenizer and you can also
|
1949
|
+
* add your own list of stop-words if you wish. Note that this tokenizer
|
1950
|
+
* won't recognize non-ascii characters so you should use the
|
1951
|
+
* StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
|
1952
|
+
*/
|
1953
|
+
static void Init_AsciiStandardAnalyzer(void)
|
1954
|
+
{
|
1955
|
+
cAsciiStandardAnalyzer =
|
1956
|
+
rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
|
1957
|
+
rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
|
1958
|
+
rb_define_method(cAsciiStandardAnalyzer, "initialize",
|
1959
|
+
frt_a_standard_analyzer_init, -1);
|
1960
|
+
}
|
1961
|
+
|
1962
|
+
/*
|
1963
|
+
* Document-class: Ferret::Analysis::StandardAnalyzer
|
1964
|
+
*
|
1965
|
+
* == Summary
|
1966
|
+
*
|
1967
|
+
* The StandardAnalyzer is the most advanced of the available analyzers. If
|
1968
|
+
* it were implemented in Ruby it would look like this;
|
1969
|
+
*
|
1970
|
+
* class StandardAnalyzer
|
1971
|
+
* def initialize(lower = true, stop_words = ENGLISH_STOP_WORDS)
|
1972
|
+
* @lower = lower
|
1973
|
+
* @stop_words = stop_words
|
1974
|
+
* end
|
1975
|
+
*
|
1976
|
+
* def token_stream(field, str)
|
1977
|
+
* return StopFilter.new(StandardTokenizer.new(str, @lower), @stop_words)
|
1978
|
+
* end
|
1979
|
+
* end
|
1980
|
+
*
|
1981
|
+
* As you can see it makes use of the StandardTokenizer and you can also add
|
1982
|
+
* your own list of stopwords if you wish.
|
1983
|
+
*/
|
1984
|
+
static void Init_StandardAnalyzer(void)
|
1985
|
+
{
|
1986
|
+
cStandardAnalyzer =
|
1987
|
+
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
1988
|
+
rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
|
1989
|
+
rb_define_method(cStandardAnalyzer, "initialize",
|
1990
|
+
frt_standard_analyzer_init, -1);
|
1991
|
+
}
|
1992
|
+
|
1993
|
+
/*
|
1994
|
+
* Document-class: Ferret::Analysis::PerFieldAnalyzer
|
1995
|
+
*
|
1996
|
+
* == Summary
|
1997
|
+
*
|
1998
|
+
* The PerFieldAnalyzer is for use when you want to analyze different fields
|
1999
|
+
* with different analyzers. With the PerFieldAnalyzer you can specify how
|
2000
|
+
* you want each field analyzed.
|
2001
|
+
*
|
2002
|
+
* === Example
|
2003
|
+
*
|
2004
|
+
* # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
|
2005
|
+
* pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
|
2006
|
+
*
|
2007
|
+
* # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
|
2008
|
+
* pfa[:title] = WhiteSpaceAnalyzer.new(false)
|
2009
|
+
*
|
2010
|
+
* # Use a custom analyzer on the :created_at field
|
2011
|
+
* pfa[:created_at] = DateAnalyzer.new
|
2012
|
+
*/
|
2013
|
+
static void Init_PerFieldAnalyzer(void)
|
2014
|
+
{
|
2015
|
+
cPerFieldAnalyzer =
|
2016
|
+
rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
|
2017
|
+
rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
|
2018
|
+
rb_define_method(cPerFieldAnalyzer, "initialize",
|
2019
|
+
frt_per_field_analyzer_init, 1);
|
2020
|
+
rb_define_method(cPerFieldAnalyzer, "add_field",
|
2021
|
+
frt_per_field_analyzer_add_field, 2);
|
2022
|
+
rb_define_method(cPerFieldAnalyzer, "[]=",
|
2023
|
+
frt_per_field_analyzer_add_field, 2);
|
2024
|
+
}
|
2025
|
+
|
2026
|
+
/*
|
2027
|
+
* Document-class: Ferret::Analysis::RegExpAnalyzer
|
2028
|
+
*
|
2029
|
+
* == Summary
|
2030
|
+
*
|
2031
|
+
* Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
|
2032
|
+
* implemented in Ruby it would look like this;
|
2033
|
+
*
|
2034
|
+
* class RegExpAnalyzer
|
2035
|
+
* def initialize(reg_exp, lower = true)
|
2036
|
+
* @lower = lower
|
2037
|
+
* @reg_exp = reg_exp
|
2038
|
+
* end
|
2039
|
+
*
|
2040
|
+
* def token_stream(field, str)
|
2041
|
+
* if @lower
|
2042
|
+
* return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
|
2043
|
+
* else
|
2044
|
+
* return RegExpTokenizer.new(str, @reg_exp)
|
2045
|
+
* end
|
2046
|
+
* end
|
2047
|
+
* end
|
2048
|
+
*
|
2049
|
+
* === Example
|
2050
|
+
*
|
2051
|
+
* csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
|
2052
|
+
*/
|
2053
|
+
static void Init_RegExpAnalyzer(void)
|
2054
|
+
{
|
2055
|
+
cRegExpAnalyzer =
|
2056
|
+
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
2057
|
+
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
2058
|
+
rb_define_method(cRegExpAnalyzer, "initialize",
|
2059
|
+
frt_re_analyzer_init, -1);
|
2060
|
+
}
|
2061
|
+
|
2062
|
+
/* rdoc hack
|
2063
|
+
extern VALUE mFerret = rb_define_module("Ferret");
|
2064
|
+
*/
|
2065
|
+
|
2066
|
+
/*
|
2067
|
+
* Document-module: Ferret::Analysis
|
2068
|
+
*
|
2069
|
+
* == Summary
|
2070
|
+
*
|
2071
|
+
* The Analysis module contains all the classes used to analyze and tokenize
|
2072
|
+
* the data to be indexed. There are three main classes you need to know
|
2073
|
+
* about when dealing with analysis; Analyzer, TokenStream and Token.
|
2074
|
+
*
|
2075
|
+
* == Classes
|
2076
|
+
*
|
2077
|
+
* === Analyzer
|
2078
|
+
*
|
2079
|
+
* Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
|
2080
|
+
* indexing class when you create it and it will create the TokenStreams
|
2081
|
+
* necessary to tokenize the fields in the documents. Most of the time you
|
2082
|
+
* won't need to worry about TokenStreams and Tokens; one of the Analyzers
|
2083
|
+
* distributed with Ferret will do exactly what you need. Otherwise you'll
|
2084
|
+
* need to implement a custom analyzer.
|
2085
|
+
*
|
2086
|
+
* === TokenStream
|
2087
|
+
*
|
2088
|
+
* A TokenStream is an enumeration of Tokens. There are two standard types of
|
2089
|
+
* TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
|
2090
|
+
* turns it into a list of Tokens. A TokenFilter takes another TokenStream
|
2091
|
+
* and post-processes the Tokens. You can chain as many TokenFilters together
|
2092
|
+
* as you like but they always need to finish with a Tokenizer.
|
2093
|
+
*
|
2094
|
+
* === Token
|
2095
|
+
*
|
2096
|
+
* A Token is a single term from a document field. A token contains the text
|
2097
|
+
* representing the term as well as the start and end offset of the token.
|
2098
|
+
* The start and end offset will represent the token as it appears in the
|
2099
|
+
* source field. Some TokenFilters may change the text in the Token but the
|
2100
|
+
* start and end offsets should stay the same so (end - start) won't
|
2101
|
+
* necessarily be equal to the length of text in the token. For example using
|
2102
|
+
* a stemming TokenFilter the term "Beginning" might have start and end
|
2103
|
+
* offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
|
2104
|
+
* might be "begin" (after stemming).
|
2105
|
+
*/
|
933
2106
|
void
|
934
|
-
|
935
|
-
{
|
936
|
-
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1011
|
-
rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
|
1012
|
-
rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
|
1013
|
-
rb_define_method(cAsciiStandardTokenizer, "initialize",
|
1014
|
-
frt_a_standard_tokenizer_init, 1);
|
1015
|
-
|
1016
|
-
/*** * * StandardTokenizer * * ***/
|
1017
|
-
cStandardTokenizer =
|
1018
|
-
rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
|
1019
|
-
rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
|
1020
|
-
rb_define_method(cStandardTokenizer, "initialize",
|
1021
|
-
frt_standard_tokenizer_init, 1);
|
1022
|
-
|
1023
|
-
/*** * * RegExpTokenizer * * ***/
|
1024
|
-
cRegExpTokenizer =
|
1025
|
-
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1026
|
-
rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
|
1027
|
-
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1028
|
-
rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
|
1029
|
-
rb_define_method(cRegExpTokenizer, "initialize",
|
1030
|
-
frt_rets_init, -1);
|
1031
|
-
rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
|
1032
|
-
rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
|
1033
|
-
rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
|
1034
|
-
|
1035
|
-
/***************/
|
1036
|
-
/*** Filters ***/
|
1037
|
-
/***************/
|
1038
|
-
rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
|
1039
|
-
get_rstopwords(ENGLISH_STOP_WORDS));
|
1040
|
-
rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
|
1041
|
-
get_rstopwords(FULL_ENGLISH_STOP_WORDS));
|
1042
|
-
rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
|
1043
|
-
get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
|
1044
|
-
rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
|
1045
|
-
get_rstopwords(FULL_FRENCH_STOP_WORDS));
|
1046
|
-
rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
|
1047
|
-
get_rstopwords(FULL_SPANISH_STOP_WORDS));
|
1048
|
-
rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
|
1049
|
-
get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
|
1050
|
-
rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
|
1051
|
-
get_rstopwords(FULL_ITALIAN_STOP_WORDS));
|
1052
|
-
rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
|
1053
|
-
get_rstopwords(FULL_GERMAN_STOP_WORDS));
|
1054
|
-
rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
|
1055
|
-
get_rstopwords(FULL_DUTCH_STOP_WORDS));
|
1056
|
-
rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
|
1057
|
-
get_rstopwords(FULL_SWEDISH_STOP_WORDS));
|
1058
|
-
rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
|
1059
|
-
get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
|
1060
|
-
rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
|
1061
|
-
get_rstopwords(FULL_DANISH_STOP_WORDS));
|
1062
|
-
rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
|
1063
|
-
get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
|
1064
|
-
rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
|
1065
|
-
get_rstopwords(FULL_FINNISH_STOP_WORDS));
|
1066
|
-
|
1067
|
-
cAsciiLowerCaseFilter =
|
1068
|
-
rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
|
1069
|
-
rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
|
1070
|
-
rb_define_method(cAsciiLowerCaseFilter, "initialize",
|
1071
|
-
frt_a_lowercase_filter_init, 1);
|
1072
|
-
|
1073
|
-
cLowerCaseFilter =
|
1074
|
-
rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
|
1075
|
-
rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
|
1076
|
-
rb_define_method(cLowerCaseFilter, "initialize",
|
1077
|
-
frt_lowercase_filter_init, 1);
|
1078
|
-
|
1079
|
-
cStopFilter =
|
1080
|
-
rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
|
1081
|
-
rb_define_alloc_func(cStopFilter, frt_data_alloc);
|
1082
|
-
rb_define_method(cStopFilter, "initialize",
|
1083
|
-
frt_stop_filter_init, -1);
|
1084
|
-
|
1085
|
-
cStemFilter =
|
1086
|
-
rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
|
1087
|
-
rb_define_alloc_func(cStemFilter, frt_data_alloc);
|
1088
|
-
rb_define_method(cStemFilter, "initialize",
|
1089
|
-
frt_stem_filter_init, -1);
|
1090
|
-
|
1091
|
-
|
1092
|
-
/*************************/
|
1093
|
-
/*** * * Analyzers * * ***/
|
1094
|
-
/*************************/
|
1095
|
-
|
1096
|
-
/*** * * Analyzer * * ***/
|
1097
|
-
cAnalyzer =
|
1098
|
-
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
1099
|
-
rb_define_alloc_func(cAnalyzer, frt_data_alloc);
|
1100
|
-
rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
|
1101
|
-
rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
|
1102
|
-
|
1103
|
-
/*** * * AsciiLetterAnalyzer * * ***/
|
1104
|
-
cAsciiLetterAnalyzer =
|
1105
|
-
rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
|
1106
|
-
rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
|
1107
|
-
rb_define_method(cAsciiLetterAnalyzer, "initialize",
|
1108
|
-
frt_a_letter_analyzer_init, -1);
|
1109
|
-
|
1110
|
-
/*** * * LetterAnalyzer * * ***/
|
1111
|
-
cLetterAnalyzer =
|
1112
|
-
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
1113
|
-
rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
|
1114
|
-
rb_define_method(cLetterAnalyzer, "initialize",
|
1115
|
-
frt_letter_analyzer_init, -1);
|
1116
|
-
|
1117
|
-
/*** * * AsciiWhiteSpaceAnalyzer * * ***/
|
1118
|
-
cAsciiWhiteSpaceAnalyzer =
|
1119
|
-
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
|
1120
|
-
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
|
1121
|
-
rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
|
1122
|
-
frt_a_white_space_analyzer_init, -1);
|
1123
|
-
|
1124
|
-
/*** * * WhiteSpaceAnalyzer * * ***/
|
1125
|
-
cWhiteSpaceAnalyzer =
|
1126
|
-
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
1127
|
-
rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
|
1128
|
-
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
1129
|
-
frt_white_space_analyzer_init, -1);
|
1130
|
-
|
1131
|
-
/*** * * AsciiStandardAnalyzer * * ***/
|
1132
|
-
cAsciiStandardAnalyzer =
|
1133
|
-
rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
|
1134
|
-
rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
|
1135
|
-
rb_define_method(cAsciiStandardAnalyzer, "initialize",
|
1136
|
-
frt_a_standard_analyzer_init, -1);
|
1137
|
-
|
1138
|
-
/*** * * StandardAnalyzer * * ***/
|
1139
|
-
cStandardAnalyzer =
|
1140
|
-
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
1141
|
-
rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
|
1142
|
-
rb_define_method(cStandardAnalyzer, "initialize",
|
1143
|
-
frt_standard_analyzer_init, -1);
|
1144
|
-
|
1145
|
-
/*** * * PerFieldAnalyzer * * ***/
|
1146
|
-
cPerFieldAnalyzer =
|
1147
|
-
rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
|
1148
|
-
rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
|
1149
|
-
rb_define_method(cPerFieldAnalyzer, "initialize",
|
1150
|
-
frt_per_field_analyzer_init, 1);
|
1151
|
-
rb_define_method(cPerFieldAnalyzer, "add_field",
|
1152
|
-
frt_per_field_analyzer_add_field, 2);
|
1153
|
-
rb_define_method(cPerFieldAnalyzer, "[]=",
|
1154
|
-
frt_per_field_analyzer_add_field, 2);
|
1155
|
-
rb_define_class_under(mAnalysis, "PerFieldAnalyzerWrapper", cPerFieldAnalyzer);
|
1156
|
-
|
1157
|
-
/*** * * RegexAnalyzer * * ***/
|
1158
|
-
cRegExpAnalyzer =
|
1159
|
-
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
1160
|
-
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
1161
|
-
rb_define_method(cRegExpAnalyzer, "initialize",
|
1162
|
-
frt_re_analyzer_init, -1);
|
1163
|
-
|
1164
|
-
/*
|
1165
|
-
cRegexAnalyzer =
|
1166
|
-
rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
|
1167
|
-
rb_define_alloc_func(cRegexAnalyzer, frt_data_alloc);
|
1168
|
-
rb_define_method(cRegexAnalyzer, "initialize",
|
1169
|
-
frt_regex_analyzer_init, 0);
|
1170
|
-
rb_define_method(cRegexAnalyzer, "token_stream",
|
1171
|
-
frt_regex_analyzer_token_stream, 2);
|
1172
|
-
rb_define_method(cRegexAnalyzer, "setlocale",
|
1173
|
-
frt_regex_analyzer_setlocale, 1);
|
1174
|
-
*/
|
2107
|
+
Init_Analysis(void)
|
2108
|
+
{
|
2109
|
+
mAnalysis = rb_define_module_under(mFerret, "Analysis");
|
2110
|
+
|
2111
|
+
/* TokenStream Methods */
|
2112
|
+
id_next = rb_intern("next");
|
2113
|
+
id_reset = rb_intern("text=");
|
2114
|
+
id_clone = rb_intern("clone");
|
2115
|
+
|
2116
|
+
/* Analyzer Methods */
|
2117
|
+
id_token_stream = rb_intern("token_stream");
|
2118
|
+
|
2119
|
+
object_space = rb_hash_new();
|
2120
|
+
rb_define_const(mFerret, "OBJECT_SPACE", object_space);
|
2121
|
+
|
2122
|
+
/*** * * Locale stuff * * ***/
|
2123
|
+
frt_locale = setlocale(LC_ALL, "");
|
2124
|
+
rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
|
2125
|
+
rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
|
2126
|
+
|
2127
|
+
rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
|
2128
|
+
get_rstopwords(ENGLISH_STOP_WORDS));
|
2129
|
+
rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
|
2130
|
+
get_rstopwords(FULL_ENGLISH_STOP_WORDS));
|
2131
|
+
rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
|
2132
|
+
get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
|
2133
|
+
rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
|
2134
|
+
get_rstopwords(FULL_FRENCH_STOP_WORDS));
|
2135
|
+
rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
|
2136
|
+
get_rstopwords(FULL_SPANISH_STOP_WORDS));
|
2137
|
+
rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
|
2138
|
+
get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
|
2139
|
+
rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
|
2140
|
+
get_rstopwords(FULL_ITALIAN_STOP_WORDS));
|
2141
|
+
rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
|
2142
|
+
get_rstopwords(FULL_GERMAN_STOP_WORDS));
|
2143
|
+
rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
|
2144
|
+
get_rstopwords(FULL_DUTCH_STOP_WORDS));
|
2145
|
+
rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
|
2146
|
+
get_rstopwords(FULL_SWEDISH_STOP_WORDS));
|
2147
|
+
rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
|
2148
|
+
get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
|
2149
|
+
rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
|
2150
|
+
get_rstopwords(FULL_DANISH_STOP_WORDS));
|
2151
|
+
rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
|
2152
|
+
get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
|
2153
|
+
rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
|
2154
|
+
get_rstopwords(FULL_FINNISH_STOP_WORDS));
|
2155
|
+
|
2156
|
+
Init_Token();
|
2157
|
+
Init_TokenStream();
|
2158
|
+
|
2159
|
+
Init_AsciiLetterTokenizer();
|
2160
|
+
Init_LetterTokenizer();
|
2161
|
+
|
2162
|
+
Init_AsciiWhiteSpaceTokenizer();
|
2163
|
+
Init_WhiteSpaceTokenizer();
|
2164
|
+
|
2165
|
+
Init_AsciiStandardTokenizer();
|
2166
|
+
Init_StandardTokenizer();
|
2167
|
+
|
2168
|
+
Init_RegExpTokenizer();
|
2169
|
+
|
2170
|
+
Init_AsciiLowerCaseFilter();
|
2171
|
+
Init_LowerCaseFilter();
|
2172
|
+
Init_StopFilter();
|
2173
|
+
Init_StemFilter();
|
2174
|
+
|
2175
|
+
Init_Analyzer();
|
2176
|
+
Init_AsciiLetterAnalyzer();
|
2177
|
+
Init_LetterAnalyzer();
|
2178
|
+
Init_AsciiWhiteSpaceAnalyzer();
|
2179
|
+
Init_WhiteSpaceAnalyzer();
|
2180
|
+
Init_AsciiStandardAnalyzer();
|
2181
|
+
Init_StandardAnalyzer();
|
2182
|
+
Init_PerFieldAnalyzer();
|
2183
|
+
Init_RegExpAnalyzer();
|
1175
2184
|
|
1176
2185
|
}
|