ferret 0.9.6 → 0.10.0
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
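The headline change in 0.10.0: the pure-Ruby core under data/lib/ferret/ is
deleted and replaced by the C extension under data/ext/ (note the new index.c,
r_index.c and r_utils.c above). The public API remains Ruby. A minimal usage
sketch against the 0.10 Index class follows; the :path value and field names
are illustrative, not taken from this diff:

    require 'ferret'

    # An index backed by the filesystem store; omit :path for an in-memory index.
    index = Ferret::Index::Index.new(:path => '/tmp/ferret_index')

    # Documents are plain hashes; field values are analyzed on the way in.
    index << {:title => 'Programming Ruby', :content => 'ferret is a port of lucene'}

    # search_each yields each matching document id with its score.
    index.search_each('content:lucene') do |doc_id, score|
      puts "#{index[doc_id][:title]} scored #{score}"
    end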
data/ext/w32_io.c
DELETED
@@ -1,150 +0,0 @@
-#ifdef WIN32
-
-#include "global.h"
-#include "store.h"
-#include <stdio.h>
-#include <io.h>
-#include <errno.h>
-#include <string.h>
-
-/**
- * Create a filepath for a file in the store using the operating systems
- * default file seperator.
- */
-char *join_path(char *buf, const char *base, const char *filename)
-{
-    sprintf(buf, "%s\\%s", base, filename);
-    return buf;
-}
-
-bool exists(char *path)
-{
-    int fd = _open(path, 0);
-    if (fd < 0) {
-        if (errno != ENOENT) {
-            RAISE(IO_ERROR, strerror(errno));
-        }
-        return false;
-    }
-    _close(fd);
-    return true;
-}
-
-int fcount(char *path)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    int cnt = 0;
-
-    join_path(buf, path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    do {
-        if (fd.name[0] != '.') {
-            cnt++;
-        }
-    } while (_findnext(d, &fd) == 0);
-    _findclose(d);
-
-    return cnt;
-}
-
-void dir_each(char *path, void (*func)(char *fname, void *arg), void *arg)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
-            func(fd.name, arg);
-        }
-    }
-    _findclose(d);
-}
-
-/**
- * Clear all the locks in the store.
- *
- * @param store the store to clear the locks from
- * @throws IO_ERROR if there is an error opening the directory
- */
-void fs_clear_locks(Store *store)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, store->dir.path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (file_is_lock(fd.name)) {
-            remove(join_path(buf, store->dir.path, fd.name));
-        }
-    }
-    _findclose(d);
-}
-
-/**
- * Clear all files from the store except the lock files.
- *
- * @param store the store to clear all the files from
- * @throws IO_ERROR if there is an error deleting the files
- */
-void fs_clear(Store *store)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, store->dir.path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
-            remove(join_path(buf, store->dir.path, fd.name));
-        }
-    }
-    _findclose(d);
-}
-
-/**
- * Clear all files from the store including the lock files.
- *
- * @param store the store to clear all the files from
- * @throws IO_ERROR if there is an error deleting the files
- */
-void fs_clear_all(Store *store)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, store->dir.path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (fd.name[0] != '.') {
-            remove(join_path(buf, store->dir.path, fd.name));
-        }
-    }
-    _findclose(d);
-}
-
-#endif
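Every helper above follows the same _findfirst/_findnext/_findclose walk over
a directory, skipping dot-files and lock files (the loops discard the first
_findfirst entry, which is normally "."). A hedged Ruby sketch of the fs_clear
behaviour using the portable stdlib; lock_file? is a hypothetical stand-in for
the C file_is_lock():

    # Hypothetical stand-in for file_is_lock(); the real lock-file naming
    # convention is defined elsewhere in the store code.
    def lock_file?(fname)
      fname.end_with?('.lck')
    end

    # Mirrors fs_clear above: delete every entry except dot-files and lock files.
    def fs_clear(path)
      Dir.foreach(path) do |fname|
        next if fname.start_with?('.') || lock_file?(fname)
        File.delete(File.join(path, fname))
      end
    end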
data/lib/ferret/analysis.rb
DELETED
@@ -1,11 +0,0 @@
-# Documentation for Analysis
-module Ferret::Analysis
-end
-
-require 'ferret/analysis/token'
-require 'ferret/analysis/token_stream'
-require 'ferret/analysis/tokenizers'
-require 'ferret/analysis/standard_tokenizer'
-require 'ferret/analysis/token_filters'
-require 'ferret/analysis/word_list_loader'
-require 'ferret/analysis/analyzers'
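With this require chain gone, Ferret::Analysis is populated directly by the C
extension (see r_analysis.c, +1720 lines, above), so a single require suffices.
A short sketch using the class names from this diff; treat the exact 0.10
signatures as illustrative:

    require 'ferret'

    analyzer = Ferret::Analysis::WhiteSpaceAnalyzer.new
    stream = analyzer.token_stream(:content, 'Ferret 0.10.0 moves analysis into C')

    # TokenStream#next returns the next Token, or nil at the end of the stream.
    while token = stream.next
      puts token
    end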
data/lib/ferret/analysis/analyzers.rb
DELETED
@@ -1,112 +0,0 @@
-module Ferret::Analysis
-  # An Analyzer builds TokenStreams, which analyze text. It thus represents
-  # a policy for extracting index terms from text.
-  #
-  # Typical implementations first build a Tokenizer, which breaks the stream
-  # of characters from the Reader into raw Tokens. One or more TokenFilter s
-  # may then be applied to the output of the Tokenizer.
-  #
-  # The default Analyzer just creates a LowerCaseTokenizer which converts
-  # all text to lowercase tokens. See LowerCaseTokenizer for more details.
-  class Analyzer
-    # Creates a TokenStream which tokenizes all the text in the provided
-    # Reader. Override to allow Analyzer to choose strategy based on
-    # document and/or field.
-    # string:: the string representing the text in the field
-    # field:: name of the field. Not required.
-    def token_stream(field, string)
-      return LowerCaseTokenizer.new(string)
-    end
-
-    # Invoked before indexing a Field instance if
-    # terms have already been added to that field. This allows custom
-    # analyzers to place an automatic position increment gap between
-    # Field instances using the same field name. The default value
-    # position increment gap is 0. With a 0 position increment gap and
-    # the typical default token position increment of 1, all terms in a field,
-    # including across Field instances, are in successive positions, allowing
-    # exact PhraseQuery matches, for instance, across Field instance boundaries.
-    #
-    # field_name:: Field name being indexed.
-    # pos_inc_gap:: added to the next token emitted from
-    #               #token_stream(String,Reader)
-    #
-    def pos_inc_gap(field_name)
-      return 0
-    end
-
-  end
-
-  # An Analyzer that uses WhiteSpaceTokenizer.
-  class WhiteSpaceAnalyzer < Analyzer
-    def token_stream(field, string)
-      return WhiteSpaceTokenizer.new(string)
-    end
-  end
-
-  # Filters LetterTokenizer with LowerCaseFilter and StopFilter.
-  class StopAnalyzer < Analyzer
-
-    # An array containing some common English words that are not usually useful
-    # for searching.
-    ENGLISH_STOP_WORDS = [
-      "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
-      "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
-      "t", "that", "the", "their", "then", "there", "these",
-      "they", "this", "to", "was", "will", "with"
-    ]
-
-    # Builds an analyzer which removes words in the provided array.
-    def initialize(stop_words = ENGLISH_STOP_WORDS)
-      @stop_words = stop_words
-    end
-
-    # Filters LowerCaseTokenizer with StopFilter.
-    def token_stream(field, string)
-      return StopFilter.new(LowerCaseTokenizer.new(string), @stop_words)
-    end
-  end
-
-  # An Analyzer that filters LetterTokenizer with LowerCaseFilter.
-  # This analyzer subclasses the StopAnalyzer so you can add your own
-  # stoplist the same way. See StopAnalyzer.
-  class StandardAnalyzer < StopAnalyzer
-    def token_stream(field, string)
-      return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
-    end
-  end
-
-
-  # This analyzer is used to facilitate scenarios where different
-  # fields require different analysis techniques. Use #add_analyzer
-  # to add a non-default analyzer on a field name basis.
-  # See tc_per_field_analyzer_wrapper for example usage.
-  class PerFieldAnalyzerWrapper < Analyzer
-
-    # Constructs with default analyzer.
-    #
-    # default_analyzer:: Any fields not specifically defined to use a
-    #                    different analyzer will use the one provided here.
-    def initialize(default_analyzer)
-      @default_analyzer = default_analyzer
-      @analyzers = {}
-    end
-
-    # Defines an analyzer to use for the specified field.
-    #
-    # field:: field name requiring a non-default analyzer.
-    # analyzer:: non-default analyzer to use for field
-    def add_analyzer(field, analyzer)
-      @analyzers[field] = analyzer
-    end
-
-    def token_stream(field, string)
-      analyzer = @analyzers[field]
-      if (analyzer == nil)
-        analyzer = @default_analyzer
-      end
-
-      return analyzer.token_stream(field, string)
-    end
-  end
-end
data/lib/ferret/analysis/standard_tokenizer.rb
DELETED
@@ -1,71 +0,0 @@
-if __FILE__ == $0
-  module Ferret
-  end
-  $:.unshift File.dirname(__FILE__)
-  require 'token_stream'
-  require 'tokenizers'
-  require 'token'
-end
-
-module Ferret::Analysis
-  # The standard tokenizer is an advanced tokenizer which tokenizes morst
-  # words correctly as well as tokenizing things like email addresses, web
-  # addresses, phone numbers, etc.
-
-  class StandardTokenizer < RegExpTokenizer
-    ALPHA = /[[:alpha:]_-]+/
-    APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
-    ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
-    P = /[_\/.,-]/
-    HASDIGIT = /\w*\d\w*/
-    TOKEN_RE = /#{ALPHA}+(('#{ALPHA}+)+
-                |\.(#{ALPHA}\.)+
-                |(@|\&)\w+([-.]\w+)*
-                |:\/\/\w+([-.\/]\w+)*
-                )
-              |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
-                |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
-                |(\.\w+)+
-                |
-                )
-              /x
-
-    ACRONYM_WORD = /^#{ACRONYM}$/
-    APOSTROPHE_WORD = /^#{APOSTROPHE}$/
-    DOT = /\./
-    APOSTROPHE_S = /'[sS]$/
-    protected
-
-      # Collects only characters which are not spaces tabs or carraige returns
-      def token_re()
-        #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
-        # This is a simplified version of the original Lucene standard
-        # tokenizer. I think it works better. I hope so anyway. Any way to
-        # do this more neatly?
-        TOKEN_RE
-      end
-
-      # stem the 's and remove the '.'s from acronyms
-      def normalize(str)
-        if str =~ ACRONYM_WORD
-          str.gsub!(DOT, '')
-        elsif str =~ APOSTROPHE_WORD
-          str.gsub!(APOSTROPHE_S, '')
-        end
-        str
-      end
-  end
-end
-
-# Add this so we can play around with the standard tokenizer
-if __FILE__ == $0
-  st = "\033[7m"
-  en = "\033[m"
-
-  $stdin.each do |line|
-    stk = Ferret::Analysis::StandardTokenizer.new(line)
-    while tk = stk.next()
-      puts " <" + tk.text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
-    end
-  end
-end
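The normalize step above is the tokenizer's only post-processing: acronyms
lose their dots and a possessive 's is stripped. A tiny illustration
(normalize is protected, hence send; expected values follow the regexps in
this file):

    tok = Ferret::Analysis::StandardTokenizer.new("")
    tok.send(:normalize, "I.B.M.")  # => "IBM"
    tok.send(:normalize, "dave's")  # => "dave"
    tok.send(:normalize, "ferret")  # => "ferret" (unchanged)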
data/lib/ferret/analysis/token.rb
DELETED
@@ -1,100 +0,0 @@
-module Ferret::Analysis
-  # A Token is an occurence of a term from the text of a field. It consists
-  # of a term's text, the start and end offset of the term in the text of the
-  # field, and a type string.
-  #
-  # The start and end offsets permit applications to re-associate a token with
-  # its source text, e.g., to display highlighted query terms in a document
-  # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
-  # display, etc.
-  #
-  # The type is an interned string, assigned by a lexical analyzer (a.k.a.
-  # tokenizer), naming the lexical or syntactic class that the token belongs
-  # to. For example an end of sentence marker token might be implemented with
-  # type "eos". The default token type is "word".
-  #
-  # start_offset:: is the position of the first character corresponding to
-  #                this token in the source text
-  # end_offset:: is equal to one greater than the position of the last
-  #              character corresponding of this token Note that the
-  #              difference between @end_offset and @start_offset may not be
-  #              equal to @text.length(), as the term text may have been
-  #              altered by a stemmer or some other filter.
-  class Token
-    include Comparable
-    attr_accessor :text
-    attr_reader :pos_inc, :start_offset, :end_offset, :type
-
-    # Constructs a Token with the given term text, and start & end offsets.
-    # The type defaults to "word."
-    def initialize(txt, so, eo, pos_inc=1, typ="word")
-      @text = txt
-      @start_offset = so
-      @end_offset = eo
-      @type = typ # lexical type
-      @pos_inc = pos_inc
-    end
-
-    def set!(txt, so, eo)
-      @text = txt
-      @start_offset = so
-      @end_offset = eo
-      self
-    end
-
-    def eql?(o)
-      return (o.instance_of?(Token) and @start_offset == o.start_offset and
-        @end_offset == o.end_offset and @text == o.text)
-    end
-    alias :== :eql?
-
-    # Tokens are sorted by the position in the text at which they occur, ie
-    # the start_offset. If two tokens have the same start offset, (see
-    # pos_inc=) then, they are sorted by the end_offset and then
-    # lexically by the token text.
-    def <=>(o)
-      r = @start_offset <=> o.start_offset
-      return r if r != 0
-      r = @end_offset <=> o.end_offset
-      return r if r != 0
-      r = @text <=> o.text
-      return r
-    end
-
-    # Set the position increment. This determines the position of this token
-    # relative to the previous Token in a TokenStream, used in phrase
-    # searching.
-    #
-    # The default value is one.
-    #
-    # Some common uses for this are:
-    #
-    # * Set it to zero to put multiple terms in the same position. This is
-    #   useful if, e.g., a word has multiple stems. Searches for phrases
-    #   including either stem will match. In this case, all but the first
-    #   stem's increment should be set to zero: the increment of the first
-    #   instance should be one. Repeating a token with an increment of zero
-    #   can also be used to boost the scores of matches on that token.
-    #
-    # * Set it to values greater than one to inhibit exact phrase matches.
-    #   If, for example, one does not want phrases to match across removed
-    #   stop words, then one could build a stop word filter that removes stop
-    #   words and also sets the increment to the number of stop words removed
-    #   before each non-stop word. Then exact phrase queries will only match
-    #   when the terms occur with no intervening stop words.
-    def pos_inc=(pos_inc)
-      if (pos_inc < 0)
-        raise ArgumentError, "Increment must be zero or greater: " + pos_inc
-      end
-      @pos_inc = pos_inc
-    end
-
-    # Returns a string representation of the token with all the attributes.
-    def to_s
-      buf = "#{text}:#{start_offset}->#{end_offset}"
-      buf << "(pos_inc=#{@pos_inc})" if (@pos_inc != 1)
-      buf << "(type=#{@type})" if (@type != "word")
-      buf
-    end
-  end
-end
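The pos_inc semantics documented above are easiest to see with two tokens at
the same offsets, e.g. a synonym injected at the same position; a sketch using
the removed class's own constructor and to_s:

    # "quick" at offsets 4..9, default pos_inc of 1.
    t1 = Ferret::Analysis::Token.new("quick", 4, 9)
    # "fast" occupies the same position (pos_inc = 0), so a phrase query
    # containing either word can match at this slot.
    t2 = Ferret::Analysis::Token.new("fast", 4, 9, 0)

    puts t1  # => quick:4->9
    puts t2  # => fast:4->9(pos_inc=0)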