ferret 0.9.6 → 0.10.0
This diff compares the contents of publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/w32_io.c
DELETED
@@ -1,150 +0,0 @@
-#ifdef WIN32
-
-#include "global.h"
-#include "store.h"
-#include <stdio.h>
-#include <io.h>
-#include <errno.h>
-#include <string.h>
-
-/**
- * Create a filepath for a file in the store using the operating systems
- * default file seperator.
- */
-char *join_path(char *buf, const char *base, const char *filename)
-{
-    sprintf(buf, "%s\\%s", base, filename);
-    return buf;
-}
-
-bool exists(char *path)
-{
-    int fd = _open(path, 0);
-    if (fd < 0) {
-        if (errno != ENOENT) {
-            RAISE(IO_ERROR, strerror(errno));
-        }
-        return false;
-    }
-    _close(fd);
-    return true;
-}
-
-int fcount(char *path)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    int cnt = 0;
-
-    join_path(buf, path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    do {
-        if (fd.name[0] != '.') {
-            cnt++;
-        }
-    } while (_findnext(d, &fd) == 0);
-    _findclose(d);
-
-    return cnt;
-}
-
-void dir_each(char *path, void (*func)(char *fname, void *arg), void *arg)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
-            func(fd.name, arg);
-        }
-    }
-    _findclose(d);
-}
-
-/**
- * Clear all the locks in the store.
- *
- * @param store the store to clear the locks from
- * @throws IO_ERROR if there is an error opening the directory
- */
-void fs_clear_locks(Store *store)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, store->dir.path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (file_is_lock(fd.name)) {
-            remove(join_path(buf, store->dir.path, fd.name));
-        }
-    }
-    _findclose(d);
-}
-
-/**
- * Clear all files from the store except the lock files.
- *
- * @param store the store to clear all the files from
- * @throws IO_ERROR if there is an error deleting the files
- */
-void fs_clear(Store *store)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, store->dir.path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
-            remove(join_path(buf, store->dir.path, fd.name));
-        }
-    }
-    _findclose(d);
-}
-
-/**
- * Clear all files from the store including the lock files.
- *
- * @param store the store to clear all the files from
- * @throws IO_ERROR if there is an error deleting the files
- */
-void fs_clear_all(Store *store)
-{
-    char buf[MAX_FILE_PATH];
-    struct _finddata_t fd;
-    intptr_t d;
-    join_path(buf, store->dir.path, "*");
-
-    if ((d = _findfirst(buf, &fd)) < 0) {
-        RAISE(IO_ERROR, strerror(errno));
-    }
-
-    while (_findnext(d, &fd) == 0) {
-        if (fd.name[0] != '.') {
-            remove(join_path(buf, store->dir.path, fd.name));
-        }
-    }
-    _findclose(d);
-}
-
-#endif
data/lib/ferret/analysis.rb
DELETED
@@ -1,11 +0,0 @@
-# Documentation for Analysis
-module Ferret::Analysis
-end
-
-require 'ferret/analysis/token'
-require 'ferret/analysis/token_stream'
-require 'ferret/analysis/tokenizers'
-require 'ferret/analysis/standard_tokenizer'
-require 'ferret/analysis/token_filters'
-require 'ferret/analysis/word_list_loader'
-require 'ferret/analysis/analyzers'
data/lib/ferret/analysis/analyzers.rb
DELETED
@@ -1,112 +0,0 @@
-module Ferret::Analysis
-  # An Analyzer builds TokenStreams, which analyze text. It thus represents
-  # a policy for extracting index terms from text.
-  #
-  # Typical implementations first build a Tokenizer, which breaks the stream
-  # of characters from the Reader into raw Tokens. One or more TokenFilter s
-  # may then be applied to the output of the Tokenizer.
-  #
-  # The default Analyzer just creates a LowerCaseTokenizer which converts
-  # all text to lowercase tokens. See LowerCaseTokenizer for more details.
-  class Analyzer
-    # Creates a TokenStream which tokenizes all the text in the provided
-    # Reader. Override to allow Analyzer to choose strategy based on
-    # document and/or field.
-    # string:: the string representing the text in the field
-    # field:: name of the field. Not required.
-    def token_stream(field, string)
-      return LowerCaseTokenizer.new(string)
-    end
-
-    # Invoked before indexing a Field instance if
-    # terms have already been added to that field. This allows custom
-    # analyzers to place an automatic position increment gap between
-    # Field instances using the same field name. The default value
-    # position increment gap is 0. With a 0 position increment gap and
-    # the typical default token position increment of 1, all terms in a field,
-    # including across Field instances, are in successive positions, allowing
-    # exact PhraseQuery matches, for instance, across Field instance boundaries.
-    #
-    # field_name:: Field name being indexed.
-    # pos_inc_gap:: added to the next token emitted from
-    #               #token_stream(String,Reader)
-    #
-    def pos_inc_gap(field_name)
-      return 0
-    end
-
-  end
-
-  # An Analyzer that uses WhiteSpaceTokenizer.
-  class WhiteSpaceAnalyzer < Analyzer
-    def token_stream(field, string)
-      return WhiteSpaceTokenizer.new(string)
-    end
-  end
-
-  # Filters LetterTokenizer with LowerCaseFilter and StopFilter.
-  class StopAnalyzer < Analyzer
-
-    # An array containing some common English words that are not usually useful
-    # for searching.
-    ENGLISH_STOP_WORDS = [
-      "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
-      "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
-      "t", "that", "the", "their", "then", "there", "these",
-      "they", "this", "to", "was", "will", "with"
-    ]
-
-    # Builds an analyzer which removes words in the provided array.
-    def initialize(stop_words = ENGLISH_STOP_WORDS)
-      @stop_words = stop_words
-    end
-
-    # Filters LowerCaseTokenizer with StopFilter.
-    def token_stream(field, string)
-      return StopFilter.new(LowerCaseTokenizer.new(string), @stop_words)
-    end
-  end
-
-  # An Analyzer that filters LetterTokenizer with LowerCaseFilter.
-  # This analyzer subclasses the StopAnalyzer so you can add your own
-  # stoplist the same way. See StopAnalyzer.
-  class StandardAnalyzer < StopAnalyzer
-    def token_stream(field, string)
-      return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
-    end
-  end
-
-
-  # This analyzer is used to facilitate scenarios where different
-  # fields require different analysis techniques. Use #add_analyzer
-  # to add a non-default analyzer on a field name basis.
-  # See tc_per_field_analyzer_wrapper for example usage.
-  class PerFieldAnalyzerWrapper < Analyzer
-
-    # Constructs with default analyzer.
-    #
-    # default_analyzer:: Any fields not specifically defined to use a
-    #                    different analyzer will use the one provided here.
-    def initialize(default_analyzer)
-      @default_analyzer = default_analyzer
-      @analyzers = {}
-    end
-
-    # Defines an analyzer to use for the specified field.
-    #
-    # field:: field name requiring a non-default analyzer.
-    # analyzer:: non-default analyzer to use for field
-    def add_analyzer(field, analyzer)
-      @analyzers[field] = analyzer
-    end
-
-    def token_stream(field, string)
-      analyzer = @analyzers[field]
-      if (analyzer == nil)
-        analyzer = @default_analyzer
-      end
-
-      return analyzer.token_stream(field, string)
-    end
-  end
-end
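
Note: in 0.10.0 these analyzers are reimplemented in the C extension (see r_analysis.c in the file list above). As a rough sketch of how the deleted PerFieldAnalyzerWrapper API was used against 0.9.x — the field names and input text here are invented for illustration:

    require 'ferret'
    include Ferret::Analysis

    # Fields without a registered analyzer fall back to the default.
    analyzer = PerFieldAnalyzerWrapper.new(StandardAnalyzer.new)
    analyzer.add_analyzer("tags", WhiteSpaceAnalyzer.new)

    # "tags" is split on whitespace only, so case and hyphens survive.
    ts = analyzer.token_stream("tags", "Ruby C-Extension")
    while token = ts.next
      puts token.text    # => "Ruby", then "C-Extension"
    end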
data/lib/ferret/analysis/standard_tokenizer.rb
DELETED
@@ -1,71 +0,0 @@
-if __FILE__ == $0
-  module Ferret
-  end
-  $:.unshift File.dirname(__FILE__)
-  require 'token_stream'
-  require 'tokenizers'
-  require 'token'
-end
-
-module Ferret::Analysis
-  # The standard tokenizer is an advanced tokenizer which tokenizes morst
-  # words correctly as well as tokenizing things like email addresses, web
-  # addresses, phone numbers, etc.
-
-  class StandardTokenizer < RegExpTokenizer
-    ALPHA      = /[[:alpha:]_-]+/
-    APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
-    ACRONYM    = /#{ALPHA}\.(#{ALPHA}\.)+/
-    P          = /[_\/.,-]/
-    HASDIGIT   = /\w*\d\w*/
-    TOKEN_RE   = /#{ALPHA}+(('#{ALPHA}+)+
-                   |\.(#{ALPHA}\.)+
-                   |(@|\&)\w+([-.]\w+)*
-                   |:\/\/\w+([-.\/]\w+)*
-                 )
-                |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
-                   |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
-                   |(\.\w+)+
-                   |
-                 )
-               /x
-
-    ACRONYM_WORD    = /^#{ACRONYM}$/
-    APOSTROPHE_WORD = /^#{APOSTROPHE}$/
-    DOT             = /\./
-    APOSTROPHE_S    = /'[sS]$/
-    protected
-
-      # Collects only characters which are not spaces tabs or carraige returns
-      def token_re()
-        #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
-        # This is a simplified version of the original Lucene standard
-        # tokenizer. I think it works better. I hope so anyway. Any way to
-        # do this more neatly?
-        TOKEN_RE
-      end
-
-      # stem the 's and remove the '.'s from acronyms
-      def normalize(str)
-        if str =~ ACRONYM_WORD
-          str.gsub!(DOT, '')
-        elsif str =~ APOSTROPHE_WORD
-          str.gsub!(APOSTROPHE_S, '')
-        end
-        str
-      end
-  end
-end
-
-# Add this so we can play around with the standard tokenizer
-if __FILE__ == $0
-  st = "\033[7m"
-  en = "\033[m"
-
-  $stdin.each do |line|
-    stk = Ferret::Analysis::StandardTokenizer.new(line)
-    while tk = stk.next()
-      puts " <" + tk.text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
-    end
-  end
-end
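
Note: the normalize step above strips the dots from acronyms and a trailing 's from apostrophe words after the regexp match. A small sketch against the deleted 0.9.x class (input text invented; the expected output follows from the regexps above):

    require 'ferret'
    include Ferret::Analysis

    stk = StandardTokenizer.new("I.B.M. bought O'Reilly's books")
    while tk = stk.next
      puts tk.text   # => "IBM", "bought", "O'Reilly", "books"
    end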
data/lib/ferret/analysis/token.rb
DELETED
@@ -1,100 +0,0 @@
-module Ferret::Analysis
-  # A Token is an occurence of a term from the text of a field. It consists
-  # of a term's text, the start and end offset of the term in the text of the
-  # field, and a type string.
-  #
-  # The start and end offsets permit applications to re-associate a token with
-  # its source text, e.g., to display highlighted query terms in a document
-  # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
-  # display, etc.
-  #
-  # The type is an interned string, assigned by a lexical analyzer (a.k.a.
-  # tokenizer), naming the lexical or syntactic class that the token belongs
-  # to. For example an end of sentence marker token might be implemented with
-  # type "eos". The default token type is "word".
-  #
-  # start_offset:: is the position of the first character corresponding to
-  #                this token in the source text
-  # end_offset:: is equal to one greater than the position of the last
-  #              character corresponding of this token Note that the
-  #              difference between @end_offset and @start_offset may not be
-  #              equal to @text.length(), as the term text may have been
-  #              altered by a stemmer or some other filter.
-  class Token
-    include Comparable
-    attr_accessor :text
-    attr_reader :pos_inc, :start_offset, :end_offset, :type
-
-    # Constructs a Token with the given term text, and start & end offsets.
-    # The type defaults to "word."
-    def initialize(txt, so, eo, pos_inc=1, typ="word")
-      @text = txt
-      @start_offset = so
-      @end_offset = eo
-      @type = typ # lexical type
-      @pos_inc = pos_inc
-    end
-
-    def set!(txt, so, eo)
-      @text = txt
-      @start_offset = so
-      @end_offset = eo
-      self
-    end
-
-    def eql?(o)
-      return (o.instance_of?(Token) and @start_offset == o.start_offset and
-        @end_offset == o.end_offset and @text == o.text)
-    end
-    alias :== :eql?
-
-    # Tokens are sorted by the position in the text at which they occur, ie
-    # the start_offset. If two tokens have the same start offset, (see
-    # pos_inc=) then, they are sorted by the end_offset and then
-    # lexically by the token text.
-    def <=>(o)
-      r = @start_offset <=> o.start_offset
-      return r if r != 0
-      r = @end_offset <=> o.end_offset
-      return r if r != 0
-      r = @text <=> o.text
-      return r
-    end
-
-    # Set the position increment. This determines the position of this token
-    # relative to the previous Token in a TokenStream, used in phrase
-    # searching.
-    #
-    # The default value is one.
-    #
-    # Some common uses for this are:
-    #
-    # * Set it to zero to put multiple terms in the same position. This is
-    #   useful if, e.g., a word has multiple stems. Searches for phrases
-    #   including either stem will match. In this case, all but the first
-    #   stem's increment should be set to zero: the increment of the first
-    #   instance should be one. Repeating a token with an increment of zero
-    #   can also be used to boost the scores of matches on that token.
-    #
-    # * Set it to values greater than one to inhibit exact phrase matches.
-    #   If, for example, one does not want phrases to match across removed
-    #   stop words, then one could build a stop word filter that removes stop
-    #   words and also sets the increment to the number of stop words removed
-    #   before each non-stop word. Then exact phrase queries will only match
-    #   when the terms occur with no intervening stop words.
-    def pos_inc=(pos_inc)
-      if (pos_inc < 0)
-        raise ArgumentError, "Increment must be zero or greater: " + pos_inc
-      end
-      @pos_inc = pos_inc
-    end
-
-    # Returns a string representation of the token with all the attributes.
-    def to_s
-      buf = "#{text}:#{start_offset}->#{end_offset}"
-      buf << "(pos_inc=#{@pos_inc})" if (@pos_inc != 1)
-      buf << "(type=#{@type})" if (@type != "word")
-      buf
-    end
-  end
-end
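
Note: the pos_inc= rules documented above are easiest to see with two tokens covering the same source span, e.g. a term plus an injected synonym. A minimal sketch using the deleted 0.9.x class (the offsets are invented for illustration):

    require 'ferret'
    include Ferret::Analysis

    # "quick" occupies offsets 4..9 of the source text; "fast" is a
    # synonym stacked on the same position with a zero increment.
    original = Token.new("quick", 4, 9)   # pos_inc defaults to 1
    synonym  = Token.new("fast", 4, 9)
    synonym.pos_inc = 0

    puts original.to_s   # => quick:4->9
    puts synonym.to_s    # => fast:4->9(pos_inc=0)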