ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_fuzzy.c
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
|
+
#include "helper.h"
|
3
4
|
|
4
5
|
/****************************************************************************
|
5
6
|
*
|
@@ -10,101 +11,106 @@
|
|
10
11
|
*
|
11
12
|
****************************************************************************/
|
12
13
|
|
13
|
-
|
14
|
-
int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
|
14
|
+
static inline int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
|
15
15
|
{
|
16
|
-
|
16
|
+
return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
|
17
17
|
}
|
18
18
|
|
19
|
-
void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
|
19
|
+
static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
|
20
20
|
{
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
21
|
+
int i;
|
22
|
+
for (i = 0; i < TYPICAL_LONGEST_WORD; i++) {
|
23
|
+
fuzq->max_distances[i] = fuzq_calculate_max_distance(fuzq, i);
|
24
|
+
}
|
25
25
|
}
|
26
26
|
|
27
|
-
int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
|
27
|
+
static inline int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
|
28
28
|
{
|
29
|
-
|
30
|
-
|
29
|
+
return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
|
30
|
+
: fuzq_calculate_max_distance(fuzq, m);
|
31
31
|
}
|
32
32
|
|
33
|
-
|
33
|
+
/**
|
34
|
+
* The following algorithm is taken from Bob Carpenter's FuzzyTermEnum
|
35
|
+
* implementation here:
|
36
|
+
*
|
37
|
+
* http://mail-archives.apache.org/mod_mbox/lucene-java-dev/200606.mbox/%3c448F0E8C.3050901@alias-i.com%3e
|
38
|
+
*/
|
39
|
+
float fuzq_score(FuzzyQuery *fuzq, const char *target)
|
34
40
|
{
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
}
|
46
|
-
if (m == 0) {
|
47
|
-
return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
|
48
|
-
}
|
49
|
-
|
50
|
-
max_distance = fuzq_get_max_distance(fuzq, m);
|
51
|
-
|
52
|
-
//printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n, fuzq->text, target);
|
53
|
-
if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
|
54
|
-
/* Just adding the characters of m to n or vice-versa results in too many
|
55
|
-
* edits for example "pre" length is 3 and "prefixes" length is 8. We can
|
56
|
-
* see that given this optimal circumstance, the edit distance cannot be
|
57
|
-
* less than 5 which is 8-3 or more precisely Math.abs(3-8). If our
|
58
|
-
* maximum edit distance is 4, then we can discard this word without
|
59
|
-
* looking at it. */
|
60
|
-
return 0.0f;
|
61
|
-
}
|
62
|
-
|
63
|
-
/* Let's make sure we have enough room in our array to do the distance
|
64
|
-
* calculations. */
|
65
|
-
if (((m+1) * (n+1)) >= fuzq->da_capa) {
|
66
|
-
fuzq->da_capa = ((m+1) * (n+1)) * 2;
|
67
|
-
REALLOC_N(fuzq->da, int, fuzq->da_capa);
|
68
|
-
d = fuzq->da;
|
69
|
-
}
|
70
|
-
|
71
|
-
/* init matrix d */
|
72
|
-
for (i = 0; i <= n; i++) d[i + n * 0] = i;
|
73
|
-
for (j = 0; j <= m; j++) d[0 + n * j] = j;
|
74
|
-
|
75
|
-
/* start computing edit distance */
|
76
|
-
for (i = 1; i <= n; i++) {
|
77
|
-
int best_pos_ed_dist = m;
|
78
|
-
char s_i = text[i - 1];
|
79
|
-
for (j = 1; j <= m; j++) {
|
80
|
-
if (s_i != target[j-1]) {
|
81
|
-
d[i + n*j] = min3(d[i-1 + n*j], d[i + n*(j-1)], d[i-1 + n*(j-1)])+1;
|
82
|
-
} else {
|
83
|
-
d[i + n*j] = min3(d[i-1 + n*j]+1, d[i + n*(j-1)]+1, d[i-1 + n*(j-1)]);
|
84
|
-
}
|
85
|
-
best_pos_ed_dist = min2(best_pos_ed_dist, d[i + n*j]);
|
41
|
+
const int m = (int)strlen(target);
|
42
|
+
const int n = fuzq->text_len;
|
43
|
+
|
44
|
+
if (n == 0) {
|
45
|
+
/* we don't have anything to compare. That means if we just add
|
46
|
+
* the letters for m we get the new word */
|
47
|
+
return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) m / fuzq->pre_len);
|
48
|
+
}
|
49
|
+
else if (m == 0) {
|
50
|
+
return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
|
86
51
|
}
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
52
|
+
else {
|
53
|
+
int i, j, prune;
|
54
|
+
int *d_curr, *d_prev;
|
55
|
+
const char *text = fuzq->text;
|
56
|
+
const int max_distance = fuzq_get_max_distance(fuzq, m);
|
57
|
+
|
58
|
+
/*
|
59
|
+
printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n,
|
60
|
+
fuzq->text, target);
|
61
|
+
*/
|
62
|
+
if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
|
63
|
+
/* Just adding the characters of m to n or vice-versa results in
|
64
|
+
* too many edits for example "pre" length is 3 and "prefixes"
|
65
|
+
* length is 8. We can see that given this optimal circumstance,
|
66
|
+
* the edit distance cannot be less than 5 which is 8-3 or more
|
67
|
+
* precisely Math.abs(3-8). If our maximum edit distance is 4,
|
68
|
+
* then we can discard this word without looking at it. */
|
69
|
+
return 0.0f;
|
70
|
+
}
|
71
|
+
|
72
|
+
d_curr = fuzq->da;
|
73
|
+
d_prev = d_curr + n + 1;
|
74
|
+
|
75
|
+
/* init array */
|
76
|
+
for (j = 0; j <= n; j++) {
|
77
|
+
d_curr[j] = j;
|
78
|
+
}
|
79
|
+
|
80
|
+
/* start computing edit distance */
|
81
|
+
for (i = 0; i < m;) {
|
82
|
+
char s_i = target[i];
|
83
|
+
/* swap d_current into d_prev */
|
84
|
+
int *d_tmp = d_prev;
|
85
|
+
d_prev = d_curr;
|
86
|
+
d_curr = d_tmp;
|
87
|
+
prune = (d_curr[0] = ++i) > max_distance;
|
88
|
+
|
89
|
+
for (j = 0; j < n; j++) {
|
90
|
+
d_curr[j + 1] = (s_i == text[j])
|
91
|
+
? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
|
92
|
+
: min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
|
93
|
+
if (prune && d_curr[j + 1] <= max_distance) {
|
94
|
+
prune = false;
|
95
|
+
}
|
96
|
+
}
|
97
|
+
if (prune) {
|
98
|
+
return 0.0f;
|
99
|
+
}
|
100
|
+
}
|
101
|
+
|
102
|
+
/*
|
103
|
+
printf("<%f, d_curr[n] = %d min_len = %d>",
|
104
|
+
1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m))),
|
105
|
+
d_curr[n], fuzq->pre_len + min2(n, m));
|
106
|
+
*/
|
107
|
+
|
108
|
+
/* this will return less than 0.0 when the edit distance is greater
|
109
|
+
* than the number of characters in the shorter word. but this was
|
110
|
+
* the formula that was previously used in FuzzyTermEnum, so it has
|
111
|
+
* not been changed (even though min_sim must be greater than 0.0) */
|
112
|
+
return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
|
99
113
|
}
|
100
|
-
}
|
101
|
-
//printf("<%f, d[n + m*m] = %d min_len = %d>", 1.0f - ((float)d[n + m*m] / (float) (fuzq->pre_len + min(n, m))), d[n + m*m], fuzq->pre_len + min(n, m));
|
102
|
-
|
103
|
-
/* this will return less than 0.0 when the edit distance is greater than the
|
104
|
-
* number of characters in the shorter word. but this was the formula that
|
105
|
-
* was previously used in FuzzyTermEnum, so it has not been changed (even
|
106
|
-
* though min_sim must be greater than 0.0) */
|
107
|
-
return 1.0f - ((float)d[n + n*m] / (float) (fuzq->pre_len + min2(n, m)));
|
108
114
|
}
|
109
115
|
|
110
116
|
/****************************************************************************
|
@@ -113,192 +119,150 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
113
119
|
*
|
114
120
|
****************************************************************************/
|
115
121
|
|
116
|
-
|
117
|
-
{
|
118
|
-
char *buffer, *bptr;
|
119
|
-
FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
|
120
|
-
Term *term = fuzq->term;
|
121
|
-
int tlen = (int)strlen(term->text);
|
122
|
-
int flen = (int)strlen(term->field);
|
123
|
-
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
124
|
-
|
125
|
-
if (strcmp(term->field, field) != 0) {
|
126
|
-
sprintf(bptr, "%s:", term->field);
|
127
|
-
bptr += strlen(term->field) + 1;
|
128
|
-
}
|
129
|
-
sprintf(bptr, "%s~", term->text);
|
130
|
-
bptr += strlen(bptr);
|
131
|
-
if (fuzq->min_sim != 0.5) {
|
132
|
-
dbl_to_s(bptr, fuzq->min_sim);
|
133
|
-
bptr += strlen(bptr);
|
134
|
-
}
|
135
|
-
if (self->boost != 1.0) {
|
136
|
-
*bptr = '^';
|
137
|
-
dbl_to_s(++bptr, self->boost);
|
138
|
-
}
|
139
|
-
return buffer;
|
140
|
-
}
|
122
|
+
#define FzQ(query) ((FuzzyQuery *)(query))
|
141
123
|
|
142
|
-
|
143
|
-
Term *term;
|
144
|
-
float score;
|
145
|
-
} ScoredTerm;
|
146
|
-
|
147
|
-
bool scored_term_less_than(void *p1, void *p2)
|
124
|
+
static char *fuzq_to_s(Query *self, const char *curr_field)
|
148
125
|
{
|
149
|
-
|
150
|
-
|
126
|
+
char *buffer, *bptr;
|
127
|
+
char *term = FzQ(self)->term;
|
128
|
+
char *field = FzQ(self)->field;
|
129
|
+
int tlen = (int)strlen(term);
|
130
|
+
int flen = (int)strlen(field);
|
131
|
+
bptr = buffer = ALLOC_N(char, tlen + flen + 70);
|
132
|
+
|
133
|
+
if (strcmp(curr_field, field) != 0) {
|
134
|
+
sprintf(bptr, "%s:", field);
|
135
|
+
bptr += flen + 1;
|
136
|
+
}
|
151
137
|
|
152
|
-
|
153
|
-
|
138
|
+
sprintf(bptr, "%s~", term);
|
139
|
+
bptr += tlen + 1;
|
140
|
+
if (FzQ(self)->min_sim != 0.5) {
|
141
|
+
dbl_to_s(bptr, FzQ(self)->min_sim);
|
142
|
+
bptr += strlen(bptr);
|
143
|
+
}
|
154
144
|
|
155
|
-
|
156
|
-
|
145
|
+
if (self->boost != 1.0) {
|
146
|
+
*bptr = '^';
|
147
|
+
dbl_to_s(++bptr, self->boost);
|
148
|
+
}
|
157
149
|
|
158
|
-
|
159
|
-
{
|
160
|
-
term_destroy(self->term);
|
161
|
-
free(self);
|
150
|
+
return buffer;
|
162
151
|
}
|
163
152
|
|
164
|
-
|
153
|
+
static Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
165
154
|
{
|
166
|
-
|
167
|
-
|
168
|
-
self->score = score;
|
169
|
-
return self;
|
170
|
-
}
|
155
|
+
Query *q;
|
156
|
+
FuzzyQuery *fuzq = FzQ(self);
|
171
157
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
Term *term = fuzq->term;
|
179
|
-
char *text = term->text;
|
180
|
-
char *field = term->field;
|
181
|
-
Term prefix_term;
|
182
|
-
prefix_term.field = field;
|
183
|
-
if (fuzq->pre_len >= (int)strlen(text)) {
|
184
|
-
q = tq_create(term_clone(term));
|
185
|
-
} else {
|
186
|
-
PriorityQueue *term_pq;
|
187
|
-
TermEnum *te;
|
188
|
-
Term prefix_term;
|
189
|
-
char *prefix = NULL;
|
190
|
-
int pre_len = fuzq->pre_len;
|
191
|
-
ScoredTerm *scored_term;
|
192
|
-
|
193
|
-
q = bq_create(true);
|
194
|
-
|
195
|
-
term_pq = pq_create(((BooleanQuery *)q->data)->max_clause_cnt,
|
196
|
-
&scored_term_less_than);
|
197
|
-
term_pq->free_elem = (free_ft)&scored_term_destroy;
|
198
|
-
|
199
|
-
prefix_term.field = field;
|
200
|
-
prefix_term.text = (char *)EMPTY_STRING;
|
201
|
-
if (pre_len >= 0) {
|
202
|
-
prefix = ALLOC_N(char, pre_len + 1);
|
203
|
-
strncpy(prefix, text, pre_len);
|
204
|
-
prefix_term.text = prefix;
|
205
|
-
prefix_term.text[pre_len] = '\0';
|
158
|
+
const char *term = fuzq->term;
|
159
|
+
const char *field = fuzq->field;
|
160
|
+
const int field_num = fis_get_field_num(ir->fis, field);
|
161
|
+
|
162
|
+
if (field_num < 0) {
|
163
|
+
q = bq_new(true);
|
206
164
|
}
|
207
|
-
|
208
|
-
|
209
|
-
fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
|
210
|
-
fuzq->text = fuzq->term->text + pre_len;
|
211
|
-
fuzq->text_len = (int)strlen(fuzq->text);
|
212
|
-
fuzq_initialize_max_distances(fuzq);
|
213
|
-
|
214
|
-
if (te) {
|
215
|
-
TermBuffer *tb = te->tb_curr;
|
216
|
-
float score = 0.0, min_score = fuzq->min_sim;
|
217
|
-
|
218
|
-
TRY
|
219
|
-
do {
|
220
|
-
if (strcmp(tb->field, field) != 0 ||
|
221
|
-
(prefix && strncmp(tb->text, prefix, pre_len) != 0))
|
222
|
-
break;
|
223
|
-
|
224
|
-
score = fuzq_score(fuzq, tb->text + pre_len);
|
225
|
-
//printf("%s:%s:%f\n", tb->text, fuzq->text, score);
|
226
|
-
|
227
|
-
if (score > min_score) {
|
228
|
-
pq_insert(term_pq, scored_term_create(tb_get_term(tb), score));
|
229
|
-
if (pq_full(term_pq))
|
230
|
-
min_score = ((ScoredTerm *)pq_top(term_pq))->score;
|
231
|
-
}
|
232
|
-
} while ((tb = te->next(te)) != NULL);
|
233
|
-
XFINALLY
|
234
|
-
te->close(te);
|
235
|
-
XENDTRY
|
165
|
+
else if (fuzq->pre_len >= (int)strlen(term)) {
|
166
|
+
q = tq_new(field, term);
|
236
167
|
}
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
168
|
+
else {
|
169
|
+
TermEnum *te;
|
170
|
+
char *prefix = NULL;
|
171
|
+
int pre_len = fuzq->pre_len;
|
172
|
+
|
173
|
+
q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
|
174
|
+
|
175
|
+
if (pre_len > 0) {
|
176
|
+
prefix = ALLOC_N(char, pre_len + 1);
|
177
|
+
strncpy(prefix, term, pre_len);
|
178
|
+
prefix[pre_len] = '\0';
|
179
|
+
te = ir->terms_from(ir, field_num, prefix);
|
180
|
+
}
|
181
|
+
else {
|
182
|
+
te = ir->terms(ir, field_num);
|
183
|
+
}
|
184
|
+
|
185
|
+
fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
|
186
|
+
fuzq->text = term + pre_len;
|
187
|
+
fuzq->text_len = (int)strlen(fuzq->text);
|
188
|
+
fuzq->da = REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
|
189
|
+
fuzq_initialize_max_distances(fuzq);
|
190
|
+
|
191
|
+
if (te) {
|
192
|
+
const char *curr_term = te->curr_term;
|
193
|
+
const char *curr_suffix = curr_term + pre_len;
|
194
|
+
float score = 0.0;
|
195
|
+
|
196
|
+
|
197
|
+
do {
|
198
|
+
if ((prefix && strncmp(curr_term, prefix, pre_len) != 0)) {
|
199
|
+
break;
|
200
|
+
}
|
201
|
+
|
202
|
+
score = fuzq_score(fuzq, curr_suffix);
|
203
|
+
/*
|
204
|
+
printf("%s:%s:%f < %f\n", curr_term, term, score, min_score);
|
205
|
+
*/
|
206
|
+
multi_tq_add_term_boost(q, curr_term, score);
|
207
|
+
|
208
|
+
} while (te->next(te) != NULL);
|
209
|
+
|
210
|
+
te->close(te);
|
211
|
+
}
|
212
|
+
free(prefix);
|
244
213
|
}
|
245
|
-
pq_destroy(term_pq);
|
246
|
-
}
|
247
214
|
|
248
|
-
|
215
|
+
return q;
|
249
216
|
}
|
250
217
|
|
251
|
-
void fuzq_destroy(Query *self)
|
218
|
+
static void fuzq_destroy(Query *self)
|
252
219
|
{
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
q_destroy_i(self);
|
220
|
+
free(FzQ(self)->term);
|
221
|
+
free(FzQ(self)->field);
|
222
|
+
free(FzQ(self)->da);
|
223
|
+
q_destroy_i(self);
|
258
224
|
}
|
259
225
|
|
260
|
-
static
|
226
|
+
static ulong fuzq_hash(Query *self)
|
261
227
|
{
|
262
|
-
|
263
|
-
|
228
|
+
return str_hash(FzQ(self)->term) ^ str_hash(FzQ(self)->field)
|
229
|
+
^ float2int(FzQ(self)->min_sim) ^ FzQ(self)->pre_len;
|
264
230
|
}
|
265
231
|
|
266
232
|
static int fuzq_eq(Query *self, Query *o)
|
267
233
|
{
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
(fq1->
|
272
|
-
|
234
|
+
FuzzyQuery *fq1 = FzQ(self);
|
235
|
+
FuzzyQuery *fq2 = FzQ(o);
|
236
|
+
|
237
|
+
return (strcmp(fq1->term, fq2->term) == 0)
|
238
|
+
&& (strcmp(fq1->field, fq2->field) == 0)
|
239
|
+
&& (fq1->pre_len == fq2->pre_len)
|
240
|
+
&& (fq1->min_sim == fq2->min_sim);
|
273
241
|
}
|
274
242
|
|
275
|
-
Query *
|
243
|
+
Query *fuzq_new_conf(const char *field, const char *term,
|
244
|
+
float min_sim, int pre_len, int max_terms)
|
276
245
|
{
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
return self;
|
246
|
+
Query *self = q_new(FuzzyQuery);
|
247
|
+
|
248
|
+
FzQ(self)->field = estrdup(field);
|
249
|
+
FzQ(self)->term = estrdup(term);
|
250
|
+
FzQ(self)->pre_len = pre_len ? pre_len : DEF_PRE_LEN;
|
251
|
+
FzQ(self)->min_sim = min_sim ? min_sim : DEF_MIN_SIM;
|
252
|
+
MTQMaxTerms(self) = max_terms ? max_terms : DEF_MAX_TERMS;
|
253
|
+
|
254
|
+
self->type = FUZZY_QUERY;
|
255
|
+
self->to_s = &fuzq_to_s;
|
256
|
+
self->hash = &fuzq_hash;
|
257
|
+
self->eq = &fuzq_eq;
|
258
|
+
self->rewrite = &fuzq_rewrite;
|
259
|
+
self->destroy_i = &fuzq_destroy;
|
260
|
+
self->create_weight_i = &q_create_weight_unsup;
|
261
|
+
|
262
|
+
return self;
|
295
263
|
}
|
296
264
|
|
297
|
-
Query *
|
265
|
+
Query *fuzq_new(const char *field, const char *term)
|
298
266
|
{
|
299
|
-
|
300
|
-
FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
|
301
|
-
if (pre_len) fuzq->pre_len = pre_len;
|
302
|
-
if (min_sim) fuzq->min_sim = min_sim;
|
303
|
-
return self;
|
267
|
+
return fuzq_new_conf(term, field, 0.0f, 0, 0);
|
304
268
|
}
|