ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/analysis.h
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
#ifndef FRT_ANALYSIS_H
|
2
2
|
#define FRT_ANALYSIS_H
|
3
3
|
|
4
|
-
#include
|
5
|
-
#include
|
4
|
+
#include "global.h"
|
5
|
+
#include "hash.h"
|
6
|
+
#include <wchar.h>
|
6
7
|
|
7
8
|
/****************************************************************************
|
8
9
|
*
|
@@ -10,19 +11,23 @@
|
|
10
11
|
*
|
11
12
|
****************************************************************************/
|
12
13
|
|
13
|
-
typedef struct Token
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
typedef struct Token
|
15
|
+
{
|
16
|
+
char text[MAX_WORD_SIZE];
|
17
|
+
int len;
|
18
|
+
int start;
|
19
|
+
int end;
|
20
|
+
int pos_inc;
|
18
21
|
} Token;
|
19
22
|
|
20
|
-
Token *
|
21
|
-
void tk_destroy(void *p);
|
22
|
-
Token *tk_set(Token *tk, char *text, int tlen, int start, int end,
|
23
|
-
|
24
|
-
|
25
|
-
int
|
23
|
+
extern Token *tk_new();
|
24
|
+
extern void tk_destroy(void *p);
|
25
|
+
extern Token *tk_set(Token *tk, char *text, int tlen, int start, int end,
|
26
|
+
int pos_inc);
|
27
|
+
extern Token *tk_set_no_len(Token *tk, char *text, int start, int end,
|
28
|
+
int pos_inc);
|
29
|
+
extern int tk_eq(Token *tk1, Token *tk2);
|
30
|
+
extern int tk_cmp(Token *tk1, Token *tk2);
|
26
31
|
|
27
32
|
/****************************************************************************
|
28
33
|
*
|
@@ -32,34 +37,82 @@ int tk_cmp(Token *tk1, Token *tk2);
|
|
32
37
|
|
33
38
|
|
34
39
|
typedef struct TokenStream TokenStream;
|
35
|
-
struct TokenStream
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
TokenStream *sub_ts; /* used by filters */
|
45
|
-
int ref_cnt;
|
40
|
+
struct TokenStream
|
41
|
+
{
|
42
|
+
char *t; /* ptr used to scan text */
|
43
|
+
char *text;
|
44
|
+
Token *(*next)(TokenStream *ts);
|
45
|
+
TokenStream *(*reset)(TokenStream *ts, char *text);
|
46
|
+
TokenStream *(*clone_i)(TokenStream *ts);
|
47
|
+
void (*destroy_i)(TokenStream *ts);
|
48
|
+
int ref_cnt;
|
46
49
|
};
|
47
50
|
|
51
|
+
#define ts_new(type) ts_new_i(sizeof(type))
|
52
|
+
extern TokenStream *ts_new_i(size_t size);
|
53
|
+
extern TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size);
|
54
|
+
|
55
|
+
typedef struct CachedTokenStream
|
56
|
+
{
|
57
|
+
TokenStream super;
|
58
|
+
Token token;
|
59
|
+
} CachedTokenStream;
|
60
|
+
|
61
|
+
typedef struct MultiByteTokenStream
|
62
|
+
{
|
63
|
+
CachedTokenStream super;
|
64
|
+
mbstate_t state;
|
65
|
+
} MultiByteTokenStream;
|
66
|
+
|
67
|
+
typedef struct StandardTokenizer
|
68
|
+
{
|
69
|
+
CachedTokenStream super;
|
70
|
+
bool (*advance_to_start)(TokenStream *ts);
|
71
|
+
bool (*is_tok_char)(char *c);
|
72
|
+
int (*get_alpha)(TokenStream *ts, char *token);
|
73
|
+
int (*get_apostrophe)(char *input);
|
74
|
+
} StandardTokenizer;
|
75
|
+
|
76
|
+
typedef struct TokenFilter
|
77
|
+
{
|
78
|
+
TokenStream super;
|
79
|
+
TokenStream *sub_ts;
|
80
|
+
} TokenFilter;
|
81
|
+
|
82
|
+
extern TokenStream *filter_clone_size(TokenStream *ts, size_t size);
|
83
|
+
#define tf_new(type, sub) tf_new_i(sizeof(type), sub)
|
84
|
+
extern TokenStream *tf_new_i(size_t size, TokenStream *sub_ts);
|
85
|
+
|
86
|
+
typedef struct StopFilter
|
87
|
+
{
|
88
|
+
TokenFilter super;
|
89
|
+
HashTable *words;
|
90
|
+
} StopFilter;
|
91
|
+
|
92
|
+
typedef struct StemFilter
|
93
|
+
{
|
94
|
+
TokenFilter super;
|
95
|
+
struct sb_stemmer *stemmer;
|
96
|
+
char *algorithm;
|
97
|
+
char *charenc;
|
98
|
+
} StemFilter;
|
99
|
+
|
48
100
|
#define ts_next(mts) mts->next(mts)
|
101
|
+
#define ts_clone(mts) mts->clone_i(mts)
|
49
102
|
|
50
|
-
void ts_deref(
|
103
|
+
extern void ts_deref(TokenStream *ts);
|
51
104
|
|
52
|
-
TokenStream *
|
53
|
-
TokenStream *
|
105
|
+
extern TokenStream *whitespace_tokenizer_new();
|
106
|
+
extern TokenStream *mb_whitespace_tokenizer_new(bool lowercase);
|
54
107
|
|
55
|
-
TokenStream *
|
56
|
-
TokenStream *
|
108
|
+
extern TokenStream *letter_tokenizer_new();
|
109
|
+
extern TokenStream *mb_letter_tokenizer_new(bool lowercase);
|
57
110
|
|
58
|
-
TokenStream *
|
59
|
-
TokenStream *
|
111
|
+
extern TokenStream *standard_tokenizer_new();
|
112
|
+
extern TokenStream *mb_standard_tokenizer_new();
|
60
113
|
|
61
|
-
TokenStream *
|
62
|
-
TokenStream *
|
114
|
+
extern TokenStream *lowercase_filter_new(TokenStream *ts);
|
115
|
+
extern TokenStream *mb_lowercase_filter_new(TokenStream *ts);
|
63
116
|
|
64
117
|
extern const char *ENGLISH_STOP_WORDS[];
|
65
118
|
extern const char *FULL_ENGLISH_STOP_WORDS[];
|
@@ -76,13 +129,13 @@ extern const char *FULL_DANISH_STOP_WORDS[];
|
|
76
129
|
extern const char *FULL_RUSSIAN_STOP_WORDS[];
|
77
130
|
extern const char *FULL_FINNISH_STOP_WORDS[];
|
78
131
|
|
79
|
-
TokenStream *
|
80
|
-
|
81
|
-
TokenStream *
|
82
|
-
|
83
|
-
TokenStream *
|
84
|
-
|
85
|
-
|
132
|
+
extern TokenStream *stop_filter_new_with_words_len(TokenStream *ts,
|
133
|
+
const char **words, int len);
|
134
|
+
extern TokenStream *stop_filter_new_with_words(TokenStream *ts,
|
135
|
+
const char **words);
|
136
|
+
extern TokenStream *stop_filter_new(TokenStream *ts);
|
137
|
+
extern TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
|
138
|
+
const char *charenc);
|
86
139
|
|
87
140
|
/****************************************************************************
|
88
141
|
*
|
@@ -90,47 +143,51 @@ TokenStream *ts_clone(TokenStream *orig_ts);
|
|
90
143
|
*
|
91
144
|
****************************************************************************/
|
92
145
|
|
93
|
-
typedef struct Analyzer
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
146
|
+
typedef struct Analyzer
|
147
|
+
{
|
148
|
+
TokenStream *current_ts;
|
149
|
+
TokenStream *(*get_ts)(struct Analyzer *a, char *field, char *text);
|
150
|
+
void (*destroy_i)(struct Analyzer *a);
|
151
|
+
int ref_cnt;
|
99
152
|
} Analyzer;
|
100
153
|
|
101
|
-
void a_deref(
|
154
|
+
extern void a_deref(Analyzer *a);
|
102
155
|
|
103
156
|
#define a_get_ts(ma, field, text) ma->get_ts(ma, field, text)
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
Analyzer *
|
111
|
-
Analyzer *
|
112
|
-
|
113
|
-
|
114
|
-
Analyzer *
|
115
|
-
|
116
|
-
|
117
|
-
Analyzer *
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
157
|
+
|
158
|
+
extern Analyzer *analyzer_new(TokenStream *ts,
|
159
|
+
void (*destroy)(Analyzer *a),
|
160
|
+
TokenStream *(*get_ts)(Analyzer *a,
|
161
|
+
char *field,
|
162
|
+
char *text));
|
163
|
+
extern void a_standard_destroy(Analyzer *a);
|
164
|
+
extern Analyzer *whitespace_analyzer_new(bool lowercase);
|
165
|
+
extern Analyzer *mb_whitespace_analyzer_new(bool lowercase);
|
166
|
+
|
167
|
+
extern Analyzer *letter_analyzer_new(bool lowercase);
|
168
|
+
extern Analyzer *mb_letter_analyzer_new(bool lowercase);
|
169
|
+
|
170
|
+
extern Analyzer *standard_analyzer_new(bool lowercase);
|
171
|
+
extern Analyzer *mb_standard_analyzer_new(bool lowercase);
|
172
|
+
|
173
|
+
extern Analyzer *standard_analyzer_new_with_words(const char **words,
|
174
|
+
bool lowercase);
|
175
|
+
extern Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
|
176
|
+
bool lowercase);
|
177
|
+
extern Analyzer *mb_standard_analyzer_new_with_words(const char **words,
|
178
|
+
bool lowercase);
|
179
|
+
extern Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
|
180
|
+
int len, bool lowercase);
|
181
|
+
|
182
|
+
#define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
|
183
|
+
typedef struct PerFieldAnalyzer
|
184
|
+
{
|
185
|
+
Analyzer super;
|
186
|
+
HashTable *dict;
|
187
|
+
Analyzer *default_a;
|
131
188
|
} PerFieldAnalyzer;
|
132
189
|
|
133
|
-
Analyzer *
|
134
|
-
void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer);
|
190
|
+
extern Analyzer *per_field_analyzer_new(Analyzer *a);
|
191
|
+
extern void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer);
|
135
192
|
|
136
193
|
#endif
|
data/ext/array.c
CHANGED
@@ -1,85 +1,123 @@
|
|
1
|
-
#include
|
2
|
-
#include
|
1
|
+
#include "array.h"
|
2
|
+
#include "global.h"
|
3
3
|
#include <string.h>
|
4
4
|
|
5
|
-
|
5
|
+
#define DATA_SZ sizeof(int) * 3
|
6
|
+
|
7
|
+
void **ary_new_i(int type_size, int init_capa)
|
8
|
+
{
|
9
|
+
int *ary;
|
10
|
+
if (init_capa <= 0) {
|
11
|
+
init_capa = ARY_INIT_CAPA;
|
12
|
+
}
|
13
|
+
ary = ((int *)ecalloc(DATA_SZ + init_capa * type_size));
|
14
|
+
ary[0] = type_size;
|
15
|
+
ary[1] = init_capa;
|
16
|
+
return (void **)&ary[3];
|
17
|
+
}
|
18
|
+
|
19
|
+
inline void ary_resize_i(void ***ary, int size)
|
6
20
|
{
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
ary->allocated = allocate;
|
16
|
-
ary->free_elem = free_elem;
|
21
|
+
size++;
|
22
|
+
if (size >= ary_sz(*ary)) {
|
23
|
+
int capa = ary_capa(*ary);
|
24
|
+
if (size >= capa) {
|
25
|
+
int *ary_start = &((int *)*ary)[-3];
|
26
|
+
while (size >= capa) {
|
27
|
+
capa <<= 1;
|
28
|
+
}
|
17
29
|
|
18
|
-
|
30
|
+
ary_start = (int *)erealloc(ary_start,
|
31
|
+
DATA_SZ + capa * ary_type_size(*ary));
|
32
|
+
*ary = (void **)&(ary_start[3]);
|
33
|
+
memset(((char *)*ary) + ary_type_size(*ary) * ary_sz(*ary), 0,
|
34
|
+
(capa - ary_sz(*ary)) * ary_type_size(*ary));
|
35
|
+
ary_capa(*ary) = capa;
|
36
|
+
}
|
37
|
+
ary_sz(*ary) = size;
|
38
|
+
}
|
19
39
|
}
|
20
40
|
|
21
|
-
void
|
41
|
+
void ary_set_i(void ***ary, int index, void *value)
|
22
42
|
{
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
43
|
+
if (index < 0) {
|
44
|
+
index += ary_sz(*ary);
|
45
|
+
if (index < 0) {
|
46
|
+
RAISE(INDEX_ERROR, "index %d out array", index);
|
47
|
+
}
|
28
48
|
}
|
29
|
-
|
30
|
-
|
31
|
-
free(ary);
|
49
|
+
ary_resize_i(ary, index);
|
50
|
+
(*ary)[index] = value;
|
32
51
|
}
|
33
52
|
|
34
|
-
void
|
53
|
+
void *ary_get_i(void **ary, int index)
|
35
54
|
{
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
55
|
+
if (index < 0) {
|
56
|
+
index += ary_sz(ary);
|
57
|
+
}
|
58
|
+
if (index >= 0 && index < ary_sz(ary)) {
|
59
|
+
return ary[index];
|
60
|
+
}
|
61
|
+
else {
|
62
|
+
return NULL;
|
63
|
+
}
|
64
|
+
}
|
41
65
|
|
42
|
-
|
43
|
-
|
66
|
+
void ary_push_i(void ***ary, void *value)
|
67
|
+
{
|
68
|
+
int size = ary_sz(*ary);
|
69
|
+
ary_resize_i(ary, size);
|
70
|
+
(*ary)[size] = value;
|
71
|
+
}
|
44
72
|
|
45
|
-
|
46
|
-
|
73
|
+
void *ary_pop_i(void **ary)
|
74
|
+
{
|
75
|
+
void *val = ary[--ary_sz(ary)];
|
76
|
+
ary[ary_sz(ary)] = NULL;
|
77
|
+
return val;
|
78
|
+
}
|
47
79
|
|
48
|
-
|
80
|
+
void ary_unshift_i(void ***ary, void *value)
|
81
|
+
{
|
82
|
+
int size = ary_sz(*ary);
|
83
|
+
ary_resize_i(ary, size);
|
84
|
+
memmove(*ary, *ary + 1, size * sizeof(void *));
|
85
|
+
(*ary)[0] = value;
|
49
86
|
}
|
50
87
|
|
51
|
-
void
|
88
|
+
void *ary_shift_i(void **ary)
|
52
89
|
{
|
53
|
-
|
90
|
+
void *val = ary[0];
|
91
|
+
int size = --ary_sz(ary);
|
92
|
+
memmove(ary, ary + 1, size * sizeof(void *));
|
93
|
+
ary[size] = NULL;
|
94
|
+
return val;
|
54
95
|
}
|
55
96
|
|
56
|
-
void *
|
97
|
+
void *ary_remove_i(void **ary, int index)
|
57
98
|
{
|
58
|
-
|
59
|
-
|
60
|
-
|
99
|
+
if (index >= 0 && index < ary_sz(ary)) {
|
100
|
+
void *val = ary[index];
|
101
|
+
memmove(ary + index, ary + index + 1,
|
102
|
+
(ary_sz(ary) - index + 1) * sizeof(void *));
|
103
|
+
ary_sz(ary)--;
|
104
|
+
return val;
|
105
|
+
}
|
106
|
+
else {
|
107
|
+
return NULL;
|
108
|
+
}
|
61
109
|
}
|
62
110
|
|
63
|
-
void
|
111
|
+
void ary_delete_i(void **ary, int index, void (*free_elem)(void *p))
|
64
112
|
{
|
65
|
-
|
66
|
-
return;
|
67
|
-
if (ary->free_elem && ary->elems[index])
|
68
|
-
ary->free_elem(ary->elems[index]);
|
69
|
-
ary->elems[index] = NULL;
|
70
|
-
if (index == ary->size - 1)
|
71
|
-
ary->size--;
|
113
|
+
free_elem(ary_remove(ary, index));
|
72
114
|
}
|
73
115
|
|
74
|
-
void
|
116
|
+
void ary_destroy_i(void **ary, void (*free_elem)(void *p))
|
75
117
|
{
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
ary->size--;
|
82
|
-
memmove(&ary->elems[index], &ary->elems[index + 1],
|
83
|
-
sizeof(void *) *(ary->size - index));
|
84
|
-
return p;
|
118
|
+
int i;
|
119
|
+
for (i = ary_sz(ary) - 1; i >= 0; i--) {
|
120
|
+
free_elem(ary[i]);
|
121
|
+
}
|
122
|
+
ary_free(ary);
|
85
123
|
}
|
data/ext/array.h
CHANGED
@@ -1,19 +1,46 @@
|
|
1
1
|
#ifndef FRT_ARRAY_H
|
2
2
|
#define FRT_ARRAY_H
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
4
|
+
#define ARY_INIT_CAPA 8
|
5
|
+
#define ary_size(ary) ary_sz(ary)
|
6
|
+
#define ary_sz(ary) (((int *)ary)[-1])
|
7
|
+
#define ary_capa(ary) (((int *)ary)[-2])
|
8
|
+
#define ary_type_size(ary) (((int *)ary)[-3])
|
9
|
+
#define ary_start(ary) ((void **)&(((int *)ary)[-3]))
|
10
|
+
#define ary_free(ary) free(ary_start(ary))
|
10
11
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
void
|
17
|
-
void
|
12
|
+
#define ary_new_type_capa(type, init_capa)\
|
13
|
+
(type *)ary_new_i(sizeof(type), init_capa)
|
14
|
+
#define ary_new_type(type) (type *)ary_new_i(sizeof(type), 0)
|
15
|
+
#define ary_new_capa(init_capa) ary_new_i(sizeof(void *), init_capa)
|
16
|
+
#define ary_new() ary_new_i(sizeof(void *), 0)
|
17
|
+
#define ary_resize(ary, size) ary_resize_i(((void ***)(void *)&ary), size)
|
18
|
+
#define ary_set(ary, i, val) ary_set_i(((void ***)(void *)&ary), i, val)
|
19
|
+
#define ary_get(ary, i) ary_get_i(((void **)ary), i)
|
20
|
+
#define ary_push(ary, val) ary_push_i(((void ***)(void *)&ary), val)
|
21
|
+
#define ary_pop(ary) ary_pop_i(((void **)ary))
|
22
|
+
#define ary_unshift(ary, val) ary_unshift_i(((void ***)(void *)&ary), val)
|
23
|
+
#define ary_shift(ary) ary_shift_i(((void **)ary))
|
24
|
+
#define ary_remove(ary, i) ary_remove_i(((void **)ary), i)
|
25
|
+
#define ary_delete(ary, i, f) ary_delete_i(((void **)ary), i, (free_ft)f)
|
26
|
+
#define ary_destroy(ary, f) ary_destroy_i(((void **)ary), (free_ft)f)
|
27
|
+
#define ary_rsz(ary, size) ary_resize(ary, size)
|
28
|
+
#define ary_grow(ary) ary_resize(ary, ary_sz(ary))
|
29
|
+
#define ary_last(ary) ary[ary_sz(ary) - 1]
|
30
|
+
#define ary_sort(ary, cmp) qsort(ary, ary_size(ary), ary_type_size(ary), cmp)
|
31
|
+
#define ary_each_rev(ary, i) for (i = ary_size(ary) - 1; i >= 0; i--)
|
32
|
+
#define ary_each(ary, i) for (i = 0; i < ary_size(ary); i++)
|
33
|
+
|
34
|
+
extern void ary_resize_i(void ***ary, int size);
|
35
|
+
extern void **ary_new_i(int type_size, int init_capa);
|
36
|
+
extern void ary_set_i(void ***ary, int index, void *value);
|
37
|
+
extern void *ary_get_i(void **ary, int index);
|
38
|
+
extern void ary_push_i(void ***ary, void *value);
|
39
|
+
extern void *ary_pop_i(void **ary);
|
40
|
+
extern void ary_unshift_i(void ***ary, void *value);
|
41
|
+
extern void *ary_shift_i(void **ary);
|
42
|
+
extern void *ary_remove_i(void **ary, int index);
|
43
|
+
extern void ary_delete_i(void **ary, int index, void (*free_elem)(void *p));
|
44
|
+
extern void ary_destroy_i(void **ary, void (*free_elem)(void *p));
|
18
45
|
|
19
46
|
#endif
|