ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/analysis.h CHANGED
@@ -1,8 +1,9 @@
1
1
  #ifndef FRT_ANALYSIS_H
2
2
  #define FRT_ANALYSIS_H
3
3
 
4
- #include <global.h>
5
- #include <hash.h>
4
+ #include "global.h"
5
+ #include "hash.h"
6
+ #include <wchar.h>
6
7
 
7
8
  /****************************************************************************
8
9
  *
@@ -10,19 +11,23 @@
10
11
  *
11
12
  ****************************************************************************/
12
13
 
13
- typedef struct Token {
14
- char text[MAX_WORD_SIZE];
15
- int start;
16
- int end;
17
- int pos_inc;
14
+ typedef struct Token
15
+ {
16
+ char text[MAX_WORD_SIZE];
17
+ int len;
18
+ int start;
19
+ int end;
20
+ int pos_inc;
18
21
  } Token;
19
22
 
20
- Token *tk_create();
21
- void tk_destroy(void *p);
22
- Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc);
23
- Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc);
24
- int tk_eq(Token *tk1, Token *tk2);
25
- int tk_cmp(Token *tk1, Token *tk2);
23
+ extern Token *tk_new();
24
+ extern void tk_destroy(void *p);
25
+ extern Token *tk_set(Token *tk, char *text, int tlen, int start, int end,
26
+ int pos_inc);
27
+ extern Token *tk_set_no_len(Token *tk, char *text, int start, int end,
28
+ int pos_inc);
29
+ extern int tk_eq(Token *tk1, Token *tk2);
30
+ extern int tk_cmp(Token *tk1, Token *tk2);
26
31
 
27
32
  /****************************************************************************
28
33
  *
@@ -32,34 +37,82 @@ int tk_cmp(Token *tk1, Token *tk2);
32
37
 
33
38
 
34
39
  typedef struct TokenStream TokenStream;
35
- struct TokenStream {
36
- void *data;
37
- char *text;
38
- char *t; /* ptr used to scan text */
39
- Token *token;
40
- Token *(*next)(TokenStream *ts);
41
- void (*reset)(TokenStream *ts, char *text);
42
- void (*clone_i)(TokenStream *orig_ts, TokenStream *new_ts);
43
- void (*destroy)(TokenStream *ts);
44
- TokenStream *sub_ts; /* used by filters */
45
- int ref_cnt;
40
+ struct TokenStream
41
+ {
42
+ char *t; /* ptr used to scan text */
43
+ char *text;
44
+ Token *(*next)(TokenStream *ts);
45
+ TokenStream *(*reset)(TokenStream *ts, char *text);
46
+ TokenStream *(*clone_i)(TokenStream *ts);
47
+ void (*destroy_i)(TokenStream *ts);
48
+ int ref_cnt;
46
49
  };
47
50
 
51
+ #define ts_new(type) ts_new_i(sizeof(type))
52
+ extern TokenStream *ts_new_i(size_t size);
53
+ extern TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size);
54
+
55
+ typedef struct CachedTokenStream
56
+ {
57
+ TokenStream super;
58
+ Token token;
59
+ } CachedTokenStream;
60
+
61
+ typedef struct MultiByteTokenStream
62
+ {
63
+ CachedTokenStream super;
64
+ mbstate_t state;
65
+ } MultiByteTokenStream;
66
+
67
+ typedef struct StandardTokenizer
68
+ {
69
+ CachedTokenStream super;
70
+ bool (*advance_to_start)(TokenStream *ts);
71
+ bool (*is_tok_char)(char *c);
72
+ int (*get_alpha)(TokenStream *ts, char *token);
73
+ int (*get_apostrophe)(char *input);
74
+ } StandardTokenizer;
75
+
76
+ typedef struct TokenFilter
77
+ {
78
+ TokenStream super;
79
+ TokenStream *sub_ts;
80
+ } TokenFilter;
81
+
82
+ extern TokenStream *filter_clone_size(TokenStream *ts, size_t size);
83
+ #define tf_new(type, sub) tf_new_i(sizeof(type), sub)
84
+ extern TokenStream *tf_new_i(size_t size, TokenStream *sub_ts);
85
+
86
+ typedef struct StopFilter
87
+ {
88
+ TokenFilter super;
89
+ HashTable *words;
90
+ } StopFilter;
91
+
92
+ typedef struct StemFilter
93
+ {
94
+ TokenFilter super;
95
+ struct sb_stemmer *stemmer;
96
+ char *algorithm;
97
+ char *charenc;
98
+ } StemFilter;
99
+
48
100
  #define ts_next(mts) mts->next(mts)
101
+ #define ts_clone(mts) mts->clone_i(mts)
49
102
 
50
- void ts_deref(void *p);
103
+ extern void ts_deref(TokenStream *ts);
51
104
 
52
- TokenStream *whitespace_tokenizer_create();
53
- TokenStream *mb_whitespace_tokenizer_create(bool lowercase);
105
+ extern TokenStream *whitespace_tokenizer_new();
106
+ extern TokenStream *mb_whitespace_tokenizer_new(bool lowercase);
54
107
 
55
- TokenStream *letter_tokenizer_create();
56
- TokenStream *mb_letter_tokenizer_create(bool lowercase);
108
+ extern TokenStream *letter_tokenizer_new();
109
+ extern TokenStream *mb_letter_tokenizer_new(bool lowercase);
57
110
 
58
- TokenStream *standard_tokenizer_create();
59
- TokenStream *mb_standard_tokenizer_create();
111
+ extern TokenStream *standard_tokenizer_new();
112
+ extern TokenStream *mb_standard_tokenizer_new();
60
113
 
61
- TokenStream *lowercase_filter_create(TokenStream *ts);
62
- TokenStream *mb_lowercase_filter_create(TokenStream *ts);
114
+ extern TokenStream *lowercase_filter_new(TokenStream *ts);
115
+ extern TokenStream *mb_lowercase_filter_new(TokenStream *ts);
63
116
 
64
117
  extern const char *ENGLISH_STOP_WORDS[];
65
118
  extern const char *FULL_ENGLISH_STOP_WORDS[];
@@ -76,13 +129,13 @@ extern const char *FULL_DANISH_STOP_WORDS[];
76
129
  extern const char *FULL_RUSSIAN_STOP_WORDS[];
77
130
  extern const char *FULL_FINNISH_STOP_WORDS[];
78
131
 
79
- TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
80
- const char **words, int len);
81
- TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words);
82
- TokenStream *stop_filter_create(TokenStream *ts);
83
- TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
84
- const char * charenc);
85
- TokenStream *ts_clone(TokenStream *orig_ts);
132
+ extern TokenStream *stop_filter_new_with_words_len(TokenStream *ts,
133
+ const char **words, int len);
134
+ extern TokenStream *stop_filter_new_with_words(TokenStream *ts,
135
+ const char **words);
136
+ extern TokenStream *stop_filter_new(TokenStream *ts);
137
+ extern TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
138
+ const char *charenc);
86
139
 
87
140
  /****************************************************************************
88
141
  *
@@ -90,47 +143,51 @@ TokenStream *ts_clone(TokenStream *orig_ts);
90
143
  *
91
144
  ****************************************************************************/
92
145
 
93
- typedef struct Analyzer {
94
- void *data;
95
- TokenStream *current_ts;
96
- TokenStream *(*get_ts)(struct Analyzer *a, char *field, char *text);
97
- void (*destroy)(struct Analyzer *a);
98
- int ref_cnt;
146
+ typedef struct Analyzer
147
+ {
148
+ TokenStream *current_ts;
149
+ TokenStream *(*get_ts)(struct Analyzer *a, char *field, char *text);
150
+ void (*destroy_i)(struct Analyzer *a);
151
+ int ref_cnt;
99
152
  } Analyzer;
100
153
 
101
- void a_deref(void *p);
154
+ extern void a_deref(Analyzer *a);
102
155
 
103
156
  #define a_get_ts(ma, field, text) ma->get_ts(ma, field, text)
104
- #define a_get_new_ts(ma, field, text) ts_clone(ma->get_ts(ma, field, text))
105
-
106
- Analyzer *analyzer_create(void *data, TokenStream *ts,
107
- void (*destroy)(Analyzer *),
108
- TokenStream *(*get_ts)(Analyzer *a, char *field, char *text));
109
- void a_standard_destroy(Analyzer *a);
110
- Analyzer *whitespace_analyzer_create(bool lowercase);
111
- Analyzer *mb_whitespace_analyzer_create(bool lowercase);
112
-
113
- Analyzer *letter_analyzer_create(bool lowercase);
114
- Analyzer *mb_letter_analyzer_create(bool lowercase);
115
-
116
- Analyzer *standard_analyzer_create(bool lowercase);
117
- Analyzer *mb_standard_analyzer_create(bool lowercase);
118
-
119
- Analyzer *standard_analyzer_create_with_words(
120
- const char **words, bool lowercase);
121
- Analyzer *standard_analyzer_create_with_words_len(
122
- const char **words, int len, bool lowercase);
123
- Analyzer *mb_standard_analyzer_create_with_words(
124
- const char **words, bool lowercase);
125
- Analyzer *mb_standard_analyzer_create_with_words_len(
126
- const char **words, int len, bool lowercase);
127
-
128
- typedef struct PerFieldAnalyzer {
129
- HshTable *dict;
130
- Analyzer *def;
157
+
158
+ extern Analyzer *analyzer_new(TokenStream *ts,
159
+ void (*destroy)(Analyzer *a),
160
+ TokenStream *(*get_ts)(Analyzer *a,
161
+ char *field,
162
+ char *text));
163
+ extern void a_standard_destroy(Analyzer *a);
164
+ extern Analyzer *whitespace_analyzer_new(bool lowercase);
165
+ extern Analyzer *mb_whitespace_analyzer_new(bool lowercase);
166
+
167
+ extern Analyzer *letter_analyzer_new(bool lowercase);
168
+ extern Analyzer *mb_letter_analyzer_new(bool lowercase);
169
+
170
+ extern Analyzer *standard_analyzer_new(bool lowercase);
171
+ extern Analyzer *mb_standard_analyzer_new(bool lowercase);
172
+
173
+ extern Analyzer *standard_analyzer_new_with_words(const char **words,
174
+ bool lowercase);
175
+ extern Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
176
+ bool lowercase);
177
+ extern Analyzer *mb_standard_analyzer_new_with_words(const char **words,
178
+ bool lowercase);
179
+ extern Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
180
+ int len, bool lowercase);
181
+
182
+ #define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
183
+ typedef struct PerFieldAnalyzer
184
+ {
185
+ Analyzer super;
186
+ HashTable *dict;
187
+ Analyzer *default_a;
131
188
  } PerFieldAnalyzer;
132
189
 
133
- Analyzer *per_field_analyzer_create(Analyzer *def);
134
- void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer);
190
+ extern Analyzer *per_field_analyzer_new(Analyzer *a);
191
+ extern void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer);
135
192
 
136
193
  #endif
data/ext/array.c CHANGED
@@ -1,85 +1,123 @@
1
- #include <global.h>
2
- #include <array.h>
1
+ #include "array.h"
2
+ #include "global.h"
3
3
  #include <string.h>
4
4
 
5
- Array *ary_create(int allocate, void (*free_elem)(void *p))
5
+ #define DATA_SZ sizeof(int) * 3
6
+
7
+ void **ary_new_i(int type_size, int init_capa)
8
+ {
9
+ int *ary;
10
+ if (init_capa <= 0) {
11
+ init_capa = ARY_INIT_CAPA;
12
+ }
13
+ ary = ((int *)ecalloc(DATA_SZ + init_capa * type_size));
14
+ ary[0] = type_size;
15
+ ary[1] = init_capa;
16
+ return (void **)&ary[3];
17
+ }
18
+
19
+ inline void ary_resize_i(void ***ary, int size)
6
20
  {
7
- Array *ary = ALLOC(Array);
8
- if (allocate == 0) {
9
- ary->elems = NULL;
10
- } else {
11
- ary->elems = ALLOC_N(void *, allocate);
12
- memset(ary->elems, 0, sizeof(void *) * allocate);
13
- }
14
- ary->size = 0;
15
- ary->allocated = allocate;
16
- ary->free_elem = free_elem;
21
+ size++;
22
+ if (size >= ary_sz(*ary)) {
23
+ int capa = ary_capa(*ary);
24
+ if (size >= capa) {
25
+ int *ary_start = &((int *)*ary)[-3];
26
+ while (size >= capa) {
27
+ capa <<= 1;
28
+ }
17
29
 
18
- return ary;
30
+ ary_start = (int *)erealloc(ary_start,
31
+ DATA_SZ + capa * ary_type_size(*ary));
32
+ *ary = (void **)&(ary_start[3]);
33
+ memset(((char *)*ary) + ary_type_size(*ary) * ary_sz(*ary), 0,
34
+ (capa - ary_sz(*ary)) * ary_type_size(*ary));
35
+ ary_capa(*ary) = capa;
36
+ }
37
+ ary_sz(*ary) = size;
38
+ }
19
39
  }
20
40
 
21
- void ary_destroy(Array *ary)
41
+ void ary_set_i(void ***ary, int index, void *value)
22
42
  {
23
- int i;
24
- if (ary->free_elem) {
25
- for (i = 0; i < ary->size; i++) {
26
- if (ary->elems[i])
27
- ary->free_elem(ary->elems[i]);
43
+ if (index < 0) {
44
+ index += ary_sz(*ary);
45
+ if (index < 0) {
46
+ RAISE(INDEX_ERROR, "index %d out array", index);
47
+ }
28
48
  }
29
- }
30
- free(ary->elems);
31
- free(ary);
49
+ ary_resize_i(ary, index);
50
+ (*ary)[index] = value;
32
51
  }
33
52
 
34
- void ary_set(Array *ary, int index, void *value)
53
+ void *ary_get_i(void **ary, int index)
35
54
  {
36
- if (index >= ary->allocated) {
37
- ary->allocated = (index + 1)*2;
38
- REALLOC_N(ary->elems, void *, (ary->allocated));
39
- memset((&ary->elems[ary->size]), 0, sizeof(void *) * (ary->allocated - ary->size));
40
- }
55
+ if (index < 0) {
56
+ index += ary_sz(ary);
57
+ }
58
+ if (index >= 0 && index < ary_sz(ary)) {
59
+ return ary[index];
60
+ }
61
+ else {
62
+ return NULL;
63
+ }
64
+ }
41
65
 
42
- if (index >= ary->size)
43
- ary->size = index + 1;
66
+ void ary_push_i(void ***ary, void *value)
67
+ {
68
+ int size = ary_sz(*ary);
69
+ ary_resize_i(ary, size);
70
+ (*ary)[size] = value;
71
+ }
44
72
 
45
- if (ary->free_elem && ary->elems[index])
46
- ary->free_elem(ary->elems[index]);
73
+ void *ary_pop_i(void **ary)
74
+ {
75
+ void *val = ary[--ary_sz(ary)];
76
+ ary[ary_sz(ary)] = NULL;
77
+ return val;
78
+ }
47
79
 
48
- ary->elems[index] = value;
80
+ void ary_unshift_i(void ***ary, void *value)
81
+ {
82
+ int size = ary_sz(*ary);
83
+ ary_resize_i(ary, size);
84
+ memmove(*ary, *ary + 1, size * sizeof(void *));
85
+ (*ary)[0] = value;
49
86
  }
50
87
 
51
- void ary_append(Array *ary, void *value)
88
+ void *ary_shift_i(void **ary)
52
89
  {
53
- ary_set(ary, ary->size, value);
90
+ void *val = ary[0];
91
+ int size = --ary_sz(ary);
92
+ memmove(ary, ary + 1, size * sizeof(void *));
93
+ ary[size] = NULL;
94
+ return val;
54
95
  }
55
96
 
56
- void *ary_get(Array *ary, int index)
97
+ void *ary_remove_i(void **ary, int index)
57
98
  {
58
- if (index >= ary->size)
59
- return NULL;
60
- return ary->elems[index];
99
+ if (index >= 0 && index < ary_sz(ary)) {
100
+ void *val = ary[index];
101
+ memmove(ary + index, ary + index + 1,
102
+ (ary_sz(ary) - index + 1) * sizeof(void *));
103
+ ary_sz(ary)--;
104
+ return val;
105
+ }
106
+ else {
107
+ return NULL;
108
+ }
61
109
  }
62
110
 
63
- void ary_delete(Array *ary, int index)
111
+ void ary_delete_i(void **ary, int index, void (*free_elem)(void *p))
64
112
  {
65
- if (index >= ary->size)
66
- return;
67
- if (ary->free_elem && ary->elems[index])
68
- ary->free_elem(ary->elems[index]);
69
- ary->elems[index] = NULL;
70
- if (index == ary->size - 1)
71
- ary->size--;
113
+ free_elem(ary_remove(ary, index));
72
114
  }
73
115
 
74
- void *ary_remove(Array *ary, int index)
116
+ void ary_destroy_i(void **ary, void (*free_elem)(void *p))
75
117
  {
76
- void *p;
77
- if (index >= ary->size)
78
- return NULL;
79
- p = ary->elems[index];
80
- ary->elems[index] = NULL;
81
- ary->size--;
82
- memmove(&ary->elems[index], &ary->elems[index + 1],
83
- sizeof(void *) *(ary->size - index));
84
- return p;
118
+ int i;
119
+ for (i = ary_sz(ary) - 1; i >= 0; i--) {
120
+ free_elem(ary[i]);
121
+ }
122
+ ary_free(ary);
85
123
  }
data/ext/array.h CHANGED
@@ -1,19 +1,46 @@
1
1
  #ifndef FRT_ARRAY_H
2
2
  #define FRT_ARRAY_H
3
3
 
4
- typedef struct Array {
5
- void **elems;
6
- int size;
7
- int allocated;
8
- void (*free_elem)(void *p);
9
- } Array;
4
+ #define ARY_INIT_CAPA 8
5
+ #define ary_size(ary) ary_sz(ary)
6
+ #define ary_sz(ary) (((int *)ary)[-1])
7
+ #define ary_capa(ary) (((int *)ary)[-2])
8
+ #define ary_type_size(ary) (((int *)ary)[-3])
9
+ #define ary_start(ary) ((void **)&(((int *)ary)[-3]))
10
+ #define ary_free(ary) free(ary_start(ary))
10
11
 
11
- Array *ary_create(int size, void (*free_elem)(void *p));
12
- void ary_destroy(Array *ary);
13
- void ary_set(Array *ary, int index, void *value);
14
- void ary_append(Array *ary, void *value);
15
- void *ary_get(Array *ary, int index);
16
- void ary_delete(Array *ary, int index);
17
- void *ary_remove(Array *ary, int index);
12
+ #define ary_new_type_capa(type, init_capa)\
13
+ (type *)ary_new_i(sizeof(type), init_capa)
14
+ #define ary_new_type(type) (type *)ary_new_i(sizeof(type), 0)
15
+ #define ary_new_capa(init_capa) ary_new_i(sizeof(void *), init_capa)
16
+ #define ary_new() ary_new_i(sizeof(void *), 0)
17
+ #define ary_resize(ary, size) ary_resize_i(((void ***)(void *)&ary), size)
18
+ #define ary_set(ary, i, val) ary_set_i(((void ***)(void *)&ary), i, val)
19
+ #define ary_get(ary, i) ary_get_i(((void **)ary), i)
20
+ #define ary_push(ary, val) ary_push_i(((void ***)(void *)&ary), val)
21
+ #define ary_pop(ary) ary_pop_i(((void **)ary))
22
+ #define ary_unshift(ary, val) ary_unshift_i(((void ***)(void *)&ary), val)
23
+ #define ary_shift(ary) ary_shift_i(((void **)ary))
24
+ #define ary_remove(ary, i) ary_remove_i(((void **)ary), i)
25
+ #define ary_delete(ary, i, f) ary_delete_i(((void **)ary), i, (free_ft)f)
26
+ #define ary_destroy(ary, f) ary_destroy_i(((void **)ary), (free_ft)f)
27
+ #define ary_rsz(ary, size) ary_resize(ary, size)
28
+ #define ary_grow(ary) ary_resize(ary, ary_sz(ary))
29
+ #define ary_last(ary) ary[ary_sz(ary) - 1]
30
+ #define ary_sort(ary, cmp) qsort(ary, ary_size(ary), ary_type_size(ary), cmp)
31
+ #define ary_each_rev(ary, i) for (i = ary_size(ary) - 1; i >= 0; i--)
32
+ #define ary_each(ary, i) for (i = 0; i < ary_size(ary); i++)
33
+
34
+ extern void ary_resize_i(void ***ary, int size);
35
+ extern void **ary_new_i(int type_size, int init_capa);
36
+ extern void ary_set_i(void ***ary, int index, void *value);
37
+ extern void *ary_get_i(void **ary, int index);
38
+ extern void ary_push_i(void ***ary, void *value);
39
+ extern void *ary_pop_i(void **ary);
40
+ extern void ary_unshift_i(void ***ary, void *value);
41
+ extern void *ary_shift_i(void **ary);
42
+ extern void *ary_remove_i(void **ary, int index);
43
+ extern void ary_delete_i(void **ary, int index, void (*free_elem)(void *p));
44
+ extern void ary_destroy_i(void **ary, void (*free_elem)(void *p));
18
45
 
19
46
  #endif