ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/w32_io.c DELETED
@@ -1,150 +0,0 @@
- #ifdef WIN32
-
- #include "global.h"
- #include "store.h"
- #include <stdio.h>
- #include <io.h>
- #include <errno.h>
- #include <string.h>
-
- /**
-  * Create a filepath for a file in the store using the operating system's
-  * default file separator.
-  */
- char *join_path(char *buf, const char *base, const char *filename)
- {
-     sprintf(buf, "%s\\%s", base, filename);
-     return buf;
- }
-
- bool exists(char *path)
- {
-     int fd = _open(path, 0);
-     if (fd < 0) {
-         if (errno != ENOENT) {
-             RAISE(IO_ERROR, strerror(errno));
-         }
-         return false;
-     }
-     _close(fd);
-     return true;
- }
-
- int fcount(char *path)
- {
-     char buf[MAX_FILE_PATH];
-     struct _finddata_t fd;
-     intptr_t d;
-     int cnt = 0;
-
-     join_path(buf, path, "*");
-
-     if ((d = _findfirst(buf, &fd)) < 0) {
-         RAISE(IO_ERROR, strerror(errno));
-     }
-
-     do {
-         if (fd.name[0] != '.') {
-             cnt++;
-         }
-     } while (_findnext(d, &fd) == 0);
-     _findclose(d);
-
-     return cnt;
- }
-
- void dir_each(char *path, void (*func)(char *fname, void *arg), void *arg)
- {
-     char buf[MAX_FILE_PATH];
-     struct _finddata_t fd;
-     intptr_t d;
-     join_path(buf, path, "*");
-
-     if ((d = _findfirst(buf, &fd)) < 0) {
-         RAISE(IO_ERROR, strerror(errno));
-     }
-
-     while (_findnext(d, &fd) == 0) {
-         if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
-             func(fd.name, arg);
-         }
-     }
-     _findclose(d);
- }
-
- /**
-  * Clear all the locks in the store.
-  *
-  * @param store the store to clear the locks from
-  * @throws IO_ERROR if there is an error opening the directory
-  */
- void fs_clear_locks(Store *store)
- {
-     char buf[MAX_FILE_PATH];
-     struct _finddata_t fd;
-     intptr_t d;
-     join_path(buf, store->dir.path, "*");
-
-     if ((d = _findfirst(buf, &fd)) < 0) {
-         RAISE(IO_ERROR, strerror(errno));
-     }
-
-     while (_findnext(d, &fd) == 0) {
-         if (file_is_lock(fd.name)) {
-             remove(join_path(buf, store->dir.path, fd.name));
-         }
-     }
-     _findclose(d);
- }
-
- /**
-  * Clear all files from the store except the lock files.
-  *
-  * @param store the store to clear all the files from
-  * @throws IO_ERROR if there is an error deleting the files
-  */
- void fs_clear(Store *store)
- {
-     char buf[MAX_FILE_PATH];
-     struct _finddata_t fd;
-     intptr_t d;
-     join_path(buf, store->dir.path, "*");
-
-     if ((d = _findfirst(buf, &fd)) < 0) {
-         RAISE(IO_ERROR, strerror(errno));
-     }
-
-     while (_findnext(d, &fd) == 0) {
-         if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
-             remove(join_path(buf, store->dir.path, fd.name));
-         }
-     }
-     _findclose(d);
- }
-
- /**
-  * Clear all files from the store including the lock files.
-  *
-  * @param store the store to clear all the files from
-  * @throws IO_ERROR if there is an error deleting the files
-  */
- void fs_clear_all(Store *store)
- {
-     char buf[MAX_FILE_PATH];
-     struct _finddata_t fd;
-     intptr_t d;
-     join_path(buf, store->dir.path, "*");
-
-     if ((d = _findfirst(buf, &fd)) < 0) {
-         RAISE(IO_ERROR, strerror(errno));
-     }
-
-     while (_findnext(d, &fd) == 0) {
-         if (fd.name[0] != '.') {
-             remove(join_path(buf, store->dir.path, fd.name));
-         }
-     }
-     _findclose(d);
- }
-
- #endif
data/lib/ferret/analysis.rb DELETED
@@ -1,11 +0,0 @@
- # Documentation for Analysis
- module Ferret::Analysis
- end
-
- require 'ferret/analysis/token'
- require 'ferret/analysis/token_stream'
- require 'ferret/analysis/tokenizers'
- require 'ferret/analysis/standard_tokenizer'
- require 'ferret/analysis/token_filters'
- require 'ferret/analysis/word_list_loader'
- require 'ferret/analysis/analyzers'
data/lib/ferret/analysis/analyzers.rb DELETED
@@ -1,112 +0,0 @@
- module Ferret::Analysis
-   # An Analyzer builds TokenStreams, which analyze text. It thus represents
-   # a policy for extracting index terms from text.
-   #
-   # Typical implementations first build a Tokenizer, which breaks the stream
-   # of characters from the input into raw Tokens. One or more TokenFilters
-   # may then be applied to the output of the Tokenizer.
-   #
-   # The default Analyzer just creates a LowerCaseTokenizer which converts
-   # all text to lowercase tokens. See LowerCaseTokenizer for more details.
-   class Analyzer
-     # Creates a TokenStream which tokenizes all the text in the provided
-     # string. Override to allow the Analyzer to choose a strategy based on
-     # document and/or field.
-     # string:: the string representing the text in the field
-     # field:: name of the field. Not required.
-     def token_stream(field, string)
-       return LowerCaseTokenizer.new(string)
-     end
-
-     # Invoked before indexing a Field instance if
-     # terms have already been added to that field. This allows custom
-     # analyzers to place an automatic position increment gap between
-     # Field instances using the same field name. The default
-     # position increment gap is 0. With a 0 position increment gap and
-     # the typical default token position increment of 1, all terms in a field,
-     # including across Field instances, are in successive positions, allowing
-     # exact PhraseQuery matches, for instance, across Field instance boundaries.
-     #
-     # field_name:: Field name being indexed.
-     # pos_inc_gap:: added to the next token emitted from
-     #               #token_stream(field, string)
-     #
-     def pos_inc_gap(field_name)
-       return 0
-     end
-
-   end
-
-   # An Analyzer that uses WhiteSpaceTokenizer.
-   class WhiteSpaceAnalyzer < Analyzer
-     def token_stream(field, string)
-       return WhiteSpaceTokenizer.new(string)
-     end
-   end
-
-   # Filters LowerCaseTokenizer with StopFilter.
-   class StopAnalyzer < Analyzer
-
-     # An array containing some common English words that are not usually useful
-     # for searching.
-     ENGLISH_STOP_WORDS = [
-       "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
-       "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such",
-       "t", "that", "the", "their", "then", "there", "these",
-       "they", "this", "to", "was", "will", "with"
-     ]
-
-     # Builds an analyzer which removes words in the provided array.
-     def initialize(stop_words = ENGLISH_STOP_WORDS)
-       @stop_words = stop_words
-     end
-
-     # Filters LowerCaseTokenizer with StopFilter.
-     def token_stream(field, string)
-       return StopFilter.new(LowerCaseTokenizer.new(string), @stop_words)
-     end
-   end
-
-   # An Analyzer that filters StandardTokenizer with LowerCaseFilter and
-   # StopFilter. This analyzer subclasses StopAnalyzer so you can add your
-   # own stop list the same way. See StopAnalyzer.
-   class StandardAnalyzer < StopAnalyzer
-     def token_stream(field, string)
-       return StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(string)), @stop_words)
-     end
-   end
-
-
-   # This analyzer is used to facilitate scenarios where different
-   # fields require different analysis techniques. Use #add_analyzer
-   # to add a non-default analyzer on a field name basis.
-   # See tc_per_field_analyzer_wrapper for example usage.
-   class PerFieldAnalyzerWrapper < Analyzer
-
-     # Constructs with the default analyzer.
-     #
-     # default_analyzer:: Any fields not specifically defined to use a
-     #                    different analyzer will use the one provided here.
-     def initialize(default_analyzer)
-       @default_analyzer = default_analyzer
-       @analyzers = {}
-     end
-
-     # Defines an analyzer to use for the specified field.
-     #
-     # field:: field name requiring a non-default analyzer.
-     # analyzer:: non-default analyzer to use for the field
-     def add_analyzer(field, analyzer)
-       @analyzers[field] = analyzer
-     end
-
-     def token_stream(field, string)
-       analyzer = @analyzers[field]
-       if (analyzer == nil)
-         analyzer = @default_analyzer
-       end
-
-       return analyzer.token_stream(field, string)
-     end
-   end
- end
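
For reference, here is a minimal sketch of how the PerFieldAnalyzerWrapper in this deleted file was driven, using only the methods shown above; the field name and input text are illustrative, and the snippet assumes the pre-0.10 pure-Ruby classes are loaded:

    include Ferret::Analysis

    # Whitespace-only analysis for the "title" field; every other field
    # falls back to the default StandardAnalyzer.
    analyzer = PerFieldAnalyzerWrapper.new(StandardAnalyzer.new)
    analyzer.add_analyzer("title", WhiteSpaceAnalyzer.new)

    # token_stream looks the analyzer up by field name, then delegates.
    ts = analyzer.token_stream("title", "Ferret 0.10.0 Release Notes")
    while token = ts.next
      puts token.text
    end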
data/lib/ferret/analysis/standard_tokenizer.rb DELETED
@@ -1,71 +0,0 @@
- if __FILE__ == $0
-   module Ferret
-   end
-   $:.unshift File.dirname(__FILE__)
-   require 'token_stream'
-   require 'tokenizers'
-   require 'token'
- end
-
- module Ferret::Analysis
-   # The standard tokenizer is an advanced tokenizer which tokenizes most
-   # words correctly as well as tokenizing things like email addresses, web
-   # addresses, phone numbers, etc.
-
-   class StandardTokenizer < RegExpTokenizer
-     ALPHA = /[[:alpha:]_-]+/
-     APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
-     ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
-     P = /[_\/.,-]/
-     HASDIGIT = /\w*\d\w*/
-     TOKEN_RE = /#{ALPHA}+(('#{ALPHA}+)+
-                 |\.(#{ALPHA}\.)+
-                 |(@|\&)\w+([-.]\w+)*
-                 |:\/\/\w+([-.\/]\w+)*
-                 )
-                |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
-                 |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
-                 |(\.\w+)+
-                 |
-                 )
-               /x
-
-     ACRONYM_WORD = /^#{ACRONYM}$/
-     APOSTROPHE_WORD = /^#{APOSTROPHE}$/
-     DOT = /\./
-     APOSTROPHE_S = /'[sS]$/
-     protected
-
-     # Collects only characters which are not spaces, tabs or carriage returns
-     def token_re()
-       #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
-       # This is a simplified version of the original Lucene standard
-       # tokenizer. I think it works better. I hope so anyway. Any way to
-       # do this more neatly?
-       TOKEN_RE
-     end
-
-     # stem the 's and remove the '.'s from acronyms
-     def normalize(str)
-       if str =~ ACRONYM_WORD
-         str.gsub!(DOT, '')
-       elsif str =~ APOSTROPHE_WORD
-         str.gsub!(APOSTROPHE_S, '')
-       end
-       str
-     end
-   end
- end
-
- # Add this so we can play around with the standard tokenizer
- if __FILE__ == $0
-   st = "\033[7m"
-   en = "\033[m"
-
-   $stdin.each do |line|
-     stk = Ferret::Analysis::StandardTokenizer.new(line)
-     while tk = stk.next()
-       puts " <" + tk.text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
-     end
-   end
- end
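
The normalize step above strips the dots from acronyms and a trailing 's from possessives after TOKEN_RE has matched the raw token. A short sketch of the expected behaviour, inferred from the ACRONYM_WORD and APOSTROPHE_S patterns rather than verified against a released gem:

    tokenizer = Ferret::Analysis::StandardTokenizer.new("I.B.M. O'Reilly's")
    while token = tokenizer.next
      puts token.text   # expected: "IBM", then "O'Reilly"
    end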
data/lib/ferret/analysis/token.rb DELETED
@@ -1,100 +0,0 @@
- module Ferret::Analysis
-   # A Token is an occurrence of a term from the text of a field. It consists
-   # of a term's text, the start and end offset of the term in the text of the
-   # field, and a type string.
-   #
-   # The start and end offsets permit applications to re-associate a token with
-   # its source text, e.g., to display highlighted query terms in a document
-   # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
-   # display, etc.
-   #
-   # The type is an interned string, assigned by a lexical analyzer (a.k.a.
-   # tokenizer), naming the lexical or syntactic class that the token belongs
-   # to. For example an end of sentence marker token might be implemented with
-   # type "eos". The default token type is "word".
-   #
-   # start_offset:: is the position of the first character corresponding to
-   #                this token in the source text
-   # end_offset:: is equal to one greater than the position of the last
-   #              character corresponding to this token. Note that the
-   #              difference between @end_offset and @start_offset may not be
-   #              equal to @text.length(), as the term text may have been
-   #              altered by a stemmer or some other filter.
-   class Token
-     include Comparable
-     attr_accessor :text
-     attr_reader :pos_inc, :start_offset, :end_offset, :type
-
-     # Constructs a Token with the given term text, and start & end offsets.
-     # The type defaults to "word."
-     def initialize(txt, so, eo, pos_inc=1, typ="word")
-       @text = txt
-       @start_offset = so
-       @end_offset = eo
-       @type = typ # lexical type
-       @pos_inc = pos_inc
-     end
-
-     def set!(txt, so, eo)
-       @text = txt
-       @start_offset = so
-       @end_offset = eo
-       self
-     end
-
-     def eql?(o)
-       return (o.instance_of?(Token) and @start_offset == o.start_offset and
-               @end_offset == o.end_offset and @text == o.text)
-     end
-     alias :== :eql?
-
-     # Tokens are sorted by the position in the text at which they occur, i.e.
-     # the start_offset. If two tokens have the same start offset (see
-     # pos_inc=), they are sorted by the end_offset and then lexically by the
-     # token text.
-     def <=>(o)
-       r = @start_offset <=> o.start_offset
-       return r if r != 0
-       r = @end_offset <=> o.end_offset
-       return r if r != 0
-       r = @text <=> o.text
-       return r
-     end
-
-     # Set the position increment. This determines the position of this token
-     # relative to the previous Token in a TokenStream, used in phrase
-     # searching.
-     #
-     # The default value is one.
-     #
-     # Some common uses for this are:
-     #
-     # * Set it to zero to put multiple terms in the same position. This is
-     #   useful if, e.g., a word has multiple stems. Searches for phrases
-     #   including either stem will match. In this case, all but the first
-     #   stem's increment should be set to zero: the increment of the first
-     #   instance should be one. Repeating a token with an increment of zero
-     #   can also be used to boost the scores of matches on that token.
-     #
-     # * Set it to values greater than one to inhibit exact phrase matches.
-     #   If, for example, one does not want phrases to match across removed
-     #   stop words, then one could build a stop word filter that removes stop
-     #   words and also sets the increment to the number of stop words removed
-     #   before each non-stop word. Then exact phrase queries will only match
-     #   when the terms occur with no intervening stop words.
-     def pos_inc=(pos_inc)
-       if (pos_inc < 0)
-         raise ArgumentError, "Increment must be zero or greater: " + pos_inc.to_s
-       end
-       @pos_inc = pos_inc
-     end
-
-     # Returns a string representation of the token with all the attributes.
-     def to_s
-       buf = "#{text}:#{start_offset}->#{end_offset}"
-       buf << "(pos_inc=#{@pos_inc})" if (@pos_inc != 1)
-       buf << "(type=#{@type})" if (@type != "word")
-       buf
-     end
-   end
- end
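
The pos_inc documentation above is easiest to see with stacked synonyms. A minimal sketch using only the Token API from this deleted file; the words and offsets are illustrative:

    include Ferret::Analysis

    # "fast" is stacked on the same position as "quick" (pos_inc = 0), so a
    # phrase query for either "quick fox" or "fast fox" matches the same text.
    t1 = Token.new("quick", 0, 5)       # pos_inc defaults to 1
    t2 = Token.new("fast", 0, 5, 0)     # zero increment: same position as t1
    t3 = Token.new("fox", 6, 9)

    puts t1    # => quick:0->5
    puts t2    # => fast:0->5(pos_inc=0)
    puts t3    # => fox:6->9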