ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -0,0 +1,157 @@
1
+ require 'date'
2
+ require 'time'
3
+
4
+ class Float
5
+ # Return true if the float is within +precision+ of the other value +o+. This
6
+ # is used to accommodate for floating point errors.
7
+ #
8
+ # o:: value to compare with
9
+ # precision:: the precision to use in the comparison.
10
+ # return:: true if the match is within +precision+
11
+ def =~(o, precision = 0.0000000001)
12
+ return (1 - self/o).abs < precision
13
+ end
14
+ end
15
+
16
+ # Provides support for converting integers to Strings, and back again. The
17
+ # strings are structured so that lexicographic sorting order is preserved.
18
+ #
19
+ # That is, if integer1 is less than integer2 for any two integers integer1 and
20
+ # integer2, then integer1.to_s_lex is lexicographically less than
21
+ # integer2.to_s_lex. (Similarly for "greater than" and "equals".)
22
+ #
23
+ # This class handles numbers between - 10 ** 10,000 and 10 ** 10,000
24
+ # which should cover all practical numbers. If you need bigger numbers,
25
+ # increase Integer::LEN_STR_SIZE.
26
+ class Integer
27
+ # LEN_SIZE of 4 should handle most numbers that can practically be held in
28
+ # memory.
29
+ LEN_STR_SIZE = 4
30
+ NEG_LEN_MASK = 10 ** LEN_STR_SIZE
31
+ LEN_STR_TEMPLATE = "%0#{LEN_STR_SIZE}d"
32
+
33
+ # Convert the number to a lexicographically sortable string. This string will
34
+ # use printable characters only but will not be human readable.
35
+ def to_s_lex
36
+ if (self >= 0)
37
+ num_str = self.to_s
38
+ len_str = LEN_STR_TEMPLATE % num_str.size
39
+ return len_str + num_str
40
+ else
41
+ num = self * -1
42
+ num_str = num.to_s
43
+ num_len = num_str.size
44
+ len_str = LEN_STR_TEMPLATE % (NEG_LEN_MASK - num_len)
45
+ num = (10 ** num_str.size) - num
46
+ return "-#{len_str}%0#{num_len}d" % num
47
+ end
48
+ end
49
+
50
+ # Convert the number to a lexicographically sortable string by padding with
51
+ # 0s. You should make sure that you set the width to a number large enough to
52
+ # accommodate all possible values. Also note that this method will not work
53
+ # with negative numbers. That is negative numbers will sort in the opposite
54
+ # direction as positive numbers. If you have very large numbers or a mix of
55
+ # positive and negative numbers you should use the Integer#to_s_lex method
56
+ #
57
+ # width:: number of characters in the string returned. Default is 10. So
58
+ # 123.to_s_pad(5) => 00123 and -123.to_s_pad(5) => -0123
59
+ # return:: padding string representation of the number.
60
+ def to_s_pad(width = 10)
61
+ "%#{width}d" % self
62
+ end
63
+ end
64
+
65
+ class Date
66
+ # Convert the Date to a lexicographically sortable string with the required
67
+ # precision. The format used is %Y%m%d
68
+ #
69
+ # precision:: the precision required in the string version of the date. The
70
+ # options are :year, :month and :day
71
+ # return:: a lexicographically sortable string representing the date
72
+ def to_s_lex(precision = :day)
73
+ self.strftime(Time::LEX_FORMAT[precision])
74
+ end
75
+ end
76
+
77
+ class DateTime
78
+ # Convert the DateTime to a lexicographically sortable string with the
79
+ # required precision. The format used is %Y%m%d %H:%M:%S.
80
+ #
81
+ # precision:: the precision required in the string version of the date. The
82
+ # options are :year, :month, :day, :hour, :minute and :second
83
+ # return:: a lexicographically sortable string representing the date
84
+ def to_s_lex(precision = :day)
85
+ self.strftime(Time::LEX_FORMAT[precision])
86
+ end
87
+ end
88
+
89
+ class Time
90
+ LEX_FORMAT = {
91
+ :year => "%Y",
92
+ :month => "%Y-%m",
93
+ :day => "%Y-%m-%d",
94
+ :hour => "%Y-%m-%d %H",
95
+ :minute => "%Y-%m-%d %H:%M",
96
+ :second => "%Y-%m-%d %H:%M:%S",
97
+ :millisecond => "%Y-%m-%d %H:%M:%S"
98
+ }
99
+
100
+ # Convert the Time to a lexicographically sortable string with the required
101
+ # precision. The format used is %Y%m%d %H:%M:%S.
102
+ #
103
+ # precision:: the precision required in the string version of the time. The
104
+ # options are :year, :month, :day, :hour, :minute and :second
105
+ # return:: a lexicographically sortable string representing the date
106
+ def to_s_lex(precision = :day)
107
+ self.strftime(LEX_FORMAT[precision])
108
+ end
109
+ end
110
+
111
+ class String
112
+ # Convert a string to an integer. This method will only work on strings that
113
+ # were previously created with Integer#to_s_lex, otherwise the result will be
114
+ # unpredictable.
115
+ def to_i_lex
116
+ if (self[0] == ?-)
117
+ return self[(Integer::LEN_STR_SIZE + 1)..-1].to_i -
118
+ 10 ** (self.size - Integer::LEN_STR_SIZE - 1)
119
+ else
120
+ return self[Integer::LEN_STR_SIZE..-1].to_i
121
+ end
122
+ end
123
+
124
+ # Convert a string to a Time. This method will only work on strings that
125
+ # match the format %Y%m%d %H%M%S, otherwise the result will be unpredictable.
126
+ def to_time_lex
127
+ vals = []
128
+ self.gsub(/(?:^|[- :])(\d+)/) {vals << $1.to_i; $&}
129
+ Time.mktime(*vals)
130
+ end
131
+
132
+ # Convert a string to a Date. This method will only work on strings that
133
+ # match the format %Y%m%d %H%M%S, otherwise the result will be unpredictable.
134
+ def to_date_lex
135
+ return Date.strptime(self + "-02-01", "%Y-%m-%d")
136
+ end
137
+
138
+ # Convert a string to a DateTime. This method will only work on strings that
139
+ # match the format %Y%m%d %H%M%S, otherwise the result will be unpredictable.
140
+ def to_date_time_lex
141
+ return DateTime.strptime(self + "-01-01", "%Y-%m-%d %H:%M:%S")
142
+ end
143
+
144
+ private
145
+
146
+ def get_lex_format(len)
147
+ case len
148
+ when 0.. 3: ""
149
+ when 4.. 5: "%Y"
150
+ when 6.. 7: "%Y%m"
151
+ when 8.. 9: "%Y%m%d"
152
+ when 10..11: "%Y%m%d%H"
153
+ when 12..13: "%Y%m%d%H%M"
154
+ else "%Y%m%d%H%M%S"
155
+ end
156
+ end
157
+ end
@@ -0,0 +1,3 @@
1
+ module Ferret
2
+ VERSION = '0.9.4'
3
+ end
data/test/test_helper.rb CHANGED
@@ -9,20 +9,12 @@ class Float
9
9
  end
10
10
 
11
11
  require 'test/unit'
12
- require 'unit/index/th_doc'
13
- if $ferret_pure_ruby
14
- require 'rferret'
15
- else
16
- require 'ferret'
17
- end
12
+ require 'ferret'
13
+ require 'unit/index/th_doc' if (defined?(IndexTestHelper).nil?)
14
+
18
15
 
19
16
  def load_test_dir(dir)
20
- dir = File.join(File.dirname(__FILE__), dir)
21
- Dir.foreach(dir) do |file|
22
- if $ferret_pure_ruby
23
- require File.join(dir, file) if file =~ /^r?t[mcs]_.*\.rb$/
24
- else
25
- require File.join(dir, file) if file =~ /^c?t[mcs]_.*\.rb$/
26
- end
17
+ Dir[File.join(File.dirname(__FILE__), dir, "t[scm]*.rb")].each do |file|
18
+ require file
27
19
  end
28
20
  end
@@ -3,10 +3,11 @@ require File.dirname(__FILE__) + "/../../test_helper"
3
3
  class AnalyzerTest < Test::Unit::TestCase
4
4
  include Ferret::Analysis
5
5
 
6
- def test_analyzer()
6
+ def test_c_analyzer()
7
7
  input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
8
8
  a = Analyzer.new()
9
9
  t = a.token_stream("fieldname", input)
10
+ t2 = a.token_stream("fieldname", input)
10
11
  assert_equal(Token.new("dbalmain", 0, 8), t.next())
11
12
  assert_equal(Token.new("gmail", 9, 14), t.next())
12
13
  assert_equal(Token.new("com", 15, 18), t.next())
@@ -16,5 +17,516 @@ class AnalyzerTest < Test::Unit::TestCase
16
17
  assert_equal(Token.new("mail", 27, 31), t.next())
17
18
  assert_equal(Token.new("address", 39, 46), t.next())
18
19
  assert(! t.next())
20
+ assert_equal(Token.new("dbalmain", 0, 8), t2.next())
21
+ assert_equal(Token.new("gmail", 9, 14), t2.next())
22
+ assert_equal(Token.new("com", 15, 18), t2.next())
23
+ assert_equal(Token.new("is", 19, 21), t2.next())
24
+ assert_equal(Token.new("my", 22, 24), t2.next())
25
+ assert_equal(Token.new("e", 25, 26), t2.next())
26
+ assert_equal(Token.new("mail", 27, 31), t2.next())
27
+ assert_equal(Token.new("address", 39, 46), t2.next())
28
+ assert(! t2.next())
29
+ a = Analyzer.new(false)
30
+ t = a.token_stream("fieldname", input)
31
+ assert_equal(Token.new("DBalmain", 0, 8), t.next())
32
+ assert_equal(Token.new("gmail", 9, 14), t.next())
33
+ assert_equal(Token.new("com", 15, 18), t.next())
34
+ assert_equal(Token.new("is", 19, 21), t.next())
35
+ assert_equal(Token.new("My", 22, 24), t.next())
36
+ assert_equal(Token.new("E", 25, 26), t.next())
37
+ assert_equal(Token.new("Mail", 27, 31), t.next())
38
+ assert_equal(Token.new("ADDRESS", 39, 46), t.next())
39
+ assert(! t.next())
40
+ end
41
+ end
42
+
43
+ class AsciiLetterAnalyzerTest < Test::Unit::TestCase
44
+ include Ferret::Analysis
45
+
46
+ def test_c_letter_analyzer()
47
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
48
+ a = AsciiLetterAnalyzer.new()
49
+ t = a.token_stream("fieldname", input)
50
+ t2 = a.token_stream("fieldname", input)
51
+ assert_equal(Token.new("dbalmain", 0, 8), t.next())
52
+ assert_equal(Token.new("gmail", 9, 14), t.next())
53
+ assert_equal(Token.new("com", 15, 18), t.next())
54
+ assert_equal(Token.new("is", 19, 21), t.next())
55
+ assert_equal(Token.new("my", 22, 24), t.next())
56
+ assert_equal(Token.new("e", 25, 26), t.next())
57
+ assert_equal(Token.new("mail", 27, 31), t.next())
58
+ assert_equal(Token.new("address", 39, 46), t.next())
59
+ assert(! t.next())
60
+ assert_equal(Token.new("dbalmain", 0, 8), t2.next())
61
+ assert_equal(Token.new("gmail", 9, 14), t2.next())
62
+ assert_equal(Token.new("com", 15, 18), t2.next())
63
+ assert_equal(Token.new("is", 19, 21), t2.next())
64
+ assert_equal(Token.new("my", 22, 24), t2.next())
65
+ assert_equal(Token.new("e", 25, 26), t2.next())
66
+ assert_equal(Token.new("mail", 27, 31), t2.next())
67
+ assert_equal(Token.new("address", 39, 46), t2.next())
68
+ assert(! t2.next())
69
+ a = AsciiLetterAnalyzer.new(false)
70
+ t = a.token_stream("fieldname", input)
71
+ assert_equal(Token.new("DBalmain", 0, 8), t.next())
72
+ assert_equal(Token.new("gmail", 9, 14), t.next())
73
+ assert_equal(Token.new("com", 15, 18), t.next())
74
+ assert_equal(Token.new("is", 19, 21), t.next())
75
+ assert_equal(Token.new("My", 22, 24), t.next())
76
+ assert_equal(Token.new("E", 25, 26), t.next())
77
+ assert_equal(Token.new("Mail", 27, 31), t.next())
78
+ assert_equal(Token.new("ADDRESS", 39, 46), t.next())
79
+ assert(! t.next())
80
+ end
81
+ end
82
+
83
+ class LetterAnalyzerTest < Test::Unit::TestCase
84
+ include Ferret::Analysis
85
+
86
+ def test_c_letter_analyzer()
87
+ Ferret.locale = ""
88
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
89
+ a = LetterAnalyzer.new(false)
90
+ t = a.token_stream("fieldname", input)
91
+ t2 = a.token_stream("fieldname", input)
92
+ assert_equal(Token.new("DBalmän", 0, 8), t.next)
93
+ assert_equal(Token.new("gmail", 9, 14), t.next)
94
+ assert_equal(Token.new("com", 15, 18), t.next)
95
+ assert_equal(Token.new("is", 19, 21), t.next)
96
+ assert_equal(Token.new("My", 22, 24), t.next)
97
+ assert_equal(Token.new("e", 25, 26), t.next)
98
+ assert_equal(Token.new("mail", 27, 31), t.next)
99
+ assert_equal(Token.new("address", 40, 47), t.next)
100
+ assert_equal(Token.new("ÁÄGÇ", 55, 62), t.next)
101
+ assert_equal(Token.new("ÊËÌ", 64, 70), t.next)
102
+ assert_equal(Token.new("ÚØÃ", 72, 78), t.next)
103
+ assert_equal(Token.new("ÖÎÍ", 80, 86), t.next)
104
+ assert(! t.next())
105
+ assert_equal(Token.new("DBalmän", 0, 8), t2.next)
106
+ assert_equal(Token.new("gmail", 9, 14), t2.next)
107
+ assert_equal(Token.new("com", 15, 18), t2.next)
108
+ assert_equal(Token.new("is", 19, 21), t2.next)
109
+ assert_equal(Token.new("My", 22, 24), t2.next)
110
+ assert_equal(Token.new("e", 25, 26), t2.next)
111
+ assert_equal(Token.new("mail", 27, 31), t2.next)
112
+ assert_equal(Token.new("address", 40, 47), t2.next)
113
+ assert_equal(Token.new("ÁÄGÇ", 55, 62), t2.next)
114
+ assert_equal(Token.new("ÊËÌ", 64, 70), t2.next)
115
+ assert_equal(Token.new("ÚØÃ", 72, 78), t2.next)
116
+ assert_equal(Token.new("ÖÎÍ", 80, 86), t2.next)
117
+ assert(! t2.next())
118
+ a = LetterAnalyzer.new()
119
+ t = a.token_stream("fieldname", input)
120
+ assert_equal(Token.new("dbalmän", 0, 8), t.next)
121
+ assert_equal(Token.new("gmail", 9, 14), t.next)
122
+ assert_equal(Token.new("com", 15, 18), t.next)
123
+ assert_equal(Token.new("is", 19, 21), t.next)
124
+ assert_equal(Token.new("my", 22, 24), t.next)
125
+ assert_equal(Token.new("e", 25, 26), t.next)
126
+ assert_equal(Token.new("mail", 27, 31), t.next)
127
+ assert_equal(Token.new("address", 40, 47), t.next)
128
+ assert_equal(Token.new("áägç", 55, 62), t.next)
129
+ assert_equal(Token.new("êëì", 64, 70), t.next)
130
+ assert_equal(Token.new("úøã", 72, 78), t.next)
131
+ assert_equal(Token.new("öîí", 80, 86), t.next)
132
+ assert(! t.next())
133
+ end
134
+ end
135
+
136
+ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
137
+ include Ferret::Analysis
138
+
139
+ def test_c_white_space_analyzer()
140
+ input = 'DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#@$'
141
+ a = AsciiWhiteSpaceAnalyzer.new()
142
+ t = a.token_stream("fieldname", input)
143
+ t2 = a.token_stream("fieldname", input)
144
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
145
+ assert_equal(Token.new('is', 19, 21), t.next)
146
+ assert_equal(Token.new('My', 22, 24), t.next)
147
+ assert_equal(Token.new('E-Mail', 25, 31), t.next)
148
+ assert_equal(Token.new('52', 32, 34), t.next)
149
+ assert_equal(Token.new('#$', 37, 39), t.next)
150
+ assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
151
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
152
+ assert(! t.next())
153
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t2.next)
154
+ assert_equal(Token.new('is', 19, 21), t2.next)
155
+ assert_equal(Token.new('My', 22, 24), t2.next)
156
+ assert_equal(Token.new('E-Mail', 25, 31), t2.next)
157
+ assert_equal(Token.new('52', 32, 34), t2.next)
158
+ assert_equal(Token.new('#$', 37, 39), t2.next)
159
+ assert_equal(Token.new('ADDRESS.', 40, 48), t2.next)
160
+ assert_equal(Token.new('23#@$', 49, 54), t2.next)
161
+ assert(! t2.next())
162
+ a = AsciiWhiteSpaceAnalyzer.new(true)
163
+ t = a.token_stream("fieldname", input)
164
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
165
+ assert_equal(Token.new('is', 19, 21), t.next)
166
+ assert_equal(Token.new('my', 22, 24), t.next)
167
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
168
+ assert_equal(Token.new('52', 32, 34), t.next)
169
+ assert_equal(Token.new('#$', 37, 39), t.next)
170
+ assert_equal(Token.new('address.', 40, 48), t.next)
171
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
172
+ assert(! t.next())
173
+ end
174
+ end
175
+
176
+ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
177
+ include Ferret::Analysis
178
+
179
+ def test_c_white_space_analyzer()
180
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
181
+ a = WhiteSpaceAnalyzer.new()
182
+ t = a.token_stream("fieldname", input)
183
+ t2 = a.token_stream("fieldname", input)
184
+ assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
185
+ assert_equal(Token.new('is', 19, 21), t.next)
186
+ assert_equal(Token.new('My', 22, 24), t.next)
187
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
188
+ assert_equal(Token.new('52', 32, 34), t.next)
189
+ assert_equal(Token.new('#$', 37, 39), t.next)
190
+ assert_equal(Token.new('address.', 40, 48), t.next)
191
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
192
+ assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t.next)
193
+ assert(! t.next())
194
+ assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t2.next)
195
+ assert_equal(Token.new('is', 19, 21), t2.next)
196
+ assert_equal(Token.new('My', 22, 24), t2.next)
197
+ assert_equal(Token.new('e-mail', 25, 31), t2.next)
198
+ assert_equal(Token.new('52', 32, 34), t2.next)
199
+ assert_equal(Token.new('#$', 37, 39), t2.next)
200
+ assert_equal(Token.new('address.', 40, 48), t2.next)
201
+ assert_equal(Token.new('23#@$', 49, 54), t2.next)
202
+ assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t2.next)
203
+ assert(! t2.next())
204
+ a = WhiteSpaceAnalyzer.new(true)
205
+ t = a.token_stream("fieldname", input)
206
+ assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
207
+ assert_equal(Token.new('is', 19, 21), t.next)
208
+ assert_equal(Token.new('my', 22, 24), t.next)
209
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
210
+ assert_equal(Token.new('52', 32, 34), t.next)
211
+ assert_equal(Token.new('#$', 37, 39), t.next)
212
+ assert_equal(Token.new('address.', 40, 48), t.next)
213
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
214
+ assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
215
+ assert(! t.next())
216
+ end
217
+ end
218
+
219
+ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
220
+ include Ferret::Analysis
221
+
222
+ def test_c_standard_analyzer()
223
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
224
+ a = AsciiStandardAnalyzer.new()
225
+ t = a.token_stream("fieldname", input)
226
+ t2 = a.token_stream("fieldname", input)
227
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
228
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
229
+ assert_equal(Token.new('52', 32, 34), t.next)
230
+ assert_equal(Token.new('address', 40, 47), t.next)
231
+ assert_equal(Token.new('23', 49, 51), t.next)
232
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
233
+ assert_equal(Token.new('tnt', 86, 91), t.next)
234
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
235
+ assert(! t.next())
236
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t2.next)
237
+ assert_equal(Token.new('e-mail', 25, 31), t2.next)
238
+ assert_equal(Token.new('52', 32, 34), t2.next)
239
+ assert_equal(Token.new('address', 40, 47), t2.next)
240
+ assert_equal(Token.new('23', 49, 51), t2.next)
241
+ assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
242
+ assert_equal(Token.new('tnt', 86, 91), t2.next)
243
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
244
+ assert(! t2.next())
245
+ a = AsciiStandardAnalyzer.new(false)
246
+ t = a.token_stream("fieldname", input)
247
+ t2 = a.token_stream("fieldname", input)
248
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
249
+ assert_equal(Token.new('My', 22, 24), t.next)
250
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
251
+ assert_equal(Token.new('52', 32, 34), t.next)
252
+ assert_equal(Token.new('Address', 40, 47), t.next)
253
+ assert_equal(Token.new('23', 49, 51), t.next)
254
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
255
+ assert_equal(Token.new('TNT', 86, 91), t.next)
256
+ assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
257
+ assert(! t.next())
258
+ end
259
+ end
260
+
261
+ class StandardAnalyzerTest < Test::Unit::TestCase
262
+ include Ferret::Analysis
263
+
264
+ def test_c_standard_analyzer()
265
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
266
+ a = StandardAnalyzer.new()
267
+ t = a.token_stream("fieldname", input)
268
+ t2 = a.token_stream("fieldname", input)
269
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
270
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
271
+ assert_equal(Token.new('address', 40, 47), t.next)
272
+ assert_equal(Token.new('23', 49, 51), t.next)
273
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
274
+ assert_equal(Token.new('tnt', 86, 91), t.next)
275
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
276
+ assert_equal(Token.new('23', 111, 113), t.next)
277
+ assert_equal(Token.new('áägç', 117, 124), t.next)
278
+ assert_equal(Token.new('êëì', 126, 132), t.next)
279
+ assert_equal(Token.new('úøã', 134, 140), t.next)
280
+ assert_equal(Token.new('öîí', 142, 148), t.next)
281
+ assert(! t.next())
282
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t2.next)
283
+ assert_equal(Token.new('e-mail', 25, 31), t2.next)
284
+ assert_equal(Token.new('address', 40, 47), t2.next)
285
+ assert_equal(Token.new('23', 49, 51), t2.next)
286
+ assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
287
+ assert_equal(Token.new('tnt', 86, 91), t2.next)
288
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
289
+ assert_equal(Token.new('23', 111, 113), t2.next)
290
+ assert_equal(Token.new('áägç', 117, 124), t2.next)
291
+ assert_equal(Token.new('êëì', 126, 132), t2.next)
292
+ assert_equal(Token.new('úøã', 134, 140), t2.next)
293
+ assert_equal(Token.new('öîí', 142, 148), t2.next)
294
+ assert(! t2.next())
295
+ a = StandardAnalyzer.new(nil, false)
296
+ t = a.token_stream("fieldname", input)
297
+ assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
298
+ assert_equal(Token.new('My', 22, 24), t.next)
299
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
300
+ assert_equal(Token.new('Address', 40, 47), t.next)
301
+ assert_equal(Token.new('23', 49, 51), t.next)
302
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
303
+ assert_equal(Token.new('TNT', 86, 91), t.next)
304
+ assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
305
+ assert_equal(Token.new('23', 111, 113), t.next)
306
+ assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
307
+ assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
308
+ assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
309
+ assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
310
+ assert(! t.next())
311
+ a = StandardAnalyzer.new(["e-mail", "23", "tnt"])
312
+ t = a.token_stream("fieldname", input)
313
+ t2 = a.token_stream("fieldname", input)
314
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
315
+ assert_equal(Token.new('is', 19, 21), t.next)
316
+ assert_equal(Token.new('my', 22, 24), t.next)
317
+ assert_equal(Token.new('and', 32, 35), t.next)
318
+ assert_equal(Token.new('the', 36, 39), t.next)
319
+ assert_equal(Token.new('address', 40, 47), t.next)
320
+ assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
321
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
322
+ assert_equal(Token.new('áägç', 117, 124), t.next)
323
+ assert_equal(Token.new('êëì', 126, 132), t.next)
324
+ assert_equal(Token.new('úøã', 134, 140), t.next)
325
+ assert_equal(Token.new('öîí', 142, 148), t.next)
326
+ assert(! t.next())
327
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t2.next)
328
+ assert_equal(Token.new('is', 19, 21), t2.next)
329
+ assert_equal(Token.new('my', 22, 24), t2.next)
330
+ assert_equal(Token.new('and', 32, 35), t2.next)
331
+ assert_equal(Token.new('the', 36, 39), t2.next)
332
+ assert_equal(Token.new('address', 40, 47), t2.next)
333
+ assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
334
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
335
+ assert_equal(Token.new('áägç', 117, 124), t2.next)
336
+ assert_equal(Token.new('êëì', 126, 132), t2.next)
337
+ assert_equal(Token.new('úøã', 134, 140), t2.next)
338
+ assert_equal(Token.new('öîí', 142, 148), t2.next)
339
+ assert(! t2.next())
340
+ end
341
+ end
342
+
343
+ class PerFieldAnalyzerTest < Test::Unit::TestCase
344
+ include Ferret::Analysis
345
+ def test_c_per_field_analyzer()
346
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ address. 23#@$'
347
+ pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
348
+ pfa['white'] = WhiteSpaceAnalyzer.new(false)
349
+ pfa['white_l'] = WhiteSpaceAnalyzer.new(true)
350
+ pfa['letter'] = LetterAnalyzer.new(false)
351
+ pfa.add_field('letter', LetterAnalyzer.new(true))
352
+ pfa.add_field('letter_u', LetterAnalyzer.new(false))
353
+ t = pfa.token_stream('white', input)
354
+ assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
355
+ assert_equal(Token.new('is', 19, 21), t.next)
356
+ assert_equal(Token.new('My', 22, 24), t.next)
357
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
358
+ assert_equal(Token.new('52', 32, 34), t.next)
359
+ assert_equal(Token.new('#$', 37, 39), t.next)
360
+ assert_equal(Token.new('address.', 40, 48), t.next)
361
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
362
+ assert(! t.next())
363
+ t = pfa.token_stream('white_l', input)
364
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
365
+ assert_equal(Token.new('is', 19, 21), t.next)
366
+ assert_equal(Token.new('my', 22, 24), t.next)
367
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
368
+ assert_equal(Token.new('52', 32, 34), t.next)
369
+ assert_equal(Token.new('#$', 37, 39), t.next)
370
+ assert_equal(Token.new('address.', 40, 48), t.next)
371
+ assert_equal(Token.new('23#@$', 49, 54), t.next)
372
+ assert(! t.next())
373
+ t = pfa.token_stream('letter_u', input)
374
+ assert_equal(Token.new('DBalmain', 0, 8), t.next)
375
+ assert_equal(Token.new('gmail', 9, 14), t.next)
376
+ assert_equal(Token.new('com', 15, 18), t.next)
377
+ assert_equal(Token.new('is', 19, 21), t.next)
378
+ assert_equal(Token.new('My', 22, 24), t.next)
379
+ assert_equal(Token.new('e', 25, 26), t.next)
380
+ assert_equal(Token.new('mail', 27, 31), t.next)
381
+ assert_equal(Token.new('address', 40, 47), t.next)
382
+ assert(! t.next())
383
+ t = pfa.token_stream('letter', input)
384
+ assert_equal(Token.new('dbalmain', 0, 8), t.next)
385
+ assert_equal(Token.new('gmail', 9, 14), t.next)
386
+ assert_equal(Token.new('com', 15, 18), t.next)
387
+ assert_equal(Token.new('is', 19, 21), t.next)
388
+ assert_equal(Token.new('my', 22, 24), t.next)
389
+ assert_equal(Token.new('e', 25, 26), t.next)
390
+ assert_equal(Token.new('mail', 27, 31), t.next)
391
+ assert_equal(Token.new('address', 40, 47), t.next)
392
+ assert(! t.next())
393
+ t = pfa.token_stream('XXX', input) # should use default StandardAnalzyer
394
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
395
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
396
+ assert_equal(Token.new('52', 32, 34), t.next)
397
+ assert_equal(Token.new('address', 40, 47), t.next)
398
+ assert_equal(Token.new('23', 49, 51), t.next)
399
+ assert(! t.next())
400
+ end
401
+ end
402
+
403
+ class RegExpAnalyzerTest < Test::Unit::TestCase
404
+ include Ferret::Analysis
405
+
406
+ def test_reg_exp_analyzer()
407
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
408
+ a = RegExpAnalyzer.new()
409
+ t = a.token_stream('XXX', input)
410
+ t2 = a.token_stream('XXX', "one_Two three")
411
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
412
+ assert_equal(Token.new('is', 19, 21), t.next)
413
+ assert_equal(Token.new('my', 22, 24), t.next)
414
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
415
+ assert_equal(Token.new('52', 32, 34), t.next)
416
+ assert_equal(Token.new('address', 40, 47), t.next)
417
+ assert_equal(Token.new('23', 49, 51), t.next)
418
+ assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
419
+ assert_equal(Token.new('t.n.t.', 91, 97), t.next)
420
+ assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
421
+ assert_equal(Token.new('23', 116, 118), t.next)
422
+ assert_equal(Token.new('rob\'s', 119, 124), t.next)
423
+ assert(! t.next())
424
+ t = t2
425
+ assert_equal(Token.new("one_two", 0, 7), t.next())
426
+ assert_equal(Token.new("three", 8, 13), t.next())
427
+ assert(! t.next())
428
+ a = RegExpAnalyzer.new(/\w{2,}/, false)
429
+ t = a.token_stream('XXX', input)
430
+ t2 = a.token_stream('XXX', "one Two three")
431
+ assert_equal(Token.new('DBalmain', 0, 8), t.next)
432
+ assert_equal(Token.new('gmail', 9, 14), t.next)
433
+ assert_equal(Token.new('com', 15, 18), t.next)
434
+ assert_equal(Token.new('is', 19, 21), t.next)
435
+ assert_equal(Token.new('My', 22, 24), t.next)
436
+ assert_equal(Token.new('mail', 27, 31), t.next)
437
+ assert_equal(Token.new('52', 32, 34), t.next)
438
+ assert_equal(Token.new('Address', 40, 47), t.next)
439
+ assert_equal(Token.new('23', 49, 51), t.next)
440
+ assert_equal(Token.new('http', 55, 59), t.next)
441
+ assert_equal(Token.new('www', 62, 65), t.next)
442
+ assert_equal(Token.new('google', 66, 72), t.next)
443
+ assert_equal(Token.new('com', 73, 76), t.next)
444
+ assert_equal(Token.new('RESULT_3', 77, 85), t.next)
445
+ assert_equal(Token.new('html', 86, 90), t.next)
446
+ assert_equal(Token.new('123', 98, 101), t.next)
447
+ assert_equal(Token.new('1235', 102, 106), t.next)
448
+ assert_equal(Token.new('ASD', 107, 110), t.next)
449
+ assert_equal(Token.new('1234', 111, 115), t.next)
450
+ assert_equal(Token.new('23', 116, 118), t.next)
451
+ assert_equal(Token.new('Rob', 119, 122), t.next)
452
+ assert(! t.next())
453
+ assert_equal(Token.new("one", 0, 3), t2.next())
454
+ assert_equal(Token.new("Two", 4, 7), t2.next())
455
+ assert_equal(Token.new("three", 8, 13), t2.next())
456
+ assert(! t2.next())
457
+ a = RegExpAnalyzer.new() do |str|
458
+ if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
459
+ str.gsub!(/\./, '')
460
+ elsif str =~ /'[sS]$/
461
+ str.gsub!(/'[sS]$/, '')
462
+ end
463
+ str
464
+ end
465
+ t = a.token_stream('XXX', input)
466
+ t2 = a.token_stream('XXX', "one's don't T.N.T.")
467
+ assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
468
+ assert_equal(Token.new('is', 19, 21), t.next)
469
+ assert_equal(Token.new('my', 22, 24), t.next)
470
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
471
+ assert_equal(Token.new('52', 32, 34), t.next)
472
+ assert_equal(Token.new('address', 40, 47), t.next)
473
+ assert_equal(Token.new('23', 49, 51), t.next)
474
+ assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
475
+ assert_equal(Token.new('tnt', 91, 97), t.next)
476
+ assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
477
+ assert_equal(Token.new('23', 116, 118), t.next)
478
+ assert_equal(Token.new('rob', 119, 124), t.next)
479
+ assert(! t.next())
480
+ assert_equal(Token.new("one", 0, 5), t2.next())
481
+ assert_equal(Token.new("don't", 6, 11), t2.next())
482
+ assert_equal(Token.new("tnt", 12, 18), t2.next())
483
+ assert(! t2.next())
484
+ end
485
+ end
486
+
487
+ module Ferret::Analysis
488
+ class StemmingStandardAnalyzer < StandardAnalyzer
489
+ def token_stream(field, text)
490
+ StemFilter.new(super)
491
+ end
492
+ end
493
+ end
494
+
495
+ class CustomAnalyzerTest < Test::Unit::TestCase
496
+ include Ferret::Analysis
497
+
498
+ def test_custom_filter()
499
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
500
+ a = StemmingStandardAnalyzer.new()
501
+ t = a.token_stream("fieldname", input)
502
+ assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
503
+ assert_equal(Token.new('e-mail', 25, 31), t.next)
504
+ assert_equal(Token.new('address', 40, 47), t.next)
505
+ assert_equal(Token.new('23', 49, 51), t.next)
506
+ assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
507
+ assert_equal(Token.new('tnt', 86, 91), t.next)
508
+ assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
509
+ assert_equal(Token.new('23', 111, 113), t.next)
510
+ assert_equal(Token.new('áägç', 117, 124), t.next)
511
+ assert_equal(Token.new('êëì', 126, 132), t.next)
512
+ assert_equal(Token.new('úøã', 134, 140), t.next)
513
+ assert_equal(Token.new('öîí', 142, 148), t.next)
514
+ assert(! t.next())
515
+ input = "Debate Debates DEBATED DEBating Debater";
516
+ t = a.token_stream("fieldname", input)
517
+ assert_equal(Token.new("debat", 0, 6), t.next)
518
+ assert_equal(Token.new("debat", 7, 14), t.next)
519
+ assert_equal(Token.new("debat", 15, 22), t.next)
520
+ assert_equal(Token.new("debat", 23, 31), t.next)
521
+ assert_equal(Token.new("debat", 32, 39), t.next)
522
+ assert(! t.next())
523
+ input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
524
+ t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
525
+ assert_equal(Token.new("dêbate", 0, 7), t.next)
526
+ assert_equal(Token.new("dêbate", 8, 16), t.next)
527
+ assert_equal(Token.new("dêbate", 17, 25), t.next)
528
+ assert_equal(Token.new("dêbate", 26, 35), t.next)
529
+ assert_equal(Token.new("dêbater", 36, 44), t.next)
530
+ assert(! t.next())
19
531
  end
20
532
  end