ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,74 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
-
4
- class SegmentInfosTest < Test::Unit::TestCase
5
- include Ferret::Index
6
-
7
- def setup()
8
- @dir = Ferret::Store::RAMDirectory.new
9
- end
10
-
11
- def tear_down()
12
- @dir.close()
13
- end
14
-
15
- def test_read_write
16
- assert_equal(0, SegmentInfos.read_current_version(@dir))
17
- sis = SegmentInfos.new()
18
- seg0 = SegmentInfo.new('seg0', 5, @dir)
19
- seg1 = SegmentInfo.new('seg1', 5, @dir)
20
- seg2 = SegmentInfo.new('seg2', 5, @dir)
21
- seg3 = SegmentInfo.new('seg3', 5, @dir)
22
- sis << seg0
23
- sis << seg1
24
- sis << seg2
25
- assert_equal(sis.size(), 3)
26
- assert_equal(sis[0], seg0)
27
- assert_equal(sis[2], seg2)
28
- sis.write(@dir)
29
- version = SegmentInfos.read_current_version(@dir)
30
- assert(@dir.exists?('segments'))
31
- sis2 = SegmentInfos.new()
32
- sis2.read(@dir)
33
- assert_equal(sis2.size(), 3)
34
- assert_equal(sis2[0], seg0)
35
- assert_equal(sis2[2], seg2)
36
- sis2 << seg3
37
- sis2.write(@dir)
38
- assert_equal(version + 1, SegmentInfos.read_current_version(@dir))
39
- sis3 = SegmentInfos.new()
40
- sis3.read(@dir)
41
- assert_equal(sis3.size(), 4)
42
- assert_equal(sis2[0], seg0)
43
- assert_equal(sis2[3], seg3)
44
- end
45
- end
46
-
47
- class SegmentInfoTest < Test::Unit::TestCase
48
- include Ferret::Index
49
-
50
- def setup()
51
- @dir = Ferret::Store::RAMDirectory.new
52
- end
53
-
54
- def tear_down()
55
- @dir.close()
56
- end
57
-
58
- # just test getters and setters. Nothing else.
59
- def test_segment_info
60
- si = SegmentInfo.new("seg1", 0, @dir)
61
- assert_equal(si.directory, @dir)
62
- assert_equal(si.doc_count, 0)
63
- assert_equal(si.name, "seg1")
64
- @dir.close()
65
- @dpath = File.dirname(__FILE__) + '/../../temp/fsdir'
66
- @dir = Ferret::Store::FSDirectory.new(@dpath, true)
67
- si.name = "seg2"
68
- si.doc_count += 2
69
- si.directory = @dir
70
- assert_equal(si.directory, @dir)
71
- assert_equal(si.doc_count, 2)
72
- assert_equal(si.name, "seg2")
73
- end
74
- end
@@ -1,17 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
- class SegmentTermDocEnumTest < Test::Unit::TestCase
4
-
5
- include Ferret::Index
6
- include Ferret::Analysis
7
-
8
- def setup()
9
- @dir = Ferret::Store::RAMDirectory.new
10
- @doc = IndexTestHelper.prepare_document()
11
- IndexTestHelper.write_document(@dir, @doc)
12
- end
13
-
14
- def test_something()
15
- assert true
16
- end
17
- end
@@ -1,60 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
-
4
- class SegmentTermEnumTest < Test::Unit::TestCase
5
- include Ferret::Index
6
-
7
- TEST_SEGMENT = "_test"
8
-
9
- def setup()
10
- @dir = Ferret::Store::RAMDirectory.new
11
- end
12
-
13
- def test_initialize()
14
- fis = FieldInfos.new
15
- fis.add("author", true, true)
16
- fis.add("title", true, true)
17
- tiw = TermInfosWriter.new(@dir, TEST_SEGMENT, fis, 128)
18
- terms = [ Term.new("author", "Martel"),
19
- Term.new("title", "Life of Pi"),
20
- Term.new("author", "Martin"),
21
- Term.new("title", "Life on the edge") ].sort
22
- term_infos = []
23
- 4.times {|i| term_infos << TermInfo.new(i,i,i,0)}
24
- 4.times {|i| tiw.add(terms[i], term_infos[i]) }
25
- tiw.close()
26
-
27
- tis_file = @dir.open_input(TEST_SEGMENT + ".tis")
28
-
29
- ste = SegmentTermEnum.new(tis_file, fis, false)
30
- assert_equal(128, ste.index_interval)
31
- assert_equal(16, ste.skip_interval)
32
- assert_equal(4, ste.size)
33
- assert(ste.next?)
34
- assert_equal(terms[0], ste.term)
35
- assert_equal(term_infos[0], ste.term_info)
36
- ti = TermInfo.new
37
- ste.term_info = ti
38
- assert_equal(term_infos[0], ti)
39
- assert(ste.next?)
40
- assert_equal(terms[0], ste.prev)
41
- assert_equal(terms[1], ste.term)
42
- assert_equal(term_infos[1], ste.term_info)
43
- assert(ste.next?)
44
- assert_equal(terms[2], ste.term)
45
- assert_equal(term_infos[2], ste.term_info)
46
- assert(ste.next?)
47
- assert_equal(terms[3], ste.term)
48
- assert_equal(term_infos[3], ste.term_info)
49
- ste.close()
50
-
51
- tii_file = @dir.open_input(TEST_SEGMENT + ".tii")
52
-
53
- ste = SegmentTermEnum.new(tii_file, fis, false)
54
- assert_equal(128, ste.index_interval)
55
- assert_equal(16, ste.skip_interval)
56
- assert_equal(1, ste.size)
57
- assert(ste.next?)
58
- assert(Term.new("", ""), ste.term)
59
- end
60
- end
@@ -1,71 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
-
4
- class SegmentTermVectorTest < Test::Unit::TestCase
5
-
6
- include Ferret::Index
7
-
8
- def setup()
9
- @terms = ["Apples", "Oranges", "Bananas", "Kiwis", "Mandarins"]
10
- term_freqs = [4,2,1,12,4]
11
- @stv = SegmentTermVector.new("Fruits", @terms, term_freqs)
12
- end
13
-
14
- def test_size()
15
- assert_equal(@terms.size(), @stv.size())
16
- end
17
-
18
- def test_index_of()
19
- assert_equal(0, @stv.index_of("Apples"))
20
- assert_equal(4, @stv.freqs[@stv.index_of("Apples")])
21
- end
22
-
23
- def test_indexes_of()
24
- assert_equal([2, 0, 3], @stv.indexes_of(["Bananas", "Apples", "Kiwis"], 0, 3))
25
- assert_equal([0, 3], @stv.indexes_of(["Bananas", "Apples", "Kiwis"], 1, 2))
26
- end
27
- end
28
-
29
- class SegmentTermVectorWithPosOffsetsTest < Test::Unit::TestCase
30
-
31
- include Ferret::Index
32
-
33
- def setup()
34
- @terms = ["Apples", "Oranges", "Bananas", "Kiwis", "Mandarins"]
35
- term_freqs = [4,2,1,12,4]
36
- term_positions = [
37
- [1,3,5,7],
38
- [2,4],
39
- [6],
40
- [8,9,10,12,13,14,16,17,18,20,21,22],
41
- [11,15,19,23]
42
- ]
43
- term_offsets = [
44
- [[1,4],[10,14],[20,24],[30,34]],
45
- [[5,9],[15,19]],
46
- [[25,29]],
47
- [[35,39],[40,44],[45,49],[55,59],[60,64],[65,69],[75,79],[80,84],[85,89],[95,99],[100,104],[105,109]],
48
- [[50,54],[70,74],[90,94],[110,114]]
49
- ]
50
- @stv = SegmentTermVector.new("Fruits", @terms, term_freqs, term_positions, term_offsets)
51
- end
52
-
53
- def test_size()
54
- assert_equal(@terms.size(), @stv.size())
55
- end
56
-
57
- def test_index_of()
58
- assert_equal(0, @stv.index_of("Apples"))
59
- assert_equal(4, @stv.freqs[@stv.index_of("Apples")])
60
- end
61
-
62
- def test_indexes_of()
63
- assert_equal([2, 0, 3], @stv.indexes_of(["Bananas", "Apples", "Kiwis"], 0, 3))
64
- assert_equal([0, 3], @stv.indexes_of(["Bananas", "Apples", "Kiwis"], 1, 2))
65
- end
66
-
67
- def test_positions_offsets()
68
- assert_equal([1,3,5,7], @stv.positions[@stv.index_of("Apples")])
69
- assert_equal([[35,39],[40,44],[45,49],[55,59],[60,64],[65,69],[75,79],[80,84],[85,89],[95,99],[100,104],[105,109]], @stv.offsets[@stv.index_of("Kiwis")])
70
- end
71
- end
@@ -1,57 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
-
4
- class TermBufferTest < Test::Unit::TestCase
5
- include Ferret::Index
6
- def test_term_set()
7
- t = Term.new("title", "Ferret Tutorial")
8
- tb = TermBuffer.new
9
- tb.term = t
10
- assert_equal(t.field, tb.field)
11
- assert_equal("Ferret Tutorial", tb.text)
12
- assert_equal("Ferret Tutorial".length, tb.text_length)
13
- assert_equal(t, tb.term)
14
- end
15
-
16
- def test_set()
17
- tb = TermBuffer.new
18
- tb.term = Term.new("title", "Ferret Tutorial")
19
- tb2 = TermBuffer.new
20
- tb2.set!(tb)
21
- assert_equal(tb.field, tb2.field)
22
- assert_equal("Ferret Tutorial", tb2.text)
23
- assert_equal("Ferret Tutorial".length, tb2.text_length)
24
- assert_equal(tb.term, tb2.term)
25
- end
26
-
27
- def test_compare()
28
- tb1 = TermBuffer.new
29
- tb2 = TermBuffer.new
30
- tb1.term = Term.new("alpha", "text")
31
- tb2.term = Term.new("bravo", "text")
32
- assert(tb1 < tb2)
33
- tb2.term = Term.new("alpha", "text")
34
- assert(tb1 == tb2)
35
- tb2.term = Term.new("alpha", "tex")
36
- assert(tb1 > tb2)
37
- end
38
-
39
- def test_read()
40
- dir = Ferret::Store::RAMDirectory.new
41
- fi = FieldInfos.new
42
- tb = TermBuffer.new
43
- tb.term = Term.new("Author", "Dave")
44
- fi.add("Writer", true)
45
- output = dir.create_output("term_buffer_read_test")
46
- output.write_vint(4)
47
- output.write_vint(8)
48
- output.write_chars(" Balmain", 0, 8)
49
- output.write_vint(fi.field_number("Writer"))
50
- output.close
51
- input = dir.open_input("term_buffer_read_test")
52
- tb.read(input, fi)
53
- assert_equal("Dave Balmain", tb.text)
54
- assert_equal("Dave Balmain", tb.term.text)
55
- assert_equal("Writer", tb.field)
56
- end
57
- end
@@ -1,19 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
-
4
- class TermInfoTest < Test::Unit::TestCase
5
- include Ferret::Index
6
- def test_term()
7
- ti1 = TermInfo.new(1, 2, 3, 1)
8
- assert_equal(ti1.doc_freq, 1)
9
- assert_equal(ti1.freq_pointer, 2)
10
- assert_equal(ti1.prox_pointer, 3)
11
- assert_equal(ti1.skip_offset, 1)
12
- ti2 = ti1.clone()
13
- assert(ti1 == ti2)
14
- ti2 = TermInfo.new(10, 9, 8)
15
- assert(ti1 != ti2)
16
- ti2.set!(ti1)
17
- assert(ti1 == ti2)
18
- end
19
- end
@@ -1,192 +0,0 @@
1
- require File.dirname(__FILE__) + "/../../test_helper"
2
-
3
-
4
- class TermInfosIOTest < Test::Unit::TestCase
5
- include Ferret::Index
6
-
7
- DICT = [ "duad", "dual", "dualism", "dualist", "duality", "dualize", "duan",
8
- "duarchy", "dub", "dubber", "dubbin", "dubbing", "dubiety", "dubiosity",
9
- "dubious", "dubiously", "dubiousness", "dubitate", "dubitation", "dubnium",
10
- "dubonnet", "ducal", "ducat", "ducatoon", "duce", "duchess", "duchesse",
11
- "duchy", "duck", "duckbill", "duckboard", "ducker", "duckie", "ducking",
12
- "duckling", "duckpin", "duckshove", "duckshover", "ducktail", "duckwalk",
13
- "duckweed", "ducky", "duct", "ductile", "ductileness", "ductility",
14
- "ducting", "ductless", "ductule", "ductulus", "ductwork", "dud", "dudder",
15
- "duddery", "duddie", "duddy", "dude", "dudeen", "dudgeon", "due",
16
- "duecento", "duel", "dueler", "dueling", "duelist", "dueller", "duelling",
17
- "duellist", "duello", "duende", "dueness", "duenna", "duennaship", "duet",
18
- "duette", "duettino", "duettist", "duetto", "duff", "duffel", "duffer",
19
- "duffle", "dufus", "dug", "dugong", "dugout", "duiker", "duit", "duke",
20
- "dukedom", "dukeling", "dukery", "dukeship", "dulcamara", "dulcet",
21
- "dulcian", "dulciana", "dulcification", "dulcify", "dulcimer", "dulcimore",
22
- "dulcinea", "dulcitone", "dulcorate", "dule", "dulfer", "dulia", "dull",
23
- "dullard", "dullness", "dullsville", "dully", "dulness", "dulocracy",
24
- "dulosis", "dulse", "duly", "duma", "dumaist", "dumb", "dumbass",
25
- "dumbbell", "dumbcane", "dumbfound", "dumbfounder", "dumbhead",
26
- "dumbledore", "dumbly", "dumbness", "dumbo", "dumbstruck", "dumbwaiter",
27
- "dumdum", "dumfound", "dummerer", "dummkopf", "dummy", "dumortierite",
28
- "dump", "dumpbin", "dumpcart", "dumper", "dumpiness", "dumping",
29
- "dumpling", "dumplings", "dumpsite", "dumpster", "dumpy", "dun", "dunam",
30
- "dunce", "dunch", "dunder", "dunderhead", "dunderheadedness", "dunderpate",
31
- "dune", "duneland", "dunfish", "dung", "dungaree", "dungeon", "dungeoner",
32
- "dungheap", "dunghill", "dungy", "dunite", "duniwassal", "dunk", "dunker",
33
- "dunlin", "dunnage", "dunnakin", "dunness", "dunnite", "dunnock", "dunny",
34
- "dunt", "duo", "duodecillion", "duodecimal", "duodecimo", "duodenectomy",
35
- "duodenum", "duolog", "duologue", "duomo", "duopoly", "duopsony",
36
- "duotone", "dup", "dupability", "dupatta", "dupe", "duper", "dupery",
37
- "dupion", "duple", "duplet", "duplex", "duplexer", "duplexity",
38
- "duplicability", "duplicand", "duplicate", "duplication", "duplicator",
39
- "duplicature", "duplicitousness", "duplicity", "dupondius", "duppy",
40
- "dura", "durability", "durable", "durableness", "durably", "dural",
41
- "duralumin", "duramen", "durance", "duration", "durative", "durbar",
42
- "dure", "dures", "duress", "durgan", "durian", "durion", "durmast",
43
- "durn", "durned", "duro", "duroc", "durometer", "durr", "durra", "durrie",
44
- "durukuli", "durum", "durzi", "dusk", "duskiness", "dusky", "dust",
45
- "dustbin", "dustcart", "dustcloth", "dustcover", "duster", "dustheap",
46
- "dustiness", "dusting", "dustless", "dustman", "dustmop", "dustoff",
47
- "dustpan", "dustpanful", "dustrag", "dustsheet", "dustup", "dusty",
48
- "dutch", "dutchman", "duteous", "duteously", "duteousness", "dutiability",
49
- "dutiable", "dutifulness", "duty", "duumvir", "duumvirate", "duvet",
50
- "duvetine", "duvetyn", "duvetyne", "dux", "duyker"]
51
-
52
- TEST_SEGMENT = "_test"
53
-
54
- def setup()
55
- @dir = Ferret::Store::RAMDirectory.new
56
- end
57
-
58
- def tear_down()
59
- @dir.close()
60
- end
61
-
62
- def test_two_field_io
63
- term_dumbly = Term.new("word", "dumbly")
64
- term_dualize = Term.new("word", "dualize")
65
- term_rev_dualize = Term.new("reverse", "ezilaud")
66
-
67
- fis = FieldInfos.new
68
- fis.add("word", true, true)
69
- fis.add("reverse", true, true)
70
- terms = []
71
- term_infos = []
72
- tiw = TermInfosWriter.new(@dir, TEST_SEGMENT+"G", fis, 128)
73
-
74
- reverse_words = []
75
- DICT.each { |word| reverse_words << word.reverse }
76
- reverse_words.sort!
77
- reverse_words.each_with_index do |word, i|
78
- tiw.add(Term.new("reverse", word), TermInfo.new(1, i, i, 0))
79
- end
80
- DICT.each_with_index do |word, i|
81
- tiw.add(Term.new("word", word), TermInfo.new(1, 1000 + i, 1000 + i, 0))
82
- end
83
-
84
- tiw.close()
85
- tir = TermInfosReader.new(@dir, TEST_SEGMENT+"G", fis)
86
- assert_equal(564, tir.size)
87
- assert_equal(16, tir.skip_interval)
88
- assert_equal(561, tir.get_terms_position(Term.new("word", "duvetyne")))
89
- assert_equal(TermInfo.new(1, 1005, 1005, 0), tir.get_term_info(term_dualize))
90
- assert_equal(TermInfo.new(1, 70, 70, 0), tir.get_term_info(term_rev_dualize))
91
- end
92
-
93
- def test_io
94
- term_dumbly = Term.new("word", "dumbly")
95
- term_dualize = Term.new("word", "dualize")
96
-
97
- fis = FieldInfos.new
98
- fis.add("word", true, true)
99
- terms = []
100
- term_infos = []
101
- tiw = TermInfosWriter.new(@dir, TEST_SEGMENT, fis, 128)
102
- DICT.each_with_index do |word, i|
103
- terms << Term.new("word", word)
104
- term_infos << TermInfo.new(1, i, i, 0)
105
- tiw.add(terms[i], term_infos[i])
106
- end
107
- tiw.close()
108
- tir = TermInfosReader.new(@dir, TEST_SEGMENT, fis)
109
- assert_equal(282, tir.size)
110
- assert_equal(16, tir.skip_interval)
111
- assert_equal(281, tir.get_terms_position(Term.new("word", "duyker")))
112
- assert_equal(279, tir.get_terms_position(Term.new("word", "duvetyne")))
113
- assert_equal(254, tir.get_terms_position(Term.new("word", "dusting")))
114
- assert_equal(255, tir.get_terms_position(Term.new("word", "dustless")))
115
- assert_equal(256, tir.get_terms_position(Term.new("word", "dustman")))
116
- assert_equal(257, tir.get_terms_position(Term.new("word", "dustmop")))
117
- assert_equal(TermInfo.new(1, 5, 5, 0), tir.get_term_info(term_dualize))
118
- assert_equal(term_dumbly, tir.get_term(127))
119
- terms = tir.terms_from(term_dumbly)
120
- assert_equal(term_dumbly, terms.term)
121
- assert(terms.next?)
122
- assert_equal(Term.new("word", "dumbness"), terms.term)
123
- assert(terms.next?)
124
- assert_equal(Term.new("word", "dumbo"), terms.term)
125
- end
126
-
127
- def test_small_writer
128
- fis = FieldInfos.new
129
- fis.add("author", true, true)
130
- fis.add("title", true, true)
131
- tiw = TermInfosWriter.new(@dir, TEST_SEGMENT, fis, 128)
132
- terms = [ Term.new("author", "Martel"),
133
- Term.new("title", "Life of Pi"),
134
- Term.new("author", "Martin"),
135
- Term.new("title", "Life on the edge") ].sort
136
- term_infos = []
137
- 4.times {|i| term_infos << TermInfo.new(i,i,i,i)}
138
- 4.times {|i| tiw.add(terms[i], term_infos[i]) }
139
- tiw.close()
140
-
141
- tis_file = @dir.open_input(TEST_SEGMENT + ".tis")
142
- tii_file = @dir.open_input(TEST_SEGMENT + ".tii")
143
- assert_equal(TermInfosWriter::FORMAT, tis_file.read_int())
144
- assert_equal(4, tis_file.read_long()) # term count
145
- assert_equal(128, tis_file.read_int()) # @index_interval
146
- assert_equal(16, tis_file.read_int()) # @skip_interval
147
- assert_equal(0, tis_file.read_vint()) # string_equal length
148
- assert_equal(6, tis_file.read_vint()) # rest of string length
149
- tis_file.read_chars(author = "", 0, 6) # the difference string
150
- assert_equal("Martel", author.to_s)
151
- assert_equal(0, tis_file.read_vint()) # field number
152
- assert_equal(0, tis_file.read_vint()) # doc_freq
153
- assert_equal(0, tis_file.read_vlong()) # freq pointer difference
154
- assert_equal(0, tis_file.read_vlong()) # prox pointer difference
155
- assert_equal(4, tis_file.read_vint()) # string_equal length
156
- assert_equal(2, tis_file.read_vint()) # rest of string length
157
- tis_file.read_chars(author = "", 0, 2) # the difference string
158
- assert_equal("in", author.to_s)
159
- assert_equal(0, tis_file.read_vint()) # field number
160
- assert_equal(1, tis_file.read_vint()) # doc_freq
161
- assert_equal(1, tis_file.read_vlong()) # freq pointer difference
162
- assert_equal(1, tis_file.read_vlong()) # prox pointer difference
163
- assert_equal(0, tis_file.read_vint()) # string_equal length
164
- assert_equal(10, tis_file.read_vint()) # rest of string length
165
- tis_file.read_chars(title = "", 0, 10) # the difference string
166
- assert_equal("Life of Pi", title.to_s)
167
- assert_equal(1, tis_file.read_vint()) # field number
168
- assert_equal(2, tis_file.read_vint()) # doc_freq
169
- assert_equal(1, tis_file.read_vlong()) # freq pointer difference
170
- assert_equal(1, tis_file.read_vlong()) # prox pointer difference
171
- assert_equal(6, tis_file.read_vint()) # string_equal length
172
- assert_equal(10, tis_file.read_vint()) # rest of string length
173
- tis_file.read_chars(title = "", 0, 10) # the difference string
174
- assert_equal("n the edge", title.to_s)
175
- assert_equal(1, tis_file.read_vint()) # field number
176
- assert_equal(3, tis_file.read_vint()) # doc_freq
177
- assert_equal(1, tis_file.read_vlong()) # freq pointer difference
178
- assert_equal(1, tis_file.read_vlong()) # prox pointer difference
179
-
180
- assert_equal(TermInfosWriter::FORMAT, tii_file.read_int())
181
- assert_equal(1, tii_file.read_long())
182
- assert_equal(128, tii_file.read_int())
183
- assert_equal(16, tii_file.read_int())
184
- assert_equal(0, tii_file.read_vint()) # string_equal length
185
- assert_equal(0, tii_file.read_vint()) # rest of string length
186
- assert_equal(0xFFFFFFFF, tii_file.read_vint()) # field number
187
- assert_equal(0, tii_file.read_vint()) # doc_freq
188
- assert_equal(0, tii_file.read_vlong()) # freq pointer difference
189
- assert_equal(0, tii_file.read_vlong()) # prox pointer difference
190
- assert_equal(20, tii_file.read_vlong()) # pointer to first element in other
191
- end
192
- end