ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/MIT-LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2005 David Balmain
1
+ Copyright (c) 2005-2006 David Balmain
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README CHANGED
@@ -8,7 +8,7 @@ search for things in them later.
8
8
  == Requirements
9
9
 
10
10
  * Ruby 1.8
11
- * (C compiler to build the extension but not required to use Ferret)
11
+ * C compiler to build the extension. Tested with gcc, VC6 and VC2005
12
12
 
13
13
  == Installation
14
14
 
@@ -26,7 +26,7 @@ Run the following;
26
26
  $ rake ext
27
27
  $ ruby setup.rb config
28
28
  $ ruby setup.rb setup
29
- # sudo ruby setup.rb install
29
+ # ruby setup.rb install
30
30
 
31
31
  These simple steps install ferret in the default location of Ruby libraries.
32
32
  You can also install files into your favorite directory by supplying setup.rb
@@ -53,10 +53,8 @@ documentation.
53
53
  abilities of Ferret to present your data the best way you see fit.
54
54
 
55
55
  * Ferret::Document: to find out how to create documents. This part of Ferret
56
- is relatively straightforward. The main thing that we haven't gone into here
57
- is the use of term vectors. These allow you to store and retrieve the
58
- positions and offsets of the data which can be very useful in document
59
- comparison amoung other things. == More information
56
+ is relatively straightforward. If you know how Strings, Hashes and Arrays work
57
+ in Ferret then you'll be able to create Documents.
60
58
 
61
59
  * Ferret::QueryParser: if you want to find out more about what you can do with
62
60
  Ferret's Query Parser, this is the place to look. The query parser is one
@@ -71,17 +69,8 @@ documentation.
71
69
 
72
70
  === Performance
73
71
 
74
- Currently Ferret is an order of magnitude slower than Java Lucene which can be
75
- quite a pain at times. I have written some basic C extensions which may or may
76
- not have installed when you installed Ferret. These double the speed but still
77
- leave it a lot slower than the Java version. I have, however, ported the
78
- indexing part of Java Lucene to C and it is an order of magnitude faster then
79
- the Java version. Once I'm pretty certain that the API of Ferret has settled
80
- and won't be changing much, I'll intergrate my C version. So expect to see
81
- Ferret running faster than Java Lucene some time in the future. If you'd like
82
- to try cferret and test my claims, let me know (if you haven't already found
83
- it in my subversion repository). It's not currently portable and will probably
84
- only run on linux.
72
+ We are unaware of any alternatives that can out-perform Ferret while still
73
+ matching it in features.
85
74
 
86
75
  == Contact
87
76
 
@@ -89,17 +78,16 @@ For bug reports and patches I have set up Trac here;
89
78
 
90
79
  http://ferret.davebalmain.com/trac
91
80
 
92
- Queries, discussion etc should be addressed to the forum or mailing lists hosted
93
- at;
81
+ Queries, discussion etc should be addressed to the mailing lists here;
94
82
 
95
83
  http://rubyforge.org/projects/ferret/
96
84
 
97
- Alternatively you could create a new page for discussion on the wiki at my Trac
98
- page above. Or, if you're shy, please feel free to email me directly at dbalmain@gmail.com
85
+ Alternatively you could create a new page for discussion on the Ferret wiki;
99
86
 
100
- Of course, since Ferret is almost a straight port of Java Lucene,
101
- everything said about Lucene at http://jakarta.apache.org/lucene/ should
102
- be true about Ferret. Apart from the bits about it being in Java.
87
+ http://ferret.davebalmain.com/trac
88
+
89
+ Of course, since Ferret was ported from Apache Lucene, most of what you can
90
+ do with Lucene you can also do with Ferret.
103
91
 
104
92
  == Authors
105
93
 
data/Rakefile CHANGED
@@ -8,8 +8,7 @@ require 'rake'
8
8
  require 'rake/testtask'
9
9
  require 'rake/rdoctask'
10
10
  require 'rake/clean'
11
- require 'rake_utils/code_statistics'
12
- require 'lib/rferret'
11
+ require 'ferret_version'
13
12
 
14
13
  begin
15
14
  require 'rubygems'
@@ -29,18 +28,18 @@ def announce(msg='')
29
28
  STDERR.puts msg
30
29
  end
31
30
 
32
- $VERBOSE = nil
33
-
34
31
  EXT = "ferret_ext.so"
35
- EXT_SRC = FileList["src/**/*.[ch]"]
36
- if (/mswin/ =~ RUBY_PLATFORM)
37
- EXT_SRC.delete('src/io/nix_io.c')
38
- end
32
+ EXT_SRC = FileList["../c/src/*.[c]", "../c/include/*.h",
33
+ "../c/lib/libstemmer_c/src_c/*.[ch]",
34
+ "../c/lib/libstemmer_c/runtime/*.[ch]",
35
+ "../c/lib/libstemmer_c/libstemmer/*.[ch]",
36
+ "../c/lib/libstemmer_c/include/libstemmer.h"]
39
37
 
40
38
  EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
41
39
  SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
42
40
 
43
- CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles', '.config'])
41
+ CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles',
42
+ '.config', 'ext/cferret.c'])
44
43
  CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
45
44
  POLISH = Rake::FileList.new.include(FileList['**/*.so'], 'ext/Makefile')
46
45
 
@@ -49,69 +48,53 @@ task :polish => [:clean] do
49
48
  POLISH.each { |fn| rm_r fn rescue nil }
50
49
  end
51
50
 
51
+ desc "Run tests with Valgrind"
52
+ task :valgrind do
53
+ sh "valgrind --gen-suppressions=yes --suppressions=ferret_valgrind.supp " +
54
+ "--leak-check=yes --show-reachable=yes -v ruby test/test_all.rb"
55
+ #sh "valgrind --suppressions=ferret_valgrind.supp " +
56
+ # "--leak-check=yes --show-reachable=yes -v ruby test/unit/index/tc_index_reader.rb"
57
+ end
58
+
52
59
  task :default => :test_all
53
- desc "Run all tests"
54
- task :test_all => [ :test_runits, :test_cunits, :test_functional ]
60
+ #task :default => :ext do
61
+ # sh "ruby test/unit/index/tc_index.rb"
62
+ #end
55
63
 
56
- desc "Generate API documentation, and show coding stats"
57
- task :doc => [ :stats, :appdoc ]
64
+ desc "Run all tests"
65
+ task :test_all => [ :test_units ]
58
66
 
59
- desc "run unit tests in test/unit for pure ruby ferret"
60
- Rake::TestTask.new("test_runits" => :parsers) do |t|
61
- t.ruby_opts = ["-r 'lib/rferret'"]
62
- t.libs << "test/unit"
63
- t.pattern = 'test/unit/ts_*.rb'
64
- t.verbose = true
65
- end
66
-
67
- desc "run unit tests in test/unit for C ferret"
68
- Rake::TestTask.new("test_cunits" => :ext) do |t|
69
- t.libs << "test/unit"
70
- t.pattern = 'test/unit/ts_*.rb'
71
- t.verbose = true
72
- end
67
+ desc "Generate API documentation"
68
+ task :doc => [ :appdoc ]
73
69
 
74
70
  desc "run unit tests in test/unit"
75
- Rake::TestTask.new("test_long") do |t|
76
- t.libs << "test"
71
+ Rake::TestTask.new("test_units" => :ext) do |t|
77
72
  t.libs << "test/unit"
78
- t.test_files = FileList["test/longrunning/tm_store.rb"]
79
73
  t.pattern = 'test/unit/t[cs]_*.rb'
74
+ #t.pattern = 'test/unit/search/tc_index_searcher.rb'
80
75
  t.verbose = true
81
76
  end
82
77
 
83
- desc "run funtional tests in test/funtional"
84
- Rake::TestTask.new("test_functional") do |t|
85
- t.libs << "test"
86
- t.pattern = 'test/funtional/tc_*.rb'
87
- t.verbose = true
88
- end
89
-
90
- desc "Report code statistics (KLOCS, etc) from application"
91
- task :stats do
92
- CodeStatistics.new(
93
- ["Ferret", "lib/ferret"],
94
- ["Units", "test/unit"],
95
- ["Units-extended", "test/longrunning"]
96
- ).to_s
97
- end
98
-
99
78
  desc "Generate documentation for the application"
100
79
  rd = Rake::RDocTask.new("appdoc") do |rdoc|
101
80
  rdoc.rdoc_dir = 'doc/api'
102
81
  rdoc.title = "Ferret Search Library Documentation"
103
- rdoc.options << '--line-numbers --inline-source'
82
+ rdoc.options << '--line-numbers'
83
+ rdoc.options << '--inline-source'
84
+ rdoc.options << '--charset=utf-8'
104
85
  rdoc.rdoc_files.include('README')
105
86
  rdoc.rdoc_files.include('TODO')
106
87
  rdoc.rdoc_files.include('TUTORIAL')
107
88
  rdoc.rdoc_files.include('MIT-LICENSE')
108
89
  rdoc.rdoc_files.include('lib/**/*.rb')
90
+ rdoc.rdoc_files.include('ext/r_*.c')
91
+ rdoc.rdoc_files.include('ext/ferret.c')
109
92
  end
110
93
 
111
94
  EXT_SRC.each do |fn|
112
95
  dest_fn = File.join("ext", File.basename(fn))
113
96
  file dest_fn => fn do |t|
114
- cp fn, dest_fn
97
+ ln_s File.join("..", fn), dest_fn
115
98
  if fn =~ /stemmer/
116
99
  # flatten the directory structure for lib_stemmer
117
100
  open(dest_fn) do |in_f|
@@ -129,7 +112,7 @@ task :ext => ["ext/#{EXT}"] + SRC
129
112
 
130
113
  file "ext/#{EXT}" => ["ext/Makefile"] do
131
114
  cp "ext/inc/lang.h", "ext/lang.h"
132
- cp "ext/inc/except.h", "ext/except.h"
115
+ cp "ext/inc/threading.h", "ext/threading.h"
133
116
  cd "ext"
134
117
  if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
135
118
  sh "nmake"
@@ -140,10 +123,12 @@ file "ext/#{EXT}" => ["ext/Makefile"] do
140
123
  end
141
124
 
142
125
  file "ext/lang.h" => ["ext/inc/lang.h"] do
126
+ rm_f "ext/lang.h"
143
127
  cp "ext/inc/lang.h", "ext/lang.h"
144
128
  end
145
- file "ext/except.h" => ["ext/inc/except.h"] do
146
- cp "ext/inc/except.h", "ext/except.h"
129
+ file "ext/threading.h" => ["ext/inc/threading.h"] do
130
+ rm_f "ext/threading.h"
131
+ cp "ext/inc/threading.h", "ext/threading.h"
147
132
  end
148
133
 
149
134
  file "ext/Makefile" => SRC do
@@ -175,7 +160,6 @@ PKG_FILES = FileList[
175
160
  'Rakefile'
176
161
  ]
177
162
  PKG_FILES.exclude('**/*.o')
178
- PKG_FILES.include('ext/termdocs.c')
179
163
  PKG_FILES.exclude('**/Makefile')
180
164
  PKG_FILES.exclude('ext/ferret_ext.so')
181
165
 
@@ -213,6 +197,7 @@ else
213
197
  s.require_path = 'lib' # Use these for libraries.
214
198
  s.autorequire = 'ferret'
215
199
 
200
+
216
201
  #s.bindir = "bin" # Use these for applications.
217
202
  #s.executables = ["rake"]
218
203
  #s.default_executable = "rake"
@@ -319,11 +304,10 @@ task :update_version => [:prerelease] do
319
304
  else
320
305
  announce "Updating Ferret version to #{PKG_VERSION}"
321
306
  reversion("lib/ferret.rb")
322
- reversion("lib/rferret.rb")
323
307
  if ENV['RELTEST']
324
308
  announce "Release Task Testing, skipping commiting of new version"
325
309
  else
326
- sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
310
+ sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
327
311
  end
328
312
  end
329
313
  end
data/TODO CHANGED
@@ -1,17 +1,14 @@
1
- = Ferret Project -- To Do List
2
-
3
- Send suggestions for this list to mailto:dbalmain@gmail.com
4
-
5
- === To Do
6
-
7
- * Make a dll for people on Windows
8
- * fix rb_obj_as_string to StringValue()
9
- * pure ruby ConstantScoreQuery
10
-
11
- === Done
12
-
13
- * Add the ability to persist an in memory index to Ferret::Index::Index
14
- * Add UTF-8 support
15
- * Multi Field Query
16
- * Test threading
17
- * Compile a proper dummy executable
1
+ = TODO
2
+
3
+ * user defined sorting
4
+ * add field compression
5
+ * Fix highlighting to work for compressed fields
6
+ * Fix highlighting to work for external fields
7
+ * Add Ferret::Index::Index
8
+
9
+ = Done
10
+ * Add string Sort descriptor
11
+ * fix memory bug
12
+ * add MultiReader interface
13
+ * add lexicographical sort (byte sort)
14
+ * Add highlighting
data/ext/analysis.c CHANGED
@@ -1,90 +1,95 @@
1
1
  #include "analysis.h"
2
2
  #include "hash.h"
3
- #include "libstemmer.h"
3
+ #include <libstemmer.h>
4
4
  #include <string.h>
5
5
  #include <ctype.h>
6
6
  #include <wctype.h>
7
7
  #include <wchar.h>
8
8
 
9
-
10
9
  /****************************************************************************
11
10
  *
12
11
  * Token
13
12
  *
14
13
  ****************************************************************************/
15
14
 
16
- Token *tk_create()
15
+ inline Token *tk_set(Token *tk,
16
+ char *text, int tlen, int start, int end, int pos_inc)
17
17
  {
18
- return ALLOC(Token);
19
- }
20
-
21
- void tk_destroy(void *p)
22
- {
23
- free(p);
18
+ if (tlen >= MAX_WORD_SIZE) {
19
+ tlen = MAX_WORD_SIZE - 1;
20
+ }
21
+ memcpy(tk->text, text, sizeof(char) * tlen);
22
+ tk->text[tlen] = '\0';
23
+ tk->len = tlen;
24
+ tk->start = start;
25
+ tk->end = end;
26
+ tk->pos_inc = pos_inc;
27
+ return tk;
24
28
  }
25
29
 
26
- inline Token *tk_set(Token *tk,
27
- char *text,
28
- int tlen,
29
- int start,
30
- int end,
31
- int pos_inc)
30
+ inline Token *tk_set_ts(Token *tk,
31
+ char *start, char *end, char *text, int pos_inc)
32
32
  {
33
- if (tlen >= MAX_WORD_SIZE) {
34
- tlen = MAX_WORD_SIZE - 1;
35
- }
36
- memcpy(tk->text, text, sizeof(char) * tlen);
37
- tk->text[tlen] = '\0';
38
- tk->start = start;
39
- tk->end = end;
40
- tk->pos_inc = pos_inc;
41
- return tk;
33
+ return tk_set(tk, start, (int)(end - start),
34
+ (int)(start - text), (int)(end - text), pos_inc);
42
35
  }
43
36
 
44
- inline Token *tk_set_ts(Token *tk,
45
- char *start,
46
- char *end,
47
- char *text,
48
- int pos_inc)
37
+ inline Token *tk_set_no_len(Token *tk,
38
+ char *text, int start, int end, int pos_inc)
49
39
  {
50
- return tk_set(tk, start, (int)(end - start),
51
- (int)(start - text), (int)(end - text), pos_inc);
40
+ return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
52
41
  }
53
42
 
54
- inline Token *tk_set_no_len(Token *tk,
55
- char *text,
56
- int start,
57
- int end,
58
- int pos_inc)
43
+ inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
44
+ int pos_inc)
59
45
  {
60
- return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
46
+ int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
47
+ tk->text[len] = '\0';
48
+ tk->len = len;
49
+ tk->start = start;
50
+ tk->end = end;
51
+ tk->pos_inc = pos_inc;
52
+ return tk;
61
53
  }
62
54
 
63
55
  int tk_eq(Token *tk1, Token *tk2)
64
56
  {
65
- return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
66
- tk1->start == tk2->start && tk1->end == tk2->end);
57
+ return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
58
+ tk1->start == tk2->start && tk1->end == tk2->end);
67
59
  }
68
60
 
69
61
  int tk_cmp(Token *tk1, Token *tk2)
70
62
  {
71
- int cmp;
72
- if (tk1->start > tk2->start) {
73
- cmp = 1;
74
- } else if (tk1->start < tk2->start) {
75
- cmp = -1;
76
- } else {
77
- if (tk1->end > tk2->end) {
78
- cmp = 1;
79
- } else if (tk1->end < tk2->end) {
80
- cmp = -1;
81
- } else {
82
- cmp = strcmp((char *)tk1->text, (char *)tk2->text);
63
+ int cmp;
64
+ if (tk1->start > tk2->start) {
65
+ cmp = 1;
66
+ }
67
+ else if (tk1->start < tk2->start) {
68
+ cmp = -1;
83
69
  }
84
- }
85
- return cmp;
70
+ else {
71
+ if (tk1->end > tk2->end) {
72
+ cmp = 1;
73
+ }
74
+ else if (tk1->end < tk2->end) {
75
+ cmp = -1;
76
+ }
77
+ else {
78
+ cmp = strcmp((char *)tk1->text, (char *)tk2->text);
79
+ }
80
+ }
81
+ return cmp;
82
+ }
83
+
84
+ void tk_destroy(void *p)
85
+ {
86
+ free(p);
86
87
  }
87
88
 
89
+ Token *tk_new()
90
+ {
91
+ return ALLOC(Token);
92
+ }
88
93
 
89
94
  /****************************************************************************
90
95
  *
@@ -92,92 +97,94 @@ int tk_cmp(Token *tk1, Token *tk2)
92
97
  *
93
98
  ****************************************************************************/
94
99
 
95
- void ts_deref(void *p)
100
+ void ts_deref(TokenStream *ts)
96
101
  {
97
- TokenStream *ts = (TokenStream *)p;
98
- if (--ts->ref_cnt <= 0) ts->destroy(ts);
102
+ if (--ts->ref_cnt <= 0) {
103
+ ts->destroy_i(ts);
104
+ }
99
105
  }
100
106
 
101
- void ts_standard_destroy(TokenStream *ts)
107
+ static TokenStream *ts_reset(TokenStream *ts, char *text)
102
108
  {
103
- tk_destroy(ts->token);
104
- free(ts);
109
+ ts->t = ts->text = text;
110
+ return ts;
105
111
  }
106
112
 
107
- void ts_reset(TokenStream *ts, char *text)
113
+ TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
108
114
  {
109
- ts->t = ts->text = text;
115
+ TokenStream *ts = (TokenStream *)ecalloc(size);
116
+ memcpy(ts, orig_ts, size);
117
+ ts->ref_cnt = 1;
118
+ return ts;
110
119
  }
111
120
 
112
- TokenStream *ts_create()
121
+ TokenStream *ts_new_i(size_t size)
113
122
  {
114
- TokenStream *ts = ALLOC_AND_ZERO_N(TokenStream, 1);
115
- ts->token = tk_create();
116
- ts->destroy = &ts_standard_destroy;
117
- ts->reset = &ts_reset;
118
- ts->ref_cnt = 1;
119
- return ts;
123
+ TokenStream *ts = ecalloc(size);
124
+
125
+ ts->destroy_i = (void (*)(TokenStream *))&free;
126
+ ts->reset = &ts_reset;
127
+ ts->ref_cnt = 1;
128
+
129
+ return ts;
120
130
  }
121
131
 
122
- TokenStream *ts_clone(TokenStream *orig_ts)
132
+ /****************************************************************************
133
+ * CachedTokenStream
134
+ ****************************************************************************/
135
+
136
+ #define CTS(token_stream) ((CachedTokenStream *)(token_stream))
137
+
138
+ static TokenStream *cts_clone_i(TokenStream *orig_ts)
123
139
  {
124
- TokenStream *ts = ALLOC(TokenStream);
125
- memcpy(ts, orig_ts, sizeof(TokenStream));
126
- if (orig_ts->token) {
127
- ts->token = ALLOC(Token);
128
- memcpy(ts->token, orig_ts->token, sizeof(Token));
129
- }
130
- if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
131
- if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
132
- ts->ref_cnt = 1;
133
- return ts;
140
+ return ts_clone_size(orig_ts, sizeof(CachedTokenStream));
134
141
  }
135
142
 
136
- /* * Multi-byte TokenStream * */
137
- static char * const ENC_ERR_MSG = "Error decoding input string. "
138
- "Check that you have the locale set correctly";
139
- #define MB_NEXT_CHAR \
140
- if ((i = (int)mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
141
- RAISE(IO_ERROR, ENC_ERR_MSG)
142
-
143
- inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
143
+ static TokenStream *cts_new()
144
144
  {
145
- tk->text[wcstombs(tk->text, text, MAX_WORD_SIZE - 1)] = '\0';
146
- tk->start = start;
147
- tk->end = end;
148
- tk->pos_inc = pos_inc;
149
- return tk;
145
+ TokenStream *ts = ts_new(CachedTokenStream);
146
+ ts->clone_i = &cts_clone_i;
147
+ return ts;
150
148
  }
151
149
 
152
- void mb_ts_standard_destroy(TokenStream *ts)
150
+ /* * Multi-byte TokenStream * */
151
+
152
+ #define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
153
+
154
+ inline int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
153
155
  {
154
- tk_destroy(ts->token);
155
- free(ts->data);
156
- free(ts);
156
+ int num_bytes;
157
+ if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
158
+ const char *t = s;
159
+ do {
160
+ t++;
161
+ ZEROSET(state, mbstate_t);
162
+ num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
163
+ } while ((num_bytes < 0) && (*wchr != 0) && (*t != 0));
164
+ num_bytes += t - s;
165
+ }
166
+ return num_bytes;
157
167
  }
158
168
 
159
- void mb_ts_reset(TokenStream *ts, char *text)
169
+ static TokenStream *mb_ts_reset(TokenStream *ts, char *text)
160
170
  {
161
- ZEROSET(ts->data, mbstate_t, 1);
162
- ts_reset(ts, text);
171
+ ZEROSET(&(MBTS(ts)->state), mbstate_t);
172
+ ts_reset(ts, text);
173
+ return ts;
163
174
  }
164
175
 
165
- void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
176
+ static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
166
177
  {
167
- new_ts->data = ALLOC(mbstate_t);
168
- memcpy(new_ts->data, orig_ts->data, sizeof(mbstate_t));
178
+ return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
169
179
  }
170
180
 
171
- TokenStream *mb_ts_create()
181
+ TokenStream *mb_ts_new()
172
182
  {
173
- TokenStream *ts = ALLOC_AND_ZERO_N(TokenStream, 1);
174
- ts->data = ALLOC(mbstate_t);
175
- ts->token = tk_create();
176
- ts->destroy = &mb_ts_standard_destroy;
177
- ts->reset = &mb_ts_reset;
178
- ts->clone_i = &mb_ts_clone_i;
179
- ts->ref_cnt = 1;
180
- return ts;
183
+ TokenStream *ts = ts_new(MultiByteTokenStream);
184
+ ts->reset = &mb_ts_reset;
185
+ ts->clone_i = &mb_ts_clone_i;
186
+ ts->ref_cnt = 1;
187
+ return ts;
181
188
  }
182
189
 
183
190
  /****************************************************************************
@@ -186,35 +193,40 @@ TokenStream *mb_ts_create()
186
193
  *
187
194
  ****************************************************************************/
188
195
 
189
- void a_deref(void *p)
196
+ void a_deref(Analyzer *a)
190
197
  {
191
- Analyzer *a = (Analyzer *)p;
192
- if (--a->ref_cnt <= 0) a->destroy(a);
198
+ if (--a->ref_cnt <= 0) {
199
+ a->destroy_i(a);
200
+ }
193
201
  }
194
202
 
195
- void a_standard_destroy(Analyzer *a)
203
+ static void a_standard_destroy_i(Analyzer *a)
196
204
  {
197
- if (a->current_ts) ts_deref(a->current_ts);
198
- free(a);
205
+ if (a->current_ts) {
206
+ ts_deref(a->current_ts);
207
+ }
208
+ free(a);
199
209
  }
200
210
 
201
- TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
211
+ static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
202
212
  {
203
- a->current_ts->reset(a->current_ts, text);
204
- return a->current_ts;
213
+ TokenStream *ts;
214
+ (void)field;
215
+ ts = ts_clone(a->current_ts);
216
+ return ts->reset(ts, text);
205
217
  }
206
218
 
207
- Analyzer *analyzer_create(void *data, TokenStream *ts,
208
- void (*destroy)(Analyzer *a),
209
- TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
219
+ Analyzer *analyzer_new(TokenStream *ts,
220
+ void (*destroy_i)(Analyzer *a),
221
+ TokenStream *(*get_ts)(Analyzer *a, char *field,
222
+ char *text))
210
223
  {
211
- Analyzer *a = ALLOC(Analyzer);
212
- a->data = data;
213
- a->current_ts = ts;
214
- a->destroy = (destroy ? destroy : &a_standard_destroy);
215
- a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
216
- a->ref_cnt = 1;
217
- return a;
224
+ Analyzer *a = ALLOC(Analyzer);
225
+ a->current_ts = ts;
226
+ a->destroy_i = (destroy_i ? destroy_i : &a_standard_destroy_i);
227
+ a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
228
+ a->ref_cnt = 1;
229
+ return a;
218
230
  }
219
231
 
220
232
  /****************************************************************************
@@ -226,120 +238,132 @@ Analyzer *analyzer_create(void *data, TokenStream *ts,
226
238
  /*
227
239
  * WhitespaceTokenizer
228
240
  */
229
- Token *wst_next(TokenStream *ts)
241
+ static Token *wst_next(TokenStream *ts)
230
242
  {
231
- char *t = ts->t;
232
- char *start;
243
+ char *t = ts->t;
244
+ char *start;
233
245
 
234
- while (*t != '\0' && isspace(*t)) t++;
246
+ while (*t != '\0' && isspace(*t)) {
247
+ t++;
248
+ }
235
249
 
236
- if (*t == '\0') return NULL;
250
+ if (*t == '\0') {
251
+ return NULL;
252
+ }
237
253
 
238
- start = t;
239
- while (*t != '\0' && !isspace(*t)) t++;
254
+ start = t;
255
+ while (*t != '\0' && !isspace(*t)) {
256
+ t++;
257
+ }
240
258
 
241
- ts->t = t;
242
- tk_set_ts(ts->token, start, t, ts->text, 1);
243
- return ts->token;
259
+ ts->t = t;
260
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
244
261
  }
245
262
 
246
- TokenStream *whitespace_tokenizer_create()
263
+ TokenStream *whitespace_tokenizer_new()
247
264
  {
248
- TokenStream *ts = ts_create();
249
- ts->next = &wst_next;
250
- return ts;
265
+ TokenStream *ts = cts_new();
266
+ ts->next = &wst_next;
267
+ return ts;
251
268
  }
252
269
 
253
270
  /*
254
271
  * Multi-byte WhitespaceTokenizer
255
272
  */
256
- Token *mb_wst_next(TokenStream *ts)
257
- {
258
- int i;
259
- char *start;
260
- char *t = ts->t;
261
- wchar_t wchr;
273
+ static Token *mb_wst_next(TokenStream *ts)
274
+ {
275
+ int i;
276
+ char *start;
277
+ char *t = ts->t;
278
+ wchar_t wchr;
279
+ mbstate_t *state = &(MBTS(ts)->state);
280
+
281
+ i = mb_next_char(&wchr, t, state);
282
+ while (wchr != 0 && iswspace(wchr)) {
283
+ t += i;
284
+ i = mb_next_char(&wchr, t, state);
285
+ }
286
+ if (wchr == 0) {
287
+ return NULL;
288
+ }
262
289
 
263
- MB_NEXT_CHAR;
264
- while (wchr != 0 && iswspace(wchr)) {
290
+ start = t;
265
291
  t += i;
266
- MB_NEXT_CHAR;
267
- }
268
- if (wchr == 0) return NULL;
269
-
270
- start = t;
271
- t += i;
272
- MB_NEXT_CHAR;
273
- while (wchr != 0 && !iswspace(wchr)) {
274
- t += i;
275
- MB_NEXT_CHAR;
276
- }
277
- tk_set_ts(ts->token, start, t, ts->text, 1);
278
- ts->t = t;
279
- return ts->token;
292
+ i = mb_next_char(&wchr, t, state);
293
+ while (wchr != 0 && !iswspace(wchr)) {
294
+ t += i;
295
+ i = mb_next_char(&wchr, t, state);
296
+ }
297
+ ts->t = t;
298
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
280
299
  }
281
300
 
282
301
  /*
283
302
  * Lowercasing Multi-byte WhitespaceTokenizer
284
303
  */
285
- Token *mb_wst_next_lc(TokenStream *ts)
286
- {
287
- int i;
288
- char *start;
289
- char *t = ts->t;
290
- wchar_t wchr;
291
- wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
292
-
293
- w = wbuf;
294
- w_end = &wbuf[MAX_WORD_SIZE];
304
+ static Token *mb_wst_next_lc(TokenStream *ts)
305
+ {
306
+ int i;
307
+ char *start;
308
+ char *t = ts->t;
309
+ wchar_t wchr;
310
+ wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
311
+ mbstate_t *state = &(MBTS(ts)->state);
312
+
313
+ w = wbuf;
314
+ w_end = &wbuf[MAX_WORD_SIZE];
315
+
316
+ i = mb_next_char(&wchr, t, state);
317
+ while (wchr != 0 && iswspace(wchr)) {
318
+ t += i;
319
+ i = mb_next_char(&wchr, t, state);
320
+ }
321
+ if (wchr == 0) {
322
+ return NULL;
323
+ }
295
324
 
296
- MB_NEXT_CHAR;
297
- while (wchr != 0 && iswspace(wchr)) {
325
+ start = t;
298
326
  t += i;
299
- MB_NEXT_CHAR;
300
- }
301
- if (wchr == 0) return NULL;
302
-
303
- start = t;
304
- t += i;
305
- *w++ = towlower(wchr);
306
- MB_NEXT_CHAR;
307
- while (wchr != 0 && !iswspace(wchr)) {
308
- if (w < w_end) *w++ = towlower(wchr);
309
- t += i;
310
- MB_NEXT_CHAR;
311
- }
312
- *w = 0;
313
- w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
314
- ts->t = t;
315
- return ts->token;
327
+ *w++ = towlower(wchr);
328
+ i = mb_next_char(&wchr, t, state);
329
+ while (wchr != 0 && !iswspace(wchr)) {
330
+ if (w < w_end) {
331
+ *w++ = towlower(wchr);
332
+ }
333
+ t += i;
334
+ i = mb_next_char(&wchr, t, state);
335
+ }
336
+ *w = 0;
337
+ ts->t = t;
338
+ return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
339
+ (int)(t - ts->text), 1);
316
340
  }
317
341
 
318
- TokenStream *mb_whitespace_tokenizer_create(bool lowercase)
342
+ TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
319
343
  {
320
- TokenStream *ts = mb_ts_create();
321
- ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
322
- return ts;
344
+ TokenStream *ts = mb_ts_new();
345
+ ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
346
+ return ts;
323
347
  }
324
348
 
325
349
  /*
326
350
  * WhitespaceAnalyzers
327
351
  */
328
- Analyzer *whitespace_analyzer_create(bool lowercase)
352
+ Analyzer *whitespace_analyzer_new(bool lowercase)
329
353
  {
330
- TokenStream *ts;
331
- if (lowercase) {
332
- ts = lowercase_filter_create(whitespace_tokenizer_create());
333
- } else {
334
- ts = whitespace_tokenizer_create();
335
- }
336
- return analyzer_create(NULL, ts, NULL, NULL);
354
+ TokenStream *ts;
355
+ if (lowercase) {
356
+ ts = lowercase_filter_new(whitespace_tokenizer_new());
357
+ }
358
+ else {
359
+ ts = whitespace_tokenizer_new();
360
+ }
361
+ return analyzer_new(ts, NULL, NULL);
337
362
  }
338
363
 
339
- Analyzer *mb_whitespace_analyzer_create(bool lowercase)
364
+ Analyzer *mb_whitespace_analyzer_new(bool lowercase)
340
365
  {
341
- return analyzer_create(NULL, mb_whitespace_tokenizer_create(lowercase),
342
- NULL, NULL);
366
+ return analyzer_new(mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
343
367
  }
344
368
 
345
369
  /****************************************************************************
@@ -353,26 +377,31 @@ Analyzer *mb_whitespace_analyzer_create(bool lowercase)
353
377
  */
354
378
  Token *lt_next(TokenStream *ts)
355
379
  {
356
- char *start;
357
- char *t = ts->t;
380
+ char *start;
381
+ char *t = ts->t;
358
382
 
359
- while (*t != '\0' && !isalpha(*t)) t++;
383
+ while (*t != '\0' && !isalpha(*t)) {
384
+ t++;
385
+ }
360
386
 
361
- if (*t == '\0') return NULL;
387
+ if (*t == '\0') {
388
+ return NULL;
389
+ }
362
390
 
363
- start = t;
364
- while (*t != '\0' && isalpha(*t)) t++;
391
+ start = t;
392
+ while (*t != '\0' && isalpha(*t)) {
393
+ t++;
394
+ }
365
395
 
366
- tk_set_ts(ts->token, start, t, ts->text, 1);
367
- ts->t = t;
368
- return ts->token;
396
+ ts->t = t;
397
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
369
398
  }
370
399
 
371
- TokenStream *letter_tokenizer_create()
400
+ TokenStream *letter_tokenizer_new()
372
401
  {
373
- TokenStream *ts = ts_create();
374
- ts->next = &lt_next;
375
- return ts;
402
+ TokenStream *ts = cts_new();
403
+ ts->next = &lt_next;
404
+ return ts;
376
405
  }
377
406
 
378
407
  /*
@@ -380,28 +409,31 @@ TokenStream *letter_tokenizer_create()
380
409
  */
381
410
  Token *mb_lt_next(TokenStream *ts)
382
411
  {
383
- int i;
384
- char *start;
385
- char *t = ts->t;
386
- wchar_t wchr;
412
+ int i;
413
+ char *start;
414
+ char *t = ts->t;
415
+ wchar_t wchr;
416
+ mbstate_t *state = &(MBTS(ts)->state);
387
417
 
388
- MB_NEXT_CHAR;
389
- while (wchr != 0 && !iswalpha(wchr)) {
390
- t += i;
391
- MB_NEXT_CHAR;
392
- }
393
- if (wchr == 0) return NULL;
394
-
395
- start = t;
396
- t += i;
397
- MB_NEXT_CHAR;
398
- while (wchr != 0 && iswalpha(wchr)) {
418
+ i = mb_next_char(&wchr, t, state);
419
+ while (wchr != 0 && !iswalpha(wchr)) {
420
+ t += i;
421
+ i = mb_next_char(&wchr, t, state);
422
+ }
423
+
424
+ if (wchr == 0) {
425
+ return NULL;
426
+ }
427
+
428
+ start = t;
399
429
  t += i;
400
- MB_NEXT_CHAR;
401
- }
402
- tk_set_ts(ts->token, start, t, ts->text, 1);
403
- ts->t = t;
404
- return ts->token;
430
+ i = mb_next_char(&wchr, t, state);
431
+ while (wchr != 0 && iswalpha(wchr)) {
432
+ t += i;
433
+ i = mb_next_char(&wchr, t, state);
434
+ }
435
+ ts->t = t;
436
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
405
437
  }
406
438
 
407
439
  /*
@@ -409,62 +441,67 @@ Token *mb_lt_next(TokenStream *ts)
409
441
  */
410
442
  Token *mb_lt_next_lc(TokenStream *ts)
411
443
  {
412
- int i;
413
- char *start;
414
- char *t = ts->t;
415
- wchar_t wchr;
416
- wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
444
+ int i;
445
+ char *start;
446
+ char *t = ts->t;
447
+ wchar_t wchr;
448
+ wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
449
+ mbstate_t *state = &(MBTS(ts)->state);
417
450
 
418
- w = wbuf;
419
- w_end = &wbuf[MAX_WORD_SIZE];
451
+ w = wbuf;
452
+ w_end = &wbuf[MAX_WORD_SIZE];
420
453
 
421
- MB_NEXT_CHAR;
422
- while (wchr != 0 && !iswalpha(wchr)) {
423
- t += i;
424
- MB_NEXT_CHAR;
425
- }
426
- if (wchr == 0) return NULL;
427
-
428
- start = t;
429
- t += i;
430
- *w++ = towlower(wchr);
431
- MB_NEXT_CHAR;
432
- while (wchr != 0 && iswalpha(wchr)) {
433
- if (w < w_end) *w++ = towlower(wchr);
454
+ i = mb_next_char(&wchr, t, state);
455
+ while (wchr != 0 && !iswalpha(wchr)) {
456
+ t += i;
457
+ i = mb_next_char(&wchr, t, state);
458
+ }
459
+ if (wchr == 0) {
460
+ return NULL;
461
+ }
462
+
463
+ start = t;
434
464
  t += i;
435
- MB_NEXT_CHAR;
436
- }
437
- *w = 0;
438
- w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
439
- ts->t = t;
440
- return ts->token;
465
+ *w++ = towlower(wchr);
466
+ i = mb_next_char(&wchr, t, state);
467
+ while (wchr != 0 && iswalpha(wchr)) {
468
+ if (w < w_end) {
469
+ *w++ = towlower(wchr);
470
+ }
471
+ t += i;
472
+ i = mb_next_char(&wchr, t, state);
473
+ }
474
+ *w = 0;
475
+ ts->t = t;
476
+ return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
477
+ (int)(t - ts->text), 1);
441
478
  }
442
479
 
443
- TokenStream *mb_letter_tokenizer_create(bool lowercase)
480
+ TokenStream *mb_letter_tokenizer_new(bool lowercase)
444
481
  {
445
- TokenStream *ts = mb_ts_create();
446
- ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
447
- return ts;
482
+ TokenStream *ts = mb_ts_new();
483
+ ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
484
+ return ts;
448
485
  }
449
486
 
450
487
  /*
451
488
  * LetterAnalyzers
452
489
  */
453
- Analyzer *letter_analyzer_create(bool lowercase)
490
+ Analyzer *letter_analyzer_new(bool lowercase)
454
491
  {
455
- TokenStream *ts;
456
- if (lowercase) {
457
- ts = lowercase_filter_create(letter_tokenizer_create());
458
- } else {
459
- ts = letter_tokenizer_create();
460
- }
461
- return analyzer_create(NULL, ts, NULL, NULL);
492
+ TokenStream *ts;
493
+ if (lowercase) {
494
+ ts = lowercase_filter_new(letter_tokenizer_new());
495
+ }
496
+ else {
497
+ ts = letter_tokenizer_new();
498
+ }
499
+ return analyzer_new(ts, NULL, NULL);
462
500
  }
463
501
 
464
- Analyzer *mb_letter_analyzer_create(bool lowercase)
502
+ Analyzer *mb_letter_analyzer_new(bool lowercase)
465
503
  {
466
- return analyzer_create(NULL,
467
- mb_letter_tokenizer_create(lowercase), NULL, NULL);
504
+ return analyzer_new(mb_letter_tokenizer_new(lowercase), NULL, NULL);
468
505
  }
469
506
 
470
507
  /****************************************************************************
@@ -473,115 +510,146 @@ Analyzer *mb_letter_analyzer_create(bool lowercase)
473
510
  *
474
511
  ****************************************************************************/
475
512
 
513
+ #define STDTS(token_stream) ((StandardTokenizer *)(token_stream))
514
+
476
515
  /*
477
516
  * StandardTokenizer
478
517
  */
479
- int std_get_alpha(TokenStream *ts, char *token)
518
+ static int std_get_alpha(TokenStream *ts, char *token)
480
519
  {
481
- int i = 0;
482
- char *t = ts->t;
483
- while (t[i] != '\0' && isalpha(t[i])) {
484
- if (i < MAX_WORD_SIZE) token[i] = t[i];
485
- i++;
486
- }
487
- return i;
520
+ int i = 0;
521
+ char *t = ts->t;
522
+ while (t[i] != '\0' && isalpha(t[i])) {
523
+ if (i < MAX_WORD_SIZE) {
524
+ token[i] = t[i];
525
+ }
526
+ i++;
527
+ }
528
+ return i;
488
529
  }
489
530
 
490
- int mb_std_get_alpha(TokenStream *ts, char *token)
531
+ static int mb_std_get_alpha(TokenStream *ts, char *token)
491
532
  {
492
- char *t = ts->t;
493
- wchar_t w;
494
- int i;
495
- if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
496
- while (w != 0 && iswalpha(w)) {
497
- t += i;
498
- if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
499
- }
533
+ char *t = ts->t;
534
+ wchar_t wchr;
535
+ int i;
536
+ mbstate_t state; ZEROSET(&state, mbstate_t);
537
+
538
+ i = mb_next_char(&wchr, t, &state);
500
539
 
501
- i = (int)(t - ts->t);
502
- if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
503
- memcpy(token, ts->t, i);
504
- return i;
540
+ while (wchr != 0 && iswalpha(wchr)) {
541
+ t += i;
542
+ i = mb_next_char(&wchr, t, &state);
543
+ }
544
+
545
+ i = (int)(t - ts->t);
546
+ if (i > MAX_WORD_SIZE) {
547
+ i = MAX_WORD_SIZE - 1;
548
+ }
549
+ memcpy(token, ts->t, i);
550
+ return i;
505
551
  }
506
552
 
507
- int std_get_alnum(TokenStream *ts, char *token)
553
+ /*
554
+ static int std_get_alnum(TokenStream *ts, char *token)
508
555
  {
509
- int i = 0;
510
- char *t = ts->t;
511
- while (t[i] != '\0' && isalnum(t[i])) {
512
- if (i < MAX_WORD_SIZE) token[i] = t[i];
513
- i++;
514
- }
515
- return i;
556
+ int i = 0;
557
+ char *t = ts->t;
558
+ while (t[i] != '\0' && isalnum(t[i])) {
559
+ if (i < MAX_WORD_SIZE) {
560
+ token[i] = t[i];
561
+ }
562
+ i++;
563
+ }
564
+ return i;
516
565
  }
517
566
 
518
- int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
567
+ static int mb_std_get_alnum(TokenStream *ts, char *token)
519
568
  {
520
- char *t = ts->t;
521
- wchar_t w;
522
- int i;
523
- if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
524
- while (w != 0 && iswalnum(w)) {
525
- t += i;
526
- if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
527
- }
569
+ char *t = ts->t;
570
+ wchar_t wchr;
571
+ int i;
572
+ mbstate_t state; ZEROSET(&state, mbstate_t);
528
573
 
529
- i = (int)(t - ts->t);
530
- if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
531
- memcpy(token, ts->t, i);
532
- return i;
574
+ i = mb_next_char(&wchr, t, &state);
533
575
 
576
+ while (wchr != 0 && iswalnum(wchr)) {
577
+ t += i;
578
+ i = mb_next_char(&wchr, t, &state);
579
+ }
580
+
581
+ i = (int)(t - ts->t);
582
+ if (i > MAX_WORD_SIZE) {
583
+ i = MAX_WORD_SIZE - 1;
584
+ }
585
+ memcpy(token, ts->t, i);
586
+ return i;
534
587
  }
588
+ */
535
589
 
536
- int isnumpunc(char c)
590
+ static int isnumpunc(char c)
537
591
  {
538
- return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
592
+ return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
593
+ || c == '-');
539
594
  }
540
595
 
541
- int w_isnumpunc(wchar_t c)
596
+ static int w_isnumpunc(wchar_t c)
542
597
  {
543
- return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_' || c == L'-');
598
+ return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
599
+ || c == L'-');
544
600
  }
545
601
 
546
- int isurlpunc(char c)
602
+ static int isurlpunc(char c)
547
603
  {
548
- return (c == '.' || c == '/' || c == '-' || c == '_');
604
+ return (c == '.' || c == '/' || c == '-' || c == '_');
549
605
  }
550
606
 
551
- int isurlc(char c)
607
+ static int isurlc(char c)
552
608
  {
553
- return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
609
+ return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
554
610
  }
555
611
 
556
- int isurlxatpunc(char c)
612
+ static int isurlxatpunc(char c)
557
613
  {
558
- return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
614
+ return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
559
615
  }
560
616
 
561
- int isurlxatc(char c)
617
+ static int isurlxatc(char c)
562
618
  {
563
- return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
619
+ return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
620
+ || isalnum(c));
564
621
  }
565
622
 
566
- bool std_is_tok_char(char *c)
623
+ static bool std_is_tok_char(char *c)
567
624
  {
568
- if (isspace(*c)) return false; // most common so check first.
569
- if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
570
- *c == '@' || *c == '\'' || *c == ':')
571
- return true;
572
- return false;
625
+ if (isspace(*c)) {
626
+ return false; /* most common so check first. */
627
+ }
628
+ if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
629
+ *c == '@' || *c == '\'' || *c == ':') {
630
+ return true;
631
+ }
632
+ return false;
573
633
  }
574
634
 
575
- bool w_std_is_tok_char(char *t)
635
+ static bool mb_std_is_tok_char(char *t)
576
636
  {
577
- wchar_t c;
578
- if ((mbtowc(&c, t, MB_CUR_MAX)) < 0)
579
- RAISE(IO_ERROR, ENC_ERR_MSG);
580
- if (iswspace(c)) return false; // most common so check first.
581
- if (iswalnum(c) || w_isnumpunc(c) || c == L'&' ||
582
- c == L'@' || c == L'\'' || c == L':')
583
- return true;
584
- return false;
637
+ wchar_t c;
638
+ mbstate_t state; ZEROSET(&state, mbstate_t);
639
+
640
+ if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
641
+ /* error which we can handle next time round. For now just return
642
+ * false so that we can return a token */
643
+ return false;
644
+ }
645
+ if (iswspace(c)) {
646
+ return false; /* most common so check first. */
647
+ }
648
+ if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
649
+ || c == L':') {
650
+ return true;
651
+ }
652
+ return false;
585
653
  }
586
654
 
587
655
  /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
@@ -589,583 +657,669 @@ bool w_std_is_tok_char(char *t)
589
657
  * (alnum) = [a-zA-Z0-9]
590
658
  * (punc) = [_\/.,-]
591
659
  */
592
- int std_get_number(char *input)
593
- {
594
- int i = 0;
595
- int count = 0;
596
- int last_seen_digit = 2;
597
- int seen_digit = false;
598
-
599
- while (last_seen_digit >= 0) {
600
- while ((input[i] != '\0') && isalnum(input[i])) {
601
- if ((last_seen_digit < 2) && isdigit(input[i])) last_seen_digit = 2;
602
- if ((seen_digit == false) && isdigit(input[i])) seen_digit = true;
603
- i++;
604
- }
605
- last_seen_digit--;
606
- if (!isnumpunc(input[i]) || !isalnum(input[i+1])) {
607
-
608
- if (last_seen_digit >= 0)
660
+ static int std_get_number(char *input)
661
+ {
662
+ int i = 0;
663
+ int count = 0;
664
+ int last_seen_digit = 2;
665
+ int seen_digit = false;
666
+
667
+ while (last_seen_digit >= 0) {
668
+ while ((input[i] != '\0') && isalnum(input[i])) {
669
+ if ((last_seen_digit < 2) && isdigit(input[i])) {
670
+ last_seen_digit = 2;
671
+ }
672
+ if ((seen_digit == false) && isdigit(input[i])) {
673
+ seen_digit = true;
674
+ }
675
+ i++;
676
+ }
677
+ last_seen_digit--;
678
+ if (!isnumpunc(input[i]) || !isalnum(input[i + 1])) {
679
+
680
+ if (last_seen_digit >= 0) {
681
+ count = i;
682
+ }
683
+ break;
684
+ }
609
685
  count = i;
610
- break;
611
- }
612
- count = i;
613
- i++;
614
- }
615
- if (seen_digit)
616
- return count;
617
- else
618
- return 0;
686
+ i++;
687
+ }
688
+ if (seen_digit) {
689
+ return count;
690
+ }
691
+ else {
692
+ return 0;
693
+ }
619
694
  }
620
695
 
621
- int std_get_apostrophe(char *input)
696
+ static int std_get_apostrophe(char *input)
622
697
  {
623
- char *t = input;
698
+ char *t = input;
624
699
 
625
- while (isalpha(*t) || *t == '\'')
626
- t++;
700
+ while (isalpha(*t) || *t == '\'') {
701
+ t++;
702
+ }
627
703
 
628
- return (int)(t - input);
704
+ return (int)(t - input);
629
705
  }
630
706
 
631
- int mb_std_get_apostrophe(char *input)
707
+ static int mb_std_get_apostrophe(char *input)
632
708
  {
633
- char *t = input;
634
- wchar_t w;
635
- int i;
709
+ char *t = input;
710
+ wchar_t wchr;
711
+ int i;
712
+ mbstate_t state; ZEROSET(&state, mbstate_t);
636
713
 
637
- if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
638
- while (iswalpha(w) || w == L'\'') {
639
- t += i;
640
- if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
641
- }
642
- return (int)(t - input);
714
+ i = mb_next_char(&wchr, t, &state);
715
+
716
+ while (iswalpha(wchr) || wchr == L'\'') {
717
+ t += i;
718
+ i = mb_next_char(&wchr, t, &state);
719
+ }
720
+ return (int)(t - input);
643
721
  }
644
722
 
645
- int std_get_url(char *input, char *token, int i)
723
+ static int std_get_url(char *input, char *token, int i)
646
724
  {
647
- while (isurlc(input[i])) {
648
- if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
649
- break; // can't have to puncs in a row
650
- if (i < MAX_WORD_SIZE) token[i] = input[i];
651
- i++;
652
- }
725
+ while (isurlc(input[i])) {
726
+ if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
727
+ break; /* can't have to puncs in a row */
728
+ }
729
+ if (i < MAX_WORD_SIZE) {
730
+ token[i] = input[i];
731
+ }
732
+ i++;
733
+ }
653
734
 
654
- //strip trailing puncs
655
- while (isurlpunc(input[i-1])) i--;
735
+ /* strip trailing puncs */
736
+ while (isurlpunc(input[i - 1])) {
737
+ i--;
738
+ }
656
739
 
657
- return i;
740
+ return i;
658
741
  }
659
742
 
660
743
  /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
661
- */
662
- int std_get_company_name(char *input)
744
+ */
745
+ static int std_get_company_name(char *input)
663
746
  {
664
- int i = 0;
665
- while (isalpha(input[i]) || input[i] == '@' || input[i] == '&')
666
- i++;
747
+ int i = 0;
748
+ while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
749
+ i++;
750
+ }
667
751
 
668
- return i;
752
+ return i;
669
753
  }
670
754
 
671
- int mb_std_get_company_name(char *input, TokenStream *ts)
755
+ /*
756
+ static int mb_std_get_company_name(char *input, TokenStream *ts)
672
757
  {
673
- char *t = input;
674
- wchar_t wchr;
675
- int i;
758
+ char *t = input;
759
+ wchar_t wchr;
760
+ int i;
761
+ mbstate_t state; ZEROSET(&state, mbstate_t);
676
762
 
677
- MB_NEXT_CHAR;
678
- while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
679
- t += i;
680
- MB_NEXT_CHAR;
681
- }
763
+ i = mb_next_char(&wchr, t, &state);
764
+ while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
765
+ t += i;
766
+ i = mb_next_char(&wchr, t, &state);
767
+ }
682
768
 
683
- return (int)(t - input);
769
+ return (int)(t - input);
684
770
  }
771
+ */
685
772
 
686
- bool std_advance_to_start(TokenStream *ts)
773
+ static bool std_advance_to_start(TokenStream *ts)
687
774
  {
688
- char *t = ts->t;
689
- while (*t != '\0' && !isalnum(*t)) t++;
775
+ char *t = ts->t;
776
+ while (*t != '\0' && !isalnum(*t)) {
777
+ t++;
778
+ }
690
779
 
691
- ts->t = t;
780
+ ts->t = t;
692
781
 
693
- return (*t != '\0');
782
+ return (*t != '\0');
694
783
  }
695
784
 
696
- bool mb_std_advance_to_start(TokenStream *ts)
785
+ static bool mb_std_advance_to_start(TokenStream *ts)
697
786
  {
698
- int i;
699
- wchar_t w;
787
+ int i;
788
+ wchar_t wchr;
789
+ mbstate_t state; ZEROSET(&state, mbstate_t);
700
790
 
701
- if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
702
- while (w != 0 && !iswalnum(w)) {
703
- ts->t += i;
704
- if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
705
- }
791
+ i = mb_next_char(&wchr, ts->t, &state);
706
792
 
707
- return (w != 0);
708
- }
793
+ while (wchr != 0 && !iswalnum(wchr)) {
794
+ ts->t += i;
795
+ i = mb_next_char(&wchr, ts->t, &state);
796
+ }
709
797
 
710
- typedef struct StandardTokenizer {
711
- bool (*advance_to_start)(TokenStream *ts);
712
- bool (*is_tok_char)(char *c);
713
- int (*get_alpha)(TokenStream *ts, char *token);
714
- int (*get_apostrophe)(char *input);
715
- } StandardTokenizer;
798
+ return (wchr != 0);
799
+ }
716
800
 
717
- Token *std_next(TokenStream *ts)
801
+ static Token *std_next(TokenStream *ts)
718
802
  {
719
- StandardTokenizer *std_tz = (StandardTokenizer *)ts->data;
720
- char *s;
721
- char *t;
722
- char *start = NULL;
723
- char *num_end = NULL;
724
- char token[MAX_WORD_SIZE];
725
- int token_i = 0;
726
- int len;
727
- bool is_acronym;
728
- bool seen_at_symbol;
729
-
803
+ StandardTokenizer *std_tz = STDTS(ts);
804
+ char *s;
805
+ char *t;
806
+ char *start = NULL;
807
+ char *num_end = NULL;
808
+ char token[MAX_WORD_SIZE];
809
+ int token_i = 0;
810
+ int len;
811
+ bool is_acronym;
812
+ bool seen_at_symbol;
730
813
 
731
- if (!std_tz->advance_to_start(ts)) return NULL;
732
814
 
733
- start = t = ts->t;
734
- if (isdigit(*t)) {
735
- t += std_get_number(t);
736
- ts->t = t;
737
- tk_set_ts(ts->token, start, t, ts->text, 1);
738
- } else {
739
- token_i = std_tz->get_alpha(ts, token);
740
- t += token_i;
741
-
742
- if (!std_tz->is_tok_char(t)) {
743
- // very common case, ie a plain word, so check and return
744
- tk_set_ts(ts->token, start, t, ts->text, 1);
745
- ts->t = t;
746
- return ts->token;
747
- }
748
-
749
- if (*t == '\'') { // apostrophe case.
750
- t += std_tz->get_apostrophe(t);
751
- ts->t = t;
752
- len = (int)(t - start);
753
- // strip possesive
754
- if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;
755
-
756
- tk_set_ts(ts->token, start, t, ts->text, 1);
757
- return ts->token;
758
- }
759
-
760
- if (*t == '&') { // apostrophe case.
761
- t += std_get_company_name(t);
762
- ts->t = t;
763
- tk_set_ts(ts->token, start, t, ts->text, 1);
764
- return ts->token;
765
- }
766
-
767
- if (isdigit(*t) || isnumpunc(*t)) { // possibly a number
768
- num_end = start + std_get_number(start);
769
- if (!std_tz->is_tok_char(num_end)) { // we won't find a longer token
770
- ts->t = num_end;
771
- tk_set_ts(ts->token, start, num_end, ts->text, 1);
772
- return ts->token;
773
- }
774
- // else there may be a longer token so check
775
- }
776
-
777
- if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
778
- // check for a known url start
779
- token[token_i] = '\0';
780
- t += 3;
781
- while (*t == '/') t++;
782
- if (isalpha(*t) &&
783
- (memcmp(token, "ftp", 3) == 0 ||
784
- memcmp(token, "http", 4) == 0 ||
785
- memcmp(token, "https", 5) == 0 ||
786
- memcmp(token, "file", 4) == 0)) {
787
- len = std_get_url(t, token, 0); // dispose of first part of the URL
788
- } else { //still treat as url but keep the first part
789
- token_i = (int)(t - start);
790
- memcpy(token, start, token_i * sizeof(char));
791
- len = token_i + std_get_url(t, token, token_i); // keep start
792
- }
793
- ts->t = t + len;
794
- token[len] = 0;
795
- tk_set(ts->token, token, len, (int)(start - ts->text),
796
- (int)(ts->t - ts->text), 1);
797
- return ts->token;
798
- }
799
-
800
- // now see how long a url we can find.
801
- is_acronym = true;
802
- seen_at_symbol = false;
803
- while (isurlxatc(*t)) {
804
- if (is_acronym && !isalpha(*t) && (*t != '.')) {
805
- is_acronym = false;
806
- }
807
- if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
808
- break; // can't have two punctuation characters in a row
809
- }
810
- if (*t == '@') {
811
- if (seen_at_symbol) {
812
- break; // we can only have one @ symbol
813
- } else {
814
- seen_at_symbol = true;
815
- }
816
- }
817
- t++;
815
+ if (!std_tz->advance_to_start(ts)) {
816
+ return NULL;
818
817
  }
819
- while (isurlxatpunc(t[-1])) t--; // strip trailing punctuation
820
818
 
821
- if (t > num_end) {
822
- ts->t = t;
819
+ start = t = ts->t;
820
+ if (isdigit(*t)) {
821
+ t += std_get_number(t);
822
+ ts->t = t;
823
+ tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
824
+ }
825
+ else {
826
+ token_i = std_tz->get_alpha(ts, token);
827
+ t += token_i;
828
+
829
+ if (!std_tz->is_tok_char(t)) {
830
+ /* very common case, ie a plain word, so check and return */
831
+ ts->t = t;
832
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
833
+ }
834
+
835
+ if (*t == '\'') { /* apostrophe case. */
836
+ t += std_tz->get_apostrophe(t);
837
+ ts->t = t;
838
+ len = (int)(t - start);
839
+ /* strip possesive */
840
+ if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
841
+ t -= 2;
842
+ }
823
843
 
824
- if (is_acronym) { // check that it is one letter followed by one '.'
825
- for (s = start; s < t-1; s++) {
826
- if (isalpha(*s) && (s[1] != '.')) is_acronym = false;
844
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
827
845
  }
828
- }
829
- if (is_acronym) {// strip '.'s
830
- for (s = start + token_i; s < t; s++) {
831
- if (*s != '.') {
832
- token[token_i] = *s;
833
- token_i++;
834
- }
846
+
847
+ if (*t == '&') { /* apostrophe case. */
848
+ t += std_get_company_name(t);
849
+ ts->t = t;
850
+ return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
851
+ }
852
+
853
+ if (isdigit(*t) || isnumpunc(*t)) { /* possibly a number */
854
+ num_end = start + std_get_number(start);
855
+ if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
856
+ ts->t = num_end;
857
+ return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
858
+ }
859
+ /* else there may be a longer token so check */
860
+ }
861
+
862
+ if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
863
+ /* check for a known url start */
864
+ token[token_i] = '\0';
865
+ t += 3;
866
+ while (*t == '/') {
867
+ t++;
868
+ }
869
+ if (isalpha(*t) &&
870
+ (memcmp(token, "ftp", 3) == 0 ||
871
+ memcmp(token, "http", 4) == 0 ||
872
+ memcmp(token, "https", 5) == 0 ||
873
+ memcmp(token, "file", 4) == 0)) {
874
+ len = std_get_url(t, token, 0); /* dispose of first part of the URL */
875
+ }
876
+ else { /* still treat as url but keep the first part */
877
+ token_i = (int)(t - start);
878
+ memcpy(token, start, token_i * sizeof(char));
879
+ len = token_i + std_get_url(t, token, token_i); /* keep start */
880
+ }
881
+ ts->t = t + len;
882
+ token[len] = 0;
883
+ return tk_set(&(CTS(ts)->token), token, len, (int)(start - ts->text),
884
+ (int)(ts->t - ts->text), 1);
885
+ }
886
+
887
+ /* now see how long a url we can find. */
888
+ is_acronym = true;
889
+ seen_at_symbol = false;
890
+ while (isurlxatc(*t)) {
891
+ if (is_acronym && !isalpha(*t) && (*t != '.')) {
892
+ is_acronym = false;
893
+ }
894
+ if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
895
+ break; /* can't have two punctuation characters in a row */
896
+ }
897
+ if (*t == '@') {
898
+ if (seen_at_symbol) {
899
+ break; /* we can only have one @ symbol */
900
+ }
901
+ else {
902
+ seen_at_symbol = true;
903
+ }
904
+ }
905
+ t++;
906
+ }
907
+ while (isurlxatpunc(t[-1])) {
908
+ t--; /* strip trailing punctuation */
909
+ }
910
+
911
+ if (t > num_end) {
912
+ ts->t = t;
913
+
914
+ if (is_acronym) { /* check it is one letter followed by one '.' */
915
+ for (s = start; s < t - 1; s++) {
916
+ if (isalpha(*s) && (s[1] != '.'))
917
+ is_acronym = false;
918
+ }
919
+ }
920
+ if (is_acronym) { /* strip '.'s */
921
+ for (s = start + token_i; s < t; s++) {
922
+ if (*s != '.') {
923
+ token[token_i] = *s;
924
+ token_i++;
925
+ }
926
+ }
927
+ tk_set(&(CTS(ts)->token), token, token_i,
928
+ (int)(start - ts->text),
929
+ (int)(t - ts->text), 1);
930
+ }
931
+ else { /* just return the url as is */
932
+ tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
933
+ }
934
+ }
935
+ else { /* return the number */
936
+ ts->t = num_end;
937
+ tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
835
938
  }
836
- tk_set(ts->token, token, token_i, (int)(start - ts->text),
837
- (int)(t - ts->text), 1);
838
- } else { // just return the url as is
839
- tk_set_ts(ts->token, start, t, ts->text, 1);
840
- }
841
- } else { // return the number
842
- ts->t = num_end;
843
- tk_set_ts(ts->token, start, num_end, ts->text, 1);
844
939
  }
845
- }
846
940
 
847
- return ts->token;
941
+ return &(CTS(ts)->token);
848
942
  }
849
943
 
850
- void std_ts_destroy(TokenStream *ts)
944
+ static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
851
945
  {
852
- free(ts->data);
853
- ts_standard_destroy(ts);
946
+ return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
854
947
  }
855
948
 
856
- void std_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
949
+ static TokenStream *std_ts_new()
857
950
  {
858
- new_ts->data = ALLOC(StandardTokenizer);
859
- memcpy(new_ts->data, orig_ts->data, sizeof(StandardTokenizer));
951
+ TokenStream *ts = ts_new(StandardTokenizer);
952
+
953
+ ts->clone_i = &std_ts_clone_i;
954
+ ts->next = &std_next;
955
+
956
+ return ts;
860
957
  }
861
958
 
862
- TokenStream *standard_tokenizer_create()
959
+ TokenStream *standard_tokenizer_new()
863
960
  {
864
- TokenStream *ts = ts_create();
961
+ TokenStream *ts = std_ts_new();
865
962
 
866
- StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
867
- std_tz->advance_to_start = &std_advance_to_start;
868
- std_tz->get_alpha = &std_get_alpha;
869
- std_tz->is_tok_char = &std_is_tok_char;
870
- std_tz->get_apostrophe = &std_get_apostrophe;
963
+ STDTS(ts)->advance_to_start = &std_advance_to_start;
964
+ STDTS(ts)->get_alpha = &std_get_alpha;
965
+ STDTS(ts)->is_tok_char = &std_is_tok_char;
966
+ STDTS(ts)->get_apostrophe = &std_get_apostrophe;
871
967
 
872
- ts->data = std_tz;
873
- ts->destroy = &std_ts_destroy;
874
- ts->clone_i = &std_ts_clone_i;
875
- ts->next = &std_next;
876
- return ts;
968
+ return ts;
877
969
  }
878
970
 
879
- TokenStream *mb_standard_tokenizer_create()
971
+ TokenStream *mb_standard_tokenizer_new()
880
972
  {
881
- TokenStream *ts = ts_create();
973
+ TokenStream *ts = std_ts_new();
882
974
 
883
- StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
884
- std_tz->advance_to_start = &mb_std_advance_to_start;
885
- std_tz->get_alpha = &mb_std_get_alpha;
886
- std_tz->is_tok_char = &w_std_is_tok_char;
887
- std_tz->get_apostrophe = &mb_std_get_apostrophe;
975
+ STDTS(ts)->advance_to_start = &mb_std_advance_to_start;
976
+ STDTS(ts)->get_alpha = &mb_std_get_alpha;
977
+ STDTS(ts)->is_tok_char = &mb_std_is_tok_char;
978
+ STDTS(ts)->get_apostrophe = &mb_std_get_apostrophe;
888
979
 
889
- ts->data = std_tz;
890
- ts->destroy = &std_ts_destroy;
891
- ts->clone_i = &std_ts_clone_i;
892
- ts->next = &std_next;
893
- return ts;
980
+ return ts;
894
981
  }
895
982
 
896
- void filter_reset(TokenStream *ts, char *text)
983
+ /****************************************************************************
984
+ *
985
+ * Filters
986
+ *
987
+ ****************************************************************************/
988
+
989
+ #define TkFilt(filter) ((TokenFilter *)(filter))
990
+
991
+ TokenStream *filter_clone_size(TokenStream *ts, size_t size)
897
992
  {
898
- ts->sub_ts->reset(ts->sub_ts, text);
993
+ TokenStream *ts_new = ts_clone_size(ts, size);
994
+ TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
995
+ return ts_new;
899
996
  }
900
997
 
901
- void filter_destroy(TokenStream *tf)
998
+ static TokenStream *filter_clone_i(TokenStream *ts)
902
999
  {
903
- ts_deref(tf->sub_ts);
904
- if (tf->token != NULL) tk_destroy(tf->token);
905
- free(tf);
1000
+ return filter_clone_size(ts, sizeof(TokenFilter));
906
1001
  }
907
1002
 
908
- void sf_destroy(TokenStream *tf)
1003
+ static TokenStream *filter_reset(TokenStream *ts, char *text)
1004
+ {
1005
+ TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
1006
+ return ts;
1007
+ }
1008
+
1009
+ static void filter_destroy_i(TokenStream *ts)
1010
+ {
1011
+ ts_deref(TkFilt(ts)->sub_ts);
1012
+ free(ts);
1013
+ }
1014
+
1015
+ #define tf_new(type, sub) tf_new_i(sizeof(type), sub)
1016
+ TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
1017
+ {
1018
+ TokenStream *ts = (TokenStream *)ecalloc(size);
1019
+
1020
+ TkFilt(ts)->sub_ts = sub_ts;
1021
+
1022
+ ts->clone_i = &filter_clone_i;
1023
+ ts->destroy_i = &filter_destroy_i;
1024
+ ts->reset = &filter_reset;
1025
+ ts->ref_cnt = 1;
1026
+
1027
+ return ts;
1028
+ }
1029
+
1030
+ /****************************************************************************
1031
+ * StopFilter
1032
+ ****************************************************************************/
1033
+
1034
+ #define StopFilt(filter) ((StopFilter *)(filter))
1035
+
1036
+ static void sf_destroy_i(TokenStream *ts)
909
1037
  {
910
- HshTable *words = (HshTable *)tf->data;
911
- h_destroy(words);
912
- filter_destroy(tf);
1038
+ h_destroy(StopFilt(ts)->words);
1039
+ filter_destroy_i(ts);
913
1040
  }
914
1041
 
915
- void sf_clone_i_i(void *key, void *value, void *arg)
1042
+ static void sf_clone_i_i(void *key, void *value, void *arg)
916
1043
  {
917
- HshTable *wordtable = (HshTable *)arg;
918
- char *w = estrdup(key);
919
- h_set(wordtable, w, w);
1044
+ HashTable *word_table = (HashTable *)arg;
1045
+ char *word = estrdup(key);
1046
+ (void)value;
1047
+ h_set(word_table, word, word);
920
1048
  }
921
1049
 
922
- void sf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
1050
+ static TokenStream *sf_clone_i(TokenStream *orig_ts)
923
1051
  {
924
- new_ts->data = h_new_str(&free, NULL);
925
- h_each(orig_ts->data, &sf_clone_i_i, new_ts->data);
1052
+ TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StopFilter));
1053
+ StopFilt(new_ts)->words = h_new_str(&free, NULL);
1054
+ h_each(StopFilt(orig_ts)->words, &sf_clone_i_i, StopFilt(new_ts)->words);
1055
+ return new_ts;
926
1056
  }
927
1057
 
928
- Token *sf_next(TokenStream *tf)
1058
+ static Token *sf_next(TokenStream *ts)
929
1059
  {
930
- int pos_inc = 1;
931
- HshTable *words = (HshTable *)tf->data;
932
- Token *tk = tf->sub_ts->next(tf->sub_ts);
933
- while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
934
- tk = tf->sub_ts->next(tf->sub_ts);
935
- pos_inc++;
936
- }
937
- if (tk != NULL) tk->pos_inc = pos_inc;
938
- return tk;
1060
+ int pos_inc = 1;
1061
+ HashTable *words = StopFilt(ts)->words;
1062
+ TokenFilter *tf = TkFilt(ts);
1063
+ Token *tk = tf->sub_ts->next(tf->sub_ts);
1064
+
1065
+ while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
1066
+ tk = tf->sub_ts->next(tf->sub_ts);
1067
+ pos_inc++;
1068
+ }
1069
+
1070
+ if (tk != NULL) {
1071
+ tk->pos_inc = pos_inc;
1072
+ }
1073
+
1074
+ return tk;
939
1075
  }
940
1076
 
941
- TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
942
- const char **words, int len)
1077
+ TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
1078
+ const char **words, int len)
943
1079
  {
944
- int i;
945
- char *w;
946
- HshTable *wordtable = h_new_str(&free, (free_ft)NULL);
947
- TokenStream *tf = ALLOC(TokenStream);
948
- tf->sub_ts = ts;
1080
+ int i;
1081
+ char *word;
1082
+ HashTable *word_table = h_new_str(&free, (free_ft) NULL);
1083
+ TokenStream *ts = tf_new(StopFilter, sub_ts);
949
1084
 
950
- for (i = 0; i < len; i++) {
951
- w = estrdup(words[i]);
952
- h_set(wordtable, w, w);
953
- }
954
- tf->data = wordtable;
955
- tf->token = NULL;
956
- tf->next = &sf_next;
957
- tf->reset = &filter_reset;
958
- tf->destroy = &sf_destroy;
959
- tf->clone_i = &sf_clone_i;
960
- tf->ref_cnt = 1;
961
- return tf;
1085
+ for (i = 0; i < len; i++) {
1086
+ word = estrdup(words[i]);
1087
+ h_set(word_table, word, word);
1088
+ }
1089
+ StopFilt(ts)->words = word_table;
1090
+ ts->next = &sf_next;
1091
+ ts->destroy_i = &sf_destroy_i;
1092
+ ts->clone_i = &sf_clone_i;
1093
+ return ts;
962
1094
  }
963
1095
 
964
- TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
1096
+ TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
1097
+ const char **words)
965
1098
  {
966
- char *w;
967
- HshTable *wordtable = h_new_str(&free, (free_ft)NULL);
968
- TokenStream *tf = ALLOC(TokenStream);
969
- tf->sub_ts = ts;
970
- while (*words) {
971
- w = estrdup(*words);
972
- h_set(wordtable, w, w);
973
- words++;
974
- }
975
- tf->data = wordtable;
976
- tf->token = NULL;
977
- tf->next = &sf_next;
978
- tf->reset = &filter_reset;
979
- tf->destroy = &sf_destroy;
980
- tf->clone_i = &sf_clone_i;
981
- tf->ref_cnt = 1;
982
- return tf;
1099
+ char *word;
1100
+ HashTable *word_table = h_new_str(&free, (free_ft) NULL);
1101
+ TokenStream *ts = tf_new(StopFilter, sub_ts);
1102
+
1103
+ while (*words) {
1104
+ word = estrdup(*words);
1105
+ h_set(word_table, word, word);
1106
+ words++;
1107
+ }
1108
+
1109
+ StopFilt(ts)->words = word_table;
1110
+ ts->next = &sf_next;
1111
+ ts->destroy_i = &sf_destroy_i;
1112
+ ts->clone_i = &sf_clone_i;
1113
+ return ts;
983
1114
  }
984
1115
 
985
- TokenStream *stop_filter_create(TokenStream *ts)
1116
+ TokenStream *stop_filter_new(TokenStream *ts)
986
1117
  {
987
- return stop_filter_create_with_words(ts, FULL_ENGLISH_STOP_WORDS);
1118
+ return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
988
1119
  }
989
1120
 
1121
+ /****************************************************************************
1122
+ * LowerCaseFilter
1123
+ ****************************************************************************/
1124
+
1125
+
990
1126
  Token *mb_lcf_next(TokenStream *ts)
991
1127
  {
992
- wchar_t wbuf[MAX_WORD_SIZE], *w;
993
- //mbstate_t state = {0};
994
- int i;
995
- Token *tk = ts->sub_ts->next(ts->sub_ts);
996
- if (tk == NULL) return tk;
1128
+ wchar_t wbuf[MAX_WORD_SIZE], *wchr;
1129
+ Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
997
1130
 
998
- i = (int)mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
999
- w = wbuf;
1000
- while (*w != 0) {
1001
- *w = towlower(*w);
1002
- w++;
1003
- }
1004
- wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
1005
- return tk;
1131
+ if (tk == NULL) {
1132
+ return tk;
1133
+ }
1134
+
1135
+ mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
1136
+ wchr = wbuf;
1137
+ while (*wchr != 0) {
1138
+ *wchr = towlower(*wchr);
1139
+ wchr++;
1140
+ }
1141
+ tk->len = wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
1142
+ tk->text[tk->len] = '\0';
1143
+ return tk;
1006
1144
  }
1007
1145
 
1008
- TokenStream *mb_lowercase_filter_create(TokenStream *ts)
1146
+ TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
1009
1147
  {
1010
- TokenStream *tf = ALLOC(TokenStream);
1011
- tf->token = NULL;
1012
- tf->next = &mb_lcf_next;
1013
- tf->reset = &filter_reset;
1014
- tf->destroy = &filter_destroy;
1015
- tf->sub_ts = ts;
1016
- tf->clone_i = NULL;
1017
- tf->ref_cnt = 1;
1018
- return tf;
1148
+ TokenStream *ts = tf_new(TokenFilter, sub_ts);
1149
+ ts->next = &mb_lcf_next;
1150
+ return ts;
1019
1151
  }
1020
1152
 
1021
1153
  Token *lcf_next(TokenStream *ts)
1022
1154
  {
1023
- int i = 0;
1024
- Token *tk = ts->sub_ts->next(ts->sub_ts);
1025
- if (tk == NULL) return tk;
1026
- while (tk->text[i] != '\0') {
1027
- tk->text[i] = tolower(tk->text[i]);
1028
- i++;
1029
- }
1030
- return tk;
1155
+ int i = 0;
1156
+ Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
1157
+ if (tk == NULL) {
1158
+ return tk;
1159
+ }
1160
+ while (tk->text[i] != '\0') {
1161
+ tk->text[i] = tolower(tk->text[i]);
1162
+ i++;
1163
+ }
1164
+ return tk;
1031
1165
  }
1032
1166
 
1033
- TokenStream *lowercase_filter_create(TokenStream *ts)
1167
+ TokenStream *lowercase_filter_new(TokenStream *sub_ts)
1034
1168
  {
1035
- TokenStream *tf = ALLOC(TokenStream);
1036
- tf->token = NULL;
1037
- tf->next = &lcf_next;
1038
- tf->reset = &filter_reset;
1039
- tf->destroy = &filter_destroy;
1040
- tf->sub_ts = ts;
1041
- tf->clone_i = NULL;
1042
- tf->ref_cnt = 1;
1043
- return tf;
1169
+ TokenStream *ts = tf_new(TokenFilter, sub_ts);
1170
+ ts->next = &lcf_next;
1171
+ return ts;
1044
1172
  }
1045
1173
 
1046
- typedef struct StemFilter {
1047
- struct sb_stemmer *stemmer;
1048
- char *algorithm;
1049
- char *charenc;
1050
- } StemFilter;
1174
+ /****************************************************************************
1175
+ * StemFilter
1176
+ ****************************************************************************/
1177
+
1178
+ #define StemFilt(filter) ((StemFilter *)(filter))
1051
1179
 
1052
- void stemf_destroy(TokenStream *tf)
1180
+ void stemf_destroy_i(TokenStream *ts)
1053
1181
  {
1054
- StemFilter *stemf = (StemFilter *)tf->data;
1055
- sb_stemmer_delete(stemf->stemmer);
1056
- free(stemf->algorithm);
1057
- free(stemf->charenc);
1058
- free(stemf);
1059
- filter_destroy(tf);
1182
+ sb_stemmer_delete(StemFilt(ts)->stemmer);
1183
+ free(StemFilt(ts)->algorithm);
1184
+ free(StemFilt(ts)->charenc);
1185
+ filter_destroy_i(ts);
1060
1186
  }
1061
1187
 
1062
1188
  Token *stemf_next(TokenStream *ts)
1063
1189
  {
1064
- int len;
1065
- const sb_symbol *stemmed;
1066
- struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
1067
- Token *tk = ts->sub_ts->next(ts->sub_ts);
1068
- if (tk == NULL) return tk;
1069
- stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, (int)strlen(tk->text));
1070
- len = sb_stemmer_length(stemmer);
1071
- if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
1072
- memcpy(tk->text, stemmed, len);
1073
- tk->text[len] = '\0';
1074
- return tk;
1190
+ int len;
1191
+ const sb_symbol *stemmed;
1192
+ struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
1193
+ TokenFilter *tf = TkFilt(ts);
1194
+ Token *tk = tf->sub_ts->next(tf->sub_ts);
1195
+ if (tk == NULL) {
1196
+ return tk;
1197
+ }
1198
+ stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
1199
+ len = sb_stemmer_length(stemmer);
1200
+ if (len >= MAX_WORD_SIZE) {
1201
+ len = MAX_WORD_SIZE - 1;
1202
+ }
1203
+
1204
+ memcpy(tk->text, stemmed, len);
1205
+ tk->text[len] = '\0';
1206
+ tk->len = len;
1207
+ return tk;
1075
1208
  }
1076
1209
 
1077
- void stemf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
1210
+ TokenStream *stemf_clone_i(TokenStream *orig_ts)
1078
1211
  {
1079
- StemFilter *orig_stemf = (StemFilter *)orig_ts->data;
1080
- StemFilter *stemf = ALLOC(StemFilter);
1081
- stemf->stemmer = sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
1082
- stemf->algorithm = orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
1083
- stemf->charenc = orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
1084
- new_ts->data = stemf;
1212
+ TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
1213
+ StemFilter *stemf = StemFilt(new_ts);
1214
+ StemFilter *orig_stemf = StemFilt(orig_ts);
1215
+ stemf->stemmer =
1216
+ sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
1217
+ stemf->algorithm =
1218
+ orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
1219
+ stemf->charenc =
1220
+ orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
1221
+ return new_ts;
1085
1222
  }
1086
1223
 
1087
- TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
1088
- const char * charenc)
1224
+ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
1225
+ const char *charenc)
1089
1226
  {
1090
- TokenStream *tf = ALLOC(TokenStream);
1091
- StemFilter *stemf = ALLOC(StemFilter);
1092
- stemf->stemmer = sb_stemmer_new(algorithm, charenc);
1093
- stemf->algorithm = algorithm ? estrdup(algorithm) : NULL;
1094
- stemf->charenc = charenc ? estrdup(charenc) : NULL;
1095
- tf->data = stemf;
1227
+ TokenStream *tf = tf_new(StemFilter, ts);
1228
+
1229
+ StemFilt(tf)->stemmer = sb_stemmer_new(algorithm, charenc);
1230
+ StemFilt(tf)->algorithm = algorithm ? estrdup(algorithm) : NULL;
1231
+ StemFilt(tf)->charenc = charenc ? estrdup(charenc) : NULL;
1096
1232
 
1097
- tf->token = NULL;
1098
- tf->next = &stemf_next;
1099
- tf->reset = &filter_reset;
1100
- tf->destroy = &stemf_destroy;
1101
- tf->clone_i = &stemf_clone_i;
1102
- tf->sub_ts = ts;
1103
- tf->ref_cnt = 1;
1104
- return tf;
1233
+ tf->next = &stemf_next;
1234
+ tf->destroy_i = &stemf_destroy_i;
1235
+ tf->clone_i = &stemf_clone_i;
1236
+ return tf;
1105
1237
  }
1106
1238
 
1107
- Analyzer *standard_analyzer_create_with_words_len(
1108
- const char **words, int len, bool lowercase)
1239
+ /****************************************************************************
1240
+ *
1241
+ * Analyzers
1242
+ *
1243
+ ****************************************************************************/
1244
+
1245
+ /****************************************************************************
1246
+ * Standard
1247
+ ****************************************************************************/
1248
+
1249
+ Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
1250
+ bool lowercase)
1109
1251
  {
1110
- TokenStream *ts;
1111
- if (lowercase) {
1112
- ts = stop_filter_create_with_words_len(
1113
- lowercase_filter_create(standard_tokenizer_create()), words, len);
1114
- } else {
1115
- ts = stop_filter_create_with_words_len(
1116
- standard_tokenizer_create(), words, len);
1117
- }
1118
- return analyzer_create(NULL, ts, NULL, NULL);
1252
+ TokenStream *ts;
1253
+ if (lowercase) {
1254
+ ts = stop_filter_new_with_words_len(lowercase_filter_new
1255
+ (standard_tokenizer_new()),
1256
+ words, len);
1257
+ }
1258
+ else {
1259
+ ts = stop_filter_new_with_words_len(standard_tokenizer_new(),
1260
+ words, len);
1261
+ }
1262
+ return analyzer_new(ts, NULL, NULL);
1119
1263
  }
1120
1264
 
1121
- Analyzer *standard_analyzer_create_with_words(const char **words, bool lowercase)
1265
+ Analyzer *standard_analyzer_new_with_words(const char **words,
1266
+ bool lowercase)
1122
1267
  {
1123
- TokenStream *ts;
1124
- if (lowercase) {
1125
- ts = stop_filter_create_with_words(
1126
- lowercase_filter_create(standard_tokenizer_create()), words);
1127
- } else {
1128
- ts = stop_filter_create_with_words(
1129
- standard_tokenizer_create(), words);
1130
- }
1131
- return analyzer_create(NULL, ts, NULL, NULL);
1268
+ TokenStream *ts;
1269
+ if (lowercase) {
1270
+ ts = stop_filter_new_with_words(lowercase_filter_new
1271
+ (standard_tokenizer_new()),
1272
+ words);
1273
+ }
1274
+ else {
1275
+ ts = stop_filter_new_with_words(standard_tokenizer_new(),
1276
+ words);
1277
+ }
1278
+ return analyzer_new(ts, NULL, NULL);
1132
1279
  }
1133
1280
 
1134
- Analyzer *mb_standard_analyzer_create_with_words_len(
1135
- const char **words, int len, bool lowercase)
1281
+ Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
1282
+ int len, bool lowercase)
1136
1283
  {
1137
- TokenStream *ts;
1138
- if (lowercase) {
1139
- ts = stop_filter_create_with_words_len(
1140
- mb_lowercase_filter_create(mb_standard_tokenizer_create()), words, len);
1141
- } else {
1142
- ts = stop_filter_create_with_words_len(
1143
- mb_standard_tokenizer_create(), words, len);
1144
- }
1145
- return analyzer_create(NULL, ts, NULL, NULL);
1284
+ TokenStream *ts;
1285
+ if (lowercase) {
1286
+ ts = stop_filter_new_with_words_len(mb_lowercase_filter_new
1287
+ (mb_standard_tokenizer_new
1288
+ ()), words, len);
1289
+ }
1290
+ else {
1291
+ ts = stop_filter_new_with_words_len(mb_standard_tokenizer_new(),
1292
+ words, len);
1293
+ }
1294
+ return analyzer_new(ts, NULL, NULL);
1146
1295
  }
1147
1296
 
1148
- Analyzer *mb_standard_analyzer_create_with_words(
1149
- const char **words, bool lowercase)
1297
+ Analyzer *mb_standard_analyzer_new_with_words(const char **words,
1298
+ bool lowercase)
1150
1299
  {
1151
- TokenStream *ts;
1152
- if (lowercase) {
1153
- ts = stop_filter_create_with_words(
1154
- mb_lowercase_filter_create(mb_standard_tokenizer_create()), words);
1155
- } else {
1156
- ts = stop_filter_create_with_words(mb_standard_tokenizer_create(), words);
1157
- }
1158
- return analyzer_create(NULL, ts, NULL, NULL);
1300
+ TokenStream *ts;
1301
+ if (lowercase) {
1302
+ ts = stop_filter_new_with_words(mb_lowercase_filter_new
1303
+ (mb_standard_tokenizer_new()),
1304
+ words);
1305
+ }
1306
+ else {
1307
+ ts = stop_filter_new_with_words(mb_standard_tokenizer_new(),
1308
+ words);
1309
+ }
1310
+ return analyzer_new(ts, NULL, NULL);
1159
1311
  }
1160
1312
 
1161
- Analyzer *standard_analyzer_create(bool lowercase)
1313
+ Analyzer *standard_analyzer_new(bool lowercase)
1162
1314
  {
1163
- return standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
1315
+ return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
1316
+ lowercase);
1164
1317
  }
1165
1318
 
1166
- Analyzer *mb_standard_analyzer_create(bool lowercase)
1319
+ Analyzer *mb_standard_analyzer_new(bool lowercase)
1167
1320
  {
1168
- return mb_standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
1321
+ return mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
1322
+ lowercase);
1169
1323
  }
1170
1324
 
1171
1325
  /****************************************************************************
@@ -1174,58 +1328,63 @@ Analyzer *mb_standard_analyzer_create(bool lowercase)
1174
1328
  *
1175
1329
  ****************************************************************************/
1176
1330
 
1177
- void pfa_destroy(Analyzer *self)
1331
+ #define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
1332
+ void pfa_destroy_i(Analyzer *self)
1178
1333
  {
1179
- PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
1180
- h_destroy(pfa->dict);
1334
+ h_destroy(PFA(self)->dict);
1181
1335
 
1182
- a_deref(pfa->def);
1183
- free(pfa);
1184
- free(self);
1336
+ a_deref(PFA(self)->default_a);
1337
+ free(self);
1185
1338
  }
1186
1339
 
1187
1340
  TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
1188
1341
  {
1189
- PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
1190
- Analyzer *a = h_get(pfa->dict, field);
1191
- if (a == NULL) a = pfa->def;
1192
- return a_get_ts(a, field, text);
1342
+ Analyzer *a = h_get(PFA(self)->dict, field);
1343
+ if (a == NULL) {
1344
+ a = PFA(self)->default_a;
1345
+ }
1346
+ return a_get_ts(a, field, text);
1193
1347
  }
1194
1348
 
1195
- void pfa_sub_a_destroy(void *p)
1349
+ void pfa_sub_a_destroy_i(void *p)
1196
1350
  {
1197
- Analyzer *a = (Analyzer *)p;
1198
- a_deref(a);
1351
+ Analyzer *a = (Analyzer *) p;
1352
+ a_deref(a);
1199
1353
  }
1200
1354
 
1201
1355
  void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
1202
1356
  {
1203
- PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
1204
- h_set(pfa->dict, estrdup(field), analyzer);
1357
+ h_set(PFA(self)->dict, estrdup(field), analyzer);
1205
1358
  }
1206
1359
 
1207
- Analyzer *per_field_analyzer_create(Analyzer *def)
1360
+ Analyzer *per_field_analyzer_new(Analyzer *default_a)
1208
1361
  {
1209
- PerFieldAnalyzer *pfa = ALLOC(PerFieldAnalyzer);
1210
- pfa->def = def;
1211
- pfa->dict = h_new_str(&free, &pfa_sub_a_destroy);
1212
- return analyzer_create(pfa, NULL, &pfa_destroy, &pfa_get_ts);
1362
+ Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
1363
+
1364
+ PFA(a)->default_a = default_a;
1365
+ PFA(a)->dict = h_new_str(&free, &pfa_sub_a_destroy_i);
1366
+
1367
+ a->destroy_i = &pfa_destroy_i;
1368
+ a->get_ts = pfa_get_ts;
1369
+
1370
+ return a;
1213
1371
  }
1214
1372
 
1215
1373
  #ifdef ALONE
1216
1374
  int main(int argc, char **argv)
1217
1375
  {
1218
- char buf[10000];
1219
- Analyzer *a = standard_analyzer_create(true);
1220
- TokenStream *ts;
1221
- Token *tk;
1222
- while (fgets(buf, 9999, stdin) != NULL) {
1223
- ts = a->get_ts(a, "hello", buf);
1224
- while ((tk = ts->next(ts)) != NULL) {
1225
- printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
1226
- }
1227
- printf("\n");
1228
- }
1229
- return 0;
1376
+ char buf[10000];
1377
+ Analyzer *a = standard_analyzer_new(true);
1378
+ TokenStream *ts;
1379
+ Token *tk;
1380
+ while (fgets(buf, 9999, stdin) != NULL) {
1381
+ ts = a_get_ts(a, "hello", buf);
1382
+ while ((tk = ts->next(ts)) != NULL) {
1383
+ printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
1384
+ }
1385
+ printf("\n");
1386
+ ts_deref(ts);
1387
+ }
1388
+ return 0;
1230
1389
  }
1231
1390
  #endif