sdsykes-ferret 0.11.6.19

Files changed (195)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
data/TODO ADDED
@@ -0,0 +1,17 @@
+ = TODO
+
+ * user defined sorting
+ * add field compression
+ * Fix highlighting to work for compressed fields
+ * Fix highlighting to work for external fields
+ * Add Ferret::Index::Index
+ * Fix:
+   > Working Query: field1:value1 AND NOT field2:value2
+   > Failing Query: field1:value1 AND ( NOT field2:value2 )
+
+ = Done
+ * Add string Sort descriptor
+ * fix memory bug
+ * add MultiReader interface
+ * add lexicographical sort (byte sort)
+ * Add highlighting
data/TUTORIAL ADDED
@@ -0,0 +1,231 @@
+ = Quick Introduction to Ferret
+
+ The simplest way to use Ferret is through the Ferret::Index::Index class.
+ This is now aliased by Ferret::I for quick and easy access. Start by including
+ the Ferret module.
+
+   require 'ferret'
+   include Ferret
+
+ === Creating an index
+
+ Creating an in-memory index is very simple;
+
+   index = Index::Index.new()
+
+ To create a persistent index;
+
+   index = Index::Index.new(:path => '/path/to/index')
+
+ Both of these methods create new Indexes with the StandardAnalyzer. An
+ analyzer is what you use to divide the input data up into tokens which you can
+ search for later. If you'd like to use a different analyzer you can specify it
+ here, eg;
+
+   index = Index::Index.new(:path => '/path/to/index',
+                            :analyzer => Analysis::WhiteSpaceAnalyzer.new)
+
+ For more options when creating an Index refer to Ferret::Index::Index.
+
+ === Adding Documents
+
+ To add a document you can simply add a string or an array of strings. This will
+ store all the strings in the "" (ie empty string) field (unless you specify the
+ default field when you create the index).
+
+   index << "This is a new document to be indexed"
+   index << ["And here", "is another", "new document", "to be indexed"]
+
+ But these are pretty simple documents. If this is all you want to index you
+ could probably just use SimpleSearch. So let's give our documents some fields;
+
+   index << {:title => "Programming Ruby", :content => "blah blah blah"}
+   index << {:title => "Programming Ruby", :content => "yada yada yada"}
+
+ Note the way that all field-names are Symbols. Although Strings will work,
+ Symbols are the best practice in Ferret. If you are indexing data stored in a
+ database, you'll probably want to store the id;
+
+   index << {:id => row.id, :title => row.title, :date => row.date}
+
+ So far we have been storing and tokenizing all of the input data along with
+ term vectors. If we want to change this we need to change the way we set up
+ the index. You must create a FieldInfos object describing the index:
+
+   field_infos = FieldInfos.new(:store => :no,
+                                :index => :untokenized_omit_norms,
+                                :term_vector => :no)
+
+ The values that you set FieldInfos to have will be used by default by all
+ fields. If you want to change the properties for specific fields, you need to
+ add a FieldInfo to field_infos.
+
+   field_infos.add_field(:title, :store => :yes, :index => :yes, :boost => 10.0)
+   field_infos.add_field(:content, :store => :yes,
+                         :index => :yes,
+                         :term_vector => :with_positions_offsets)
+
+ If you need to add a field to an already open index you do so like this:
+
+   index.field_infos.add_field(:new_field, :store => :yes)
+
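+ To use a custom FieldInfos object, put it in place when the index is created.
+ This is a minimal sketch; the :field_infos option here is an assumption based
+ on Ferret 0.11, where Index::Index passes writer options through to
+ IndexWriter, so check Ferret::Index::Index for the exact option name:
+
+   index = Index::Index.new(:path => '/path/to/index',
+                            :field_infos => field_infos)
+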
+ === Searching
+
+ Now that we have data in our index, how do we actually use this index to
+ search the data? The Index offers two search methods, Index#search and
+ Index#search_each. The first method returns a Ferret::Index::TopDocs object.
+ The second we'll show here. Let's say we want to find all documents with the
+ phrase "quick brown fox" in the content field. We'd write;
+
+   index.search_each('content:"quick brown fox"') do |id, score|
+     puts "Document #{id} found with a score of #{score}"
+   end
+
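+ For comparison, here is the same search through Index#search. This sketch
+ assumes the TopDocs interface from Ferret 0.11 (TopDocs#total_hits and
+ TopDocs#hits, each hit carrying doc and score accessors), so treat those
+ accessor names as illustrative rather than definitive:
+
+   top_docs = index.search('content:"quick brown fox"', :limit => 10)
+   puts "#{top_docs.total_hits} matching documents"
+   top_docs.hits.each do |hit|
+     puts "Document #{hit.doc} found with a score of #{hit.score}"
+   end
+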
+ But "fast" has a pretty similar meaning to "quick" and we don't mind if the
+ fox is a little red. Also, the phrase could be in the title so we'll search
+ there as well. So we could expand our search like this;
+
+   index.search_each('title|content:"quick|fast brown|red fox"') do |id, score|
+     puts "Document #{id} found with a score of #{score}"
+   end
+
+ What if we want to find all documents entered on or after 5th of September,
+ 2005 with the words "ruby" or "rails" in any field? We could type something
+ like;
+
+   index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |id, score|
+     puts "Document #{index[id][:title]} found with a score of #{score}"
+   end
+
+ Ferret has quite a complex query language. To find out more about Ferret's
+ query language, see Ferret::QueryParser. You can also construct even more
+ complex queries like Ferret::Search::Spans by hand. See Ferret::Search::Query
+ for more information.
+
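+ As a small taste of hand-built queries, here is the "ruby OR rails" search
+ assembled directly from Ferret::Search classes. This is a sketch assuming the
+ Ferret 0.11 API (TermQuery.new(field, term) and BooleanQuery#add_query taking
+ an occurrence symbol); see Ferret::Search::Query for the exact signatures:
+
+   query = Ferret::Search::BooleanQuery.new
+   query.add_query(Ferret::Search::TermQuery.new(:content, "ruby"), :should)
+   query.add_query(Ferret::Search::TermQuery.new(:content, "rails"), :should)
+   index.search_each(query) do |id, score|
+     puts "Document #{id} found with a score of #{score}"
+   end
+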
+ === Highlighting
+
+ Ferret now has a super-fast highlighting method. See
+ Ferret::Index::Index#highlight. Here is an example of how you would use it
+ when printing to the console:
+
+   index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |id, score|
+     puts "Document #{index[id][:title]} found with a score of #{score}"
+     highlights = index.highlight("content:(ruby OR rails)", id,
+                                  :field => :content,
+                                  :pre_tag => "\033[36m",
+                                  :post_tag => "\033[m")
+     puts highlights
+   end
+
+ And if you want to highlight a whole document, set :excerpt_length to :all:
+
+   puts index.highlight(query, doc_id,
+                        :field => :content,
+                        :pre_tag => "\033[36m",
+                        :post_tag => "\033[m",
+                        :excerpt_length => :all)
+
+ === Accessing Documents
+
+ You may have noticed that when we run a search we only get the document id
+ back. By itself this isn't much use to us. Getting the data from the index is
+ very straightforward. For example, if we want the :title field from the 3rd
+ document;
+
+   index[2][:title]
+
+ Documents are lazily loaded, so if you try this:
+
+   puts index[2]
+
+ you will always get an empty Hash. To load all fields, call the load method:
+
+   puts index[2].load
+
+ NOTE: documents are indexed from 0. You can also use array-like index
+ parameters to access the index. For example;
+
+   index[1..4]
+   index[10, 10]
+   index[-5]
+
+ The default field is :id (although you can change this with index's
+ :default_create_field parameter);
+
+   index << "This is a document"
+   index[0][:id]
+
+ Let's go back to the database example above. If we store all of our documents
+ with an id then we can access that field using the id. As long as we called
+ our id field :id we can do this;
+
+   index["89721347"]["title"]
+
+ Pretty simple huh? You should note though that if there is more than one
+ document with the same *id* or *key* then only the first one will be returned,
+ so it is probably better that you ensure the key is unique somehow. By setting
+ Index's :key attribute to :id, Ferret will do this automatically for you. It
+ can even handle multiple-field primary keys. For example, you could set :key
+ to [:id, :model] and Ferret would keep the documents unique for that pair of
+ fields.
+
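+ For example, here is the :id key in action (a minimal sketch; the field
+ values are made up):
+
+   index = Index::Index.new(:path => '/path/to/index', :key => :id)
+   index << {:id => "1", :title => "Programming Ruby"}
+   index << {:id => "1", :title => "Programming Ruby, 2nd ed."} # replaces the first
+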
+ === Modifying and Deleting Documents
+
+ What if we want to change the data in the index? Ferret doesn't actually let
+ you change the data once it is in the index, but you can delete documents, so
+ the standard way to modify data is to delete it and re-add it again with the
+ modifications made. It is important to note that when doing this the documents
+ will get a new document number, so you should be careful not to use a document
+ number after the document has been deleted. Here is an example of modifying a
+ document;
+
+   index << {:title => "Programing Rbuy", :content => "blah blah blah"}
+   doc_id = nil
+   index.search_each('title:"Programing Rbuy"') {|id, score| doc_id = id}
+   return unless doc_id
+   doc = index[doc_id]
+   index.delete(doc_id)
+
+   # modify doc. It is just a Hash after all
+   doc[:title] = "Programming Ruby"
+
+   index << doc
+
+ If you set the :key parameter as described in the last section there is no
+ need to delete the document. It will be automatically deleted when you add
+ another document with the same key.
+
+ Also, we can use the id field, as above, to delete documents. This time though
+ every document that matches the id will be deleted. Again, it is probably a
+ good idea if you somehow ensure that your *ids* are kept unique.
+
+   id = "23453422"
+   index.delete(id)
+
+ === Onwards
+
+ This is just a small sampling of what Ferret allows you to do. Ferret, like
+ Lucene, is designed to be extended, and allows you to construct your own query
+ types, analyzers, and so on. Going onwards you should check out the following
+ documentation:
+
+ * Ferret::Analysis: for more information on how the data is processed when it
+   is tokenized. There are a number of things you can do with your data such as
+   adding stop lists or perhaps a Porter stemmer. There are also a number of
+   analyzers already available and it is almost trivial to create a new one
+   with a simple regular expression (see the sketch after this list).
+
+ * Ferret::Search: for more information on querying the index. There are a
+   number of already available queries and it's unlikely you'll need to create
+   your own. You may however want to take advantage of the sorting or filtering
+   abilities of Ferret to present your data the best way you see fit.
+
+ * Ferret::QueryParser: if you want to find out more about what you can do with
+   Ferret's Query Parser, this is the place to look. The query parser is one
+   area that could use a bit of work so please send your suggestions.
+
+ * Ferret::Index: for more advanced access to the index you'll probably want to
+   use the Ferret::Index::IndexWriter and Ferret::Index::IndexReader. This is
+   the place to look for more information on them.
+
+ * Ferret::Store: This is the module used to access the actual index storage
+   and won't be of much interest to most people.
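+
+ As promised above, here is a regular-expression analyzer sketch. It assumes
+ Ferret::Analysis::RegExpAnalyzer from Ferret 0.11, which takes a token
+ pattern and a lowercase flag; treat the exact signature as an assumption:
+
+   analyzer = Ferret::Analysis::RegExpAnalyzer.new(/[[:alpha:]]+/, true)
+   index = Index::Index.new(:analyzer => analyzer)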
data/bin/ferret-browser ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env ruby
+
+ $: << File.expand_path(File.join(File.dirname(__FILE__), '../lib'))
+ require 'ferret'
+ require 'ferret/browser'
+
+ require 'optparse'
+ require 'ostruct'
+
+ SERVER_OPTIONS = ['webrick']
+ conf = OpenStruct.new(:host => '0.0.0.0', :port => 3301)
+
+ opts = OptionParser.new do |opts|
+   opts.banner = "Usage: #{File.basename($0)} /path/to/index"
+   opts.separator ""
+   opts.separator "Specific Options:"
+
+   opts.on("-h", "--host HOSTNAME",
+           "Host for web server to bind to (default is all IPs)") { |conf.host| }
+   opts.on("-p", "--port NUM",
+           "Port for web server (defaults to #{conf.port})") { |conf.port| }
+   opts.on("-s", "--server NAME",
+           "Server to force (#{SERVER_OPTIONS.join(', ')}).") { |s| conf.server = s.to_sym }
+
+   opts.separator ""
+   opts.separator "Common options:"
+
+   opts.on_tail("-?", "--help", "Show this message") do
+     puts opts
+     exit
+   end
+
+   opts.on_tail("-v", "--version", "Show version") do
+     puts Ferret::VERSION
+     exit
+   end
+ end
+
+ opts.parse! ARGV
+ if ARGV.length != 1
+   puts opts
+   exit
+ end
+ @path = ARGV[0]
+
+ # Load the Ferret index
+ begin
+   @reader = Ferret::Index::IndexReader.new(@path)
+ rescue Ferret::FileNotFoundError => e
+   puts "\033[31mCannot start Ferret. No index exists at \"\033[m" +
+        "\033[33m#{@path}\033[m\033[31m\".\033[m"
+   exit
+ rescue Exception => e
+   puts "\033[31mCannot start Ferret.\n\033[m\033[33m#{e.to_s}\033[m"
+   exit
+ end
+
+ unless conf.server
+   conf.server = :webrick
+ end
+
+ case conf.server.to_s
+ when 'webrick'
+   require 'webrick/httpserver'
+   require 'ferret/browser/webrick'
+
+   # Mount the root
+   s = WEBrick::HTTPServer.new(:BindAddress => conf.host, :Port => conf.port)
+   s.mount "/s", WEBrick::HTTPServlet::FileHandler, Ferret::Browser::Controller::STATIC_DIR, true
+   s.mount "/", WEBrick::FerretBrowserHandler, @reader, @path
+
+   # Server up
+   trap(:INT) do
+     s.shutdown
+   end
+   s.start
+ else
+   raise "server #{conf.server} not known. Must be one of [#{SERVER_OPTIONS.join(', ')}]"
+ end
data/ext/analysis.c ADDED
@@ -0,0 +1,1555 @@
+ #include "analysis.h"
+ #include "hash.h"
+ #include "libstemmer.h"
+ #include <string.h>
+ #include <ctype.h>
+ #include <wctype.h>
+ #include <wchar.h>
+
+ /****************************************************************************
+  *
+  * Token
+  *
+  ****************************************************************************/
+
+ INLINE Token *tk_set(Token *tk,
+                      char *text, int tlen, off_t start, off_t end, int pos_inc)
+ {
+     if (tlen >= MAX_WORD_SIZE) {
+         tlen = MAX_WORD_SIZE - 1;
+     }
+     memcpy(tk->text, text, sizeof(char) * tlen);
+     tk->text[tlen] = '\0';
+     tk->len = tlen;
+     tk->start = start;
+     tk->end = end;
+     tk->pos_inc = pos_inc;
+     return tk;
+ }
+
+ INLINE Token *tk_set_ts(Token *tk,
+                         char *start, char *end, char *text, int pos_inc)
+ {
+     return tk_set(tk, start, (int)(end - start),
+                   (off_t)(start - text), (off_t)(end - text), pos_inc);
+ }
+
+ INLINE Token *tk_set_no_len(Token *tk,
+                             char *text, off_t start, off_t end, int pos_inc)
+ {
+     return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
+ }
+
+ INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start, off_t end,
+                        int pos_inc)
+ {
+     int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
+     tk->text[len] = '\0';
+     tk->len = len;
+     tk->start = start;
+     tk->end = end;
+     tk->pos_inc = pos_inc;
+     return tk;
+ }
+
+ int tk_eq(Token *tk1, Token *tk2)
+ {
+     return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
+             tk1->start == tk2->start && tk1->end == tk2->end &&
+             tk1->pos_inc == tk2->pos_inc);
+ }
+
+ int tk_cmp(Token *tk1, Token *tk2)
+ {
+     int cmp;
+     if (tk1->start > tk2->start) {
+         cmp = 1;
+     }
+     else if (tk1->start < tk2->start) {
+         cmp = -1;
+     }
+     else {
+         if (tk1->end > tk2->end) {
+             cmp = 1;
+         }
+         else if (tk1->end < tk2->end) {
+             cmp = -1;
+         }
+         else {
+             cmp = strcmp((char *)tk1->text, (char *)tk2->text);
+         }
+     }
+     return cmp;
+ }
+
+ void tk_destroy(void *p)
+ {
+     free(p);
+ }
+
+ Token *tk_new()
+ {
+     return ALLOC(Token);
+ }
+
+ /****************************************************************************
+  *
+  * TokenStream
+  *
+  ****************************************************************************/
+
+ void ts_deref(TokenStream *ts)
+ {
+     if (--ts->ref_cnt <= 0) {
+         ts->destroy_i(ts);
+     }
+ }
+
+ static TokenStream *ts_reset(TokenStream *ts, char *text)
+ {
+     ts->t = ts->text = text;
+     return ts;
+ }
+
+ TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
+ {
+     TokenStream *ts = (TokenStream *)ecalloc(size);
+     memcpy(ts, orig_ts, size);
+     ts->ref_cnt = 1;
+     return ts;
+ }
+
+ TokenStream *ts_new_i(size_t size)
+ {
+     TokenStream *ts = ecalloc(size);
+
+     ts->destroy_i = (void (*)(TokenStream *))&free;
+     ts->reset = &ts_reset;
+     ts->ref_cnt = 1;
+
+     return ts;
+ }
+
+ /****************************************************************************
+  * CachedTokenStream
+  ****************************************************************************/
+
+ #define CTS(token_stream) ((CachedTokenStream *)(token_stream))
+
+ static TokenStream *cts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(CachedTokenStream));
+ }
+
+ static TokenStream *cts_new()
+ {
+     TokenStream *ts = ts_new(CachedTokenStream);
+     ts->clone_i = &cts_clone_i;
+     return ts;
+ }
+
+ /* * Multi-byte TokenStream * */
+
+ #define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
+
+ INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
+ {
+     int num_bytes;
+     if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
+         const char *t = s;
+         do {
+             t++;
+             ZEROSET(state, mbstate_t);
+             num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
+         } while ((num_bytes < 0) && (*t != 0));
+         num_bytes = t - s;
+         if (*t == 0) *wchr = 0;
+     }
+     return num_bytes;
+ }
+
+ static TokenStream *mb_ts_reset(TokenStream *ts, char *text)
+ {
+     ZEROSET(&(MBTS(ts)->state), mbstate_t);
+     ts_reset(ts, text);
+     return ts;
+ }
+
+ static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
+ }
+
+ TokenStream *mb_ts_new()
+ {
+     TokenStream *ts = ts_new(MultiByteTokenStream);
+     ts->reset = &mb_ts_reset;
+     ts->clone_i = &mb_ts_clone_i;
+     ts->ref_cnt = 1;
+     return ts;
+ }
+
+ /****************************************************************************
+  *
+  * Analyzer
+  *
+  ****************************************************************************/
+
+ void a_deref(Analyzer *a)
+ {
+     if (--a->ref_cnt <= 0) {
+         a->destroy_i(a);
+     }
+ }
+
+ static void a_standard_destroy_i(Analyzer *a)
+ {
+     if (a->current_ts) {
+         ts_deref(a->current_ts);
+     }
+     free(a);
+ }
+
+ static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+ {
+     TokenStream *ts;
+     (void)field;
+     ts = ts_clone(a->current_ts);
+     return ts->reset(ts, text);
+ }
+
+ Analyzer *analyzer_new(TokenStream *ts,
+                        void (*destroy_i)(Analyzer *a),
+                        TokenStream *(*get_ts)(Analyzer *a, char *field,
+                                               char *text))
+ {
+     Analyzer *a = ALLOC(Analyzer);
+     a->current_ts = ts;
+     a->destroy_i = (destroy_i ? destroy_i : &a_standard_destroy_i);
+     a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
+     a->ref_cnt = 1;
+     return a;
+ }
+
+ /****************************************************************************
+  *
+  * Non
+  *
+  ****************************************************************************/
+
+ /*
+  * NonTokenizer
+  */
+ static Token *nt_next(TokenStream *ts)
+ {
+     if (ts->t) {
+         size_t len = strlen(ts->t);
+         ts->t = NULL;
+
+         return tk_set(&(CTS(ts)->token), ts->text, len, 0, len, 1);
+     }
+     else {
+         return NULL;
+     }
+ }
+
+ TokenStream *non_tokenizer_new()
+ {
+     TokenStream *ts = cts_new();
+     ts->next = &nt_next;
+     return ts;
+ }
+
+ /*
+  * NonAnalyzer
+  */
+ Analyzer *non_analyzer_new()
+ {
+     return analyzer_new(non_tokenizer_new(), NULL, NULL);
+ }
+
+ /****************************************************************************
+  *
+  * Whitespace
+  *
+  ****************************************************************************/
+
+ /*
+  * WhitespaceTokenizer
+  */
+ static Token *wst_next(TokenStream *ts)
+ {
+     char *t = ts->t;
+     char *start;
+
+     while (*t != '\0' && isspace(*t)) {
+         t++;
+     }
+
+     if (*t == '\0') {
+         return NULL;
+     }
+
+     start = t;
+     while (*t != '\0' && !isspace(*t)) {
+         t++;
+     }
+
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ TokenStream *whitespace_tokenizer_new()
+ {
+     TokenStream *ts = cts_new();
+     ts->next = &wst_next;
+     return ts;
+ }
+
+ /*
+  * Multi-byte WhitespaceTokenizer
+  */
+ static Token *mb_wst_next(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswspace(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswspace(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ /*
+  * Lowercasing Multi-byte WhitespaceTokenizer
+  */
+ static Token *mb_wst_next_lc(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     w = wbuf;
+     w_end = &wbuf[MAX_WORD_SIZE];
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswspace(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     *w++ = towlower(wchr);
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswspace(wchr)) {
+         if (w < w_end) {
+             *w++ = towlower(wchr);
+         }
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     *w = 0;
+     ts->t = t;
+     return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
+                     (off_t)(t - ts->text), 1);
+ }
+
+ TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
+ {
+     TokenStream *ts = mb_ts_new();
+     ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
+     return ts;
+ }
+
+ /*
+  * WhitespaceAnalyzers
+  */
+ Analyzer *whitespace_analyzer_new(bool lowercase)
+ {
+     TokenStream *ts;
+     if (lowercase) {
+         ts = lowercase_filter_new(whitespace_tokenizer_new());
+     }
+     else {
+         ts = whitespace_tokenizer_new();
+     }
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_whitespace_analyzer_new(bool lowercase)
+ {
+     return analyzer_new(mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
+ }
+
+ /****************************************************************************
+  *
+  * Letter
+  *
+  ****************************************************************************/
+
+ /*
+  * LetterTokenizer
+  */
+ Token *lt_next(TokenStream *ts)
+ {
+     char *start;
+     char *t = ts->t;
+
+     while (*t != '\0' && !isalpha(*t)) {
+         t++;
+     }
+
+     if (*t == '\0') {
+         return NULL;
+     }
+
+     start = t;
+     while (*t != '\0' && isalpha(*t)) {
+         t++;
+     }
+
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ TokenStream *letter_tokenizer_new()
+ {
+     TokenStream *ts = cts_new();
+     ts->next = &lt_next;
+     return ts;
+ }
+
+ /*
+  * Multi-byte LetterTokenizer
+  */
+ Token *mb_lt_next(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswalpha(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswalpha(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ /*
+  * Lowercasing Multi-byte LetterTokenizer
+  */
+ Token *mb_lt_next_lc(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     w = wbuf;
+     w_end = &wbuf[MAX_WORD_SIZE];
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswalpha(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     *w++ = towlower(wchr);
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswalpha(wchr)) {
+         if (w < w_end) {
+             *w++ = towlower(wchr);
+         }
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     *w = 0;
+     ts->t = t;
+     return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
+                     (off_t)(t - ts->text), 1);
+ }
+
+ TokenStream *mb_letter_tokenizer_new(bool lowercase)
+ {
+     TokenStream *ts = mb_ts_new();
+     ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
+     return ts;
+ }
+
+ /*
+  * LetterAnalyzers
+  */
+ Analyzer *letter_analyzer_new(bool lowercase)
+ {
+     TokenStream *ts;
+     if (lowercase) {
+         ts = lowercase_filter_new(letter_tokenizer_new());
+     }
+     else {
+         ts = letter_tokenizer_new();
+     }
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_letter_analyzer_new(bool lowercase)
+ {
+     return analyzer_new(mb_letter_tokenizer_new(lowercase), NULL, NULL);
+ }
+
+ /****************************************************************************
+  *
+  * Standard
+  *
+  ****************************************************************************/
+
+ #define STDTS(token_stream) ((StandardTokenizer *)(token_stream))
+
+ /*
+  * StandardTokenizer
+  */
+ static int std_get_alpha(TokenStream *ts, char *token)
+ {
+     int i = 0;
+     char *t = ts->t;
+     while (t[i] != '\0' && isalnum(t[i])) {
+         if (i < MAX_WORD_SIZE) {
+             token[i] = t[i];
+         }
+         i++;
+     }
+     return i;
+ }
+
+ static int mb_std_get_alpha(TokenStream *ts, char *token)
+ {
+     char *t = ts->t;
+     wchar_t wchr;
+     int i;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, t, &state);
+
+     while (wchr != 0 && iswalnum(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, &state);
+     }
+
+     i = (int)(t - ts->t);
+     if (i > MAX_WORD_SIZE) {
+         i = MAX_WORD_SIZE - 1;
+     }
+     memcpy(token, ts->t, i);
+     return i;
+ }
+
+ /*
+ static int std_get_alnum(TokenStream *ts, char *token)
+ {
+     int i = 0;
+     char *t = ts->t;
+     while (t[i] != '\0' && isalnum(t[i])) {
+         if (i < MAX_WORD_SIZE) {
+             token[i] = t[i];
+         }
+         i++;
+     }
+     return i;
+ }
+
+ static int mb_std_get_alnum(TokenStream *ts, char *token)
+ {
+     char *t = ts->t;
+     wchar_t wchr;
+     int i;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, t, &state);
+
+     while (wchr != 0 && iswalnum(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, &state);
+     }
+
+     i = (int)(t - ts->t);
+     if (i > MAX_WORD_SIZE) {
+         i = MAX_WORD_SIZE - 1;
+     }
+     memcpy(token, ts->t, i);
+     return i;
+ }
+ */
+
+ static int isnumpunc(char c)
+ {
+     return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
+             || c == '-');
+ }
+
+ static int w_isnumpunc(wchar_t c)
+ {
+     return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
+             || c == L'-');
+ }
+
+ static int isurlpunc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_');
+ }
+
+ static int isurlc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
+ }
+
+ static int isurlxatpunc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
+ }
+
+ static int isurlxatc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
+             || isalnum(c));
+ }
+
+ static bool std_is_tok_char(char *c)
+ {
+     if (isspace(*c)) {
+         return false; /* most common so check first. */
+     }
+     if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
+         *c == '@' || *c == '\'' || *c == ':') {
+         return true;
+     }
+     return false;
+ }
+
+ static bool mb_std_is_tok_char(char *t)
+ {
+     wchar_t c;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
+         /* error which we can handle next time round. For now just return
+          * false so that we can return a token */
+         return false;
+     }
+     if (iswspace(c)) {
+         return false; /* most common so check first. */
+     }
+     if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
+         || c == L':') {
+         return true;
+     }
+     return false;
+ }
+
+ /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
+  * least one digit.
+  * (alnum) = [a-zA-Z0-9]
+  * (punc) = [_\/.,-]
+  */
+ static int std_get_number(char *input)
+ {
+     int i = 0;
+     int count = 0;
+     int last_seen_digit = 2;
+     int seen_digit = false;
+
+     while (last_seen_digit >= 0) {
+         while ((input[i] != '\0') && isalnum(input[i])) {
+             if ((last_seen_digit < 2) && isdigit(input[i])) {
+                 last_seen_digit = 2;
+             }
+             if ((seen_digit == false) && isdigit(input[i])) {
+                 seen_digit = true;
+             }
+             i++;
+         }
+         last_seen_digit--;
+         if (!isnumpunc(input[i]) || !isalnum(input[i + 1])) {
+             if (last_seen_digit >= 0) {
+                 count = i;
+             }
+             break;
+         }
+         count = i;
+         i++;
+     }
+     if (seen_digit) {
+         return count;
+     }
+     else {
+         return 0;
+     }
+ }
+
+ static int std_get_apostrophe(char *input)
+ {
+     char *t = input;
+
+     while (isalpha(*t) || *t == '\'') {
+         t++;
+     }
+
+     return (int)(t - input);
+ }
+
+ static int mb_std_get_apostrophe(char *input)
+ {
+     char *t = input;
+     wchar_t wchr;
+     int i;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, t, &state);
+
+     while (iswalpha(wchr) || wchr == L'\'') {
+         t += i;
+         i = mb_next_char(&wchr, t, &state);
+     }
+     return (int)(t - input);
+ }
+
+ static int std_get_url(char *input, char *token, int i)
+ {
+     while (isurlc(input[i])) {
+         if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
+             break; /* can't have two puncs in a row */
+         }
+         if (i < MAX_WORD_SIZE) {
+             token[i] = input[i];
+         }
+         i++;
+     }
+
+     /* strip trailing puncs */
+     while (isurlpunc(input[i - 1])) {
+         i--;
+     }
+
+     return i;
+ }
+
+ /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
+  * allow these characters inside company-name tokens. */
+ static int std_get_company_name(char *input)
+ {
+     int i = 0;
+     while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
+         i++;
+     }
+
+     return i;
+ }
+
+ /*
+ static int mb_std_get_company_name(char *input, TokenStream *ts)
+ {
+     char *t = input;
+     wchar_t wchr;
+     int i;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, t, &state);
+     while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
+         t += i;
+         i = mb_next_char(&wchr, t, &state);
+     }
+
+     return (int)(t - input);
+ }
+ */
+
+ static bool std_advance_to_start(TokenStream *ts)
+ {
+     char *t = ts->t;
+     while (*t != '\0' && !isalnum(*t)) {
+         if (isnumpunc(*t) && isdigit(t[1])) break;
+         t++;
+     }
+
+     ts->t = t;
+
+     return (*t != '\0');
+ }
+
+ static bool mb_std_advance_to_start(TokenStream *ts)
+ {
+     int i;
+     wchar_t wchr;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, ts->t, &state);
+
+     while (wchr != 0 && !iswalnum(wchr)) {
+         if (isnumpunc(*ts->t) && isdigit(ts->t[1])) break;
+         ts->t += i;
+         i = mb_next_char(&wchr, ts->t, &state);
+     }
+
+     return (wchr != 0);
+ }
+
+ static Token *std_next(TokenStream *ts)
+ {
+     StandardTokenizer *std_tz = STDTS(ts);
+     char *s;
+     char *t;
+     char *start = NULL;
+     char *num_end = NULL;
+     char token[MAX_WORD_SIZE + 1];
+     int token_i = 0;
+     int len;
+     bool is_acronym;
+     bool seen_at_symbol;
+
+     if (!std_tz->advance_to_start(ts)) {
+         return NULL;
+     }
+
+     start = t = ts->t;
+     token_i = std_tz->get_alpha(ts, token);
+     t += token_i;
+
+     if (!std_tz->is_tok_char(t)) {
+         /* very common case, ie a plain word, so check and return */
+         ts->t = t;
+         return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+     }
+
+     if (*t == '\'') {       /* apostrophe case. */
+         t += std_tz->get_apostrophe(t);
+         ts->t = t;
+         len = (int)(t - start);
+         /* strip possessive */
+         if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
+             t -= 2;
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+             CTS(ts)->token.end += 2;
+         }
+         else if (t[-1] == '\'') {
+             t -= 1;
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+             CTS(ts)->token.end += 1;
+         }
+         else {
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+         }
+
+         return &(CTS(ts)->token);
+     }
+
+     if (*t == '&') {        /* company name case. */
+         t += std_get_company_name(t);
+         ts->t = t;
+         return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+     }
+
+     if ((isdigit(*t) || isnumpunc(*t))       /* possibly a number */
+         && ((len = std_get_number(t)) > 0)) {
+         num_end = start + len;
+         if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
+             ts->t = num_end;
+             return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+         }
+         /* else there may be a longer token so check */
+     }
+
+     if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
+         /* check for a known url start */
+         token[token_i] = '\0';
+         t += 3;
+         while (*t == '/') {
+             t++;
+         }
+         if (isalpha(*t) &&
+             (memcmp(token, "ftp", 3) == 0 ||
+              memcmp(token, "http", 4) == 0 ||
+              memcmp(token, "https", 5) == 0 ||
+              memcmp(token, "file", 4) == 0)) {
+             len = std_get_url(t, token, 0); /* dispose of first part of the URL */
+         }
+         else {              /* still treat as url but keep the first part */
+             token_i = (int)(t - start);
+             memcpy(token, start, token_i * sizeof(char));
+             len = token_i + std_get_url(t, token, token_i); /* keep start */
+         }
+         ts->t = t + len;
+         token[len] = 0;
+         return tk_set(&(CTS(ts)->token), token, len, (off_t)(start - ts->text),
+                       (off_t)(ts->t - ts->text), 1);
+     }
+
+     /* now see how long a url we can find. */
+     is_acronym = true;
+     seen_at_symbol = false;
+     while (isurlxatc(*t)) {
+         if (is_acronym && !isalpha(*t) && (*t != '.')) {
+             is_acronym = false;
+         }
+         if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
+             break; /* can't have two punctuation characters in a row */
+         }
+         if (*t == '@') {
+             if (seen_at_symbol) {
+                 break; /* we can only have one @ symbol */
+             }
+             else {
+                 seen_at_symbol = true;
+             }
+         }
+         t++;
+     }
+     while (isurlxatpunc(t[-1]) && t > ts->t) {
+         t--; /* strip trailing punctuation */
+     }
+
+     if (t < ts->t || (num_end != NULL && num_end < ts->t)) {
+         fprintf(stderr, "Warning: encoding error. Please check that you are"
+                 " using the correct locale for your input\n");
+         return NULL;
+     } else if (num_end == NULL || t > num_end) {
+         ts->t = t;
+
+         if (is_acronym) {   /* check it is one letter followed by one '.' */
+             for (s = start; s < t - 1; s++) {
+                 if (isalpha(*s) && (s[1] != '.'))
+                     is_acronym = false;
+             }
+         }
+         if (is_acronym) {   /* strip '.'s */
+             for (s = start + token_i; s < t; s++) {
+                 if (*s != '.') {
+                     token[token_i] = *s;
+                     token_i++;
+                 }
+             }
+             tk_set(&(CTS(ts)->token), token, token_i,
+                    (off_t)(start - ts->text),
+                    (off_t)(t - ts->text), 1);
+         }
+         else { /* just return the url as is */
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+         }
+     }
+     else { /* return the number */
+         ts->t = num_end;
+         tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+     }
+
+     return &(CTS(ts)->token);
+ }
+
+ static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
+ }
+
+ static TokenStream *std_ts_new()
+ {
+     TokenStream *ts = ts_new(StandardTokenizer);
+
+     ts->clone_i = &std_ts_clone_i;
+     ts->next = &std_next;
+
+     return ts;
+ }
+
+ TokenStream *standard_tokenizer_new()
+ {
+     TokenStream *ts = std_ts_new();
+
+     STDTS(ts)->advance_to_start = &std_advance_to_start;
+     STDTS(ts)->get_alpha = &std_get_alpha;
+     STDTS(ts)->is_tok_char = &std_is_tok_char;
+     STDTS(ts)->get_apostrophe = &std_get_apostrophe;
+
+     return ts;
+ }
+
+ TokenStream *mb_standard_tokenizer_new()
+ {
+     TokenStream *ts = std_ts_new();
+
+     STDTS(ts)->advance_to_start = &mb_std_advance_to_start;
+     STDTS(ts)->get_alpha = &mb_std_get_alpha;
+     STDTS(ts)->is_tok_char = &mb_std_is_tok_char;
+     STDTS(ts)->get_apostrophe = &mb_std_get_apostrophe;
+
+     return ts;
+ }
+
+ /****************************************************************************
+  *
+  * Filters
+  *
+  ****************************************************************************/
+
+ #define TkFilt(filter) ((TokenFilter *)(filter))
+
+ TokenStream *filter_clone_size(TokenStream *ts, size_t size)
+ {
+     TokenStream *ts_new = ts_clone_size(ts, size);
+     TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
+     return ts_new;
+ }
+
+ static TokenStream *filter_clone_i(TokenStream *ts)
+ {
+     return filter_clone_size(ts, sizeof(TokenFilter));
+ }
+
+ static TokenStream *filter_reset(TokenStream *ts, char *text)
+ {
+     TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
+     return ts;
+ }
+
+ static void filter_destroy_i(TokenStream *ts)
+ {
+     ts_deref(TkFilt(ts)->sub_ts);
+     free(ts);
+ }
+
+ #define tf_new(type, sub) tf_new_i(sizeof(type), sub)
+ TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
+ {
+     TokenStream *ts = (TokenStream *)ecalloc(size);
+
+     TkFilt(ts)->sub_ts = sub_ts;
+
+     ts->clone_i = &filter_clone_i;
+     ts->destroy_i = &filter_destroy_i;
+     ts->reset = &filter_reset;
+     ts->ref_cnt = 1;
+
+     return ts;
+ }
+
+ /****************************************************************************
+  * StopFilter
+  ****************************************************************************/
+
+ #define StopFilt(filter) ((StopFilter *)(filter))
+
+ static void sf_destroy_i(TokenStream *ts)
+ {
+     h_destroy(StopFilt(ts)->words);
+     filter_destroy_i(ts);
+ }
+
+ static TokenStream *sf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StopFilter));
+     REF(StopFilt(new_ts)->words);
+     return new_ts;
+ }
+
+ static Token *sf_next(TokenStream *ts)
+ {
+     int pos_inc = 0;
+     HashTable *words = StopFilt(ts)->words;
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = tf->sub_ts->next(tf->sub_ts);
+
+     while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
+         pos_inc += tk->pos_inc;
+         tk = tf->sub_ts->next(tf->sub_ts);
+     }
+
+     if (tk != NULL) {
+         tk->pos_inc += pos_inc;
+     }
+
+     return tk;
+ }
+
+ TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
+                                             const char **words, int len)
+ {
+     int i;
+     char *word;
+     HashTable *word_table = h_new_str(&free, (free_ft) NULL);
+     TokenStream *ts = tf_new(StopFilter, sub_ts);
+
+     for (i = 0; i < len; i++) {
+         word = estrdup(words[i]);
+         h_set(word_table, word, word);
+     }
+     StopFilt(ts)->words = word_table;
+     ts->next = &sf_next;
+     ts->destroy_i = &sf_destroy_i;
+     ts->clone_i = &sf_clone_i;
+     return ts;
+ }
+
+ TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
+                                         const char **words)
+ {
+     char *word;
+     HashTable *word_table = h_new_str(&free, (free_ft) NULL);
+     TokenStream *ts = tf_new(StopFilter, sub_ts);
+
+     while (*words) {
+         word = estrdup(*words);
+         h_set(word_table, word, word);
+         words++;
+     }
+
+     StopFilt(ts)->words = word_table;
+     ts->next = &sf_next;
+     ts->destroy_i = &sf_destroy_i;
+     ts->clone_i = &sf_clone_i;
+     return ts;
+ }
+
+ TokenStream *stop_filter_new(TokenStream *ts)
+ {
+     return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
+ }
+
+ /****************************************************************************
+  * MappingFilter
+  ****************************************************************************/
+
+ #define MFilt(filter) ((MappingFilter *)(filter))
+
+ static void mf_destroy_i(TokenStream *ts)
+ {
+     mulmap_destroy(MFilt(ts)->mapper);
+     filter_destroy_i(ts);
+ }
+
+ static TokenStream *mf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
+     REF(MFilt(new_ts)->mapper);
+     return new_ts;
+ }
+
+ static Token *mf_next(TokenStream *ts)
+ {
+     char buf[MAX_WORD_SIZE + 1];
+     MultiMapper *mapper = MFilt(ts)->mapper;
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = tf->sub_ts->next(tf->sub_ts);
+     if (tk != NULL) {
+         tk->len = mulmap_map_len(mapper, buf, tk->text, MAX_WORD_SIZE);
+         memcpy(tk->text, buf, tk->len + 1);
+     }
+     return tk;
+ }
+
+ static TokenStream *mf_reset(TokenStream *ts, char *text)
+ {
+     MultiMapper *mm = MFilt(ts)->mapper;
+     if (mm->d_size == 0) {
+         mulmap_compile(MFilt(ts)->mapper);
+     }
+     filter_reset(ts, text);
+     return ts;
+ }
+
+ TokenStream *mapping_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(MappingFilter, sub_ts);
+     MFilt(ts)->mapper = mulmap_new();
+     ts->next = &mf_next;
+     ts->destroy_i = &mf_destroy_i;
+     ts->clone_i = &mf_clone_i;
+     ts->reset = &mf_reset;
+     return ts;
+ }
+
+ TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
+                                 const char *replacement)
+ {
+     mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
+     return ts;
+ }
+
+ /****************************************************************************
+  * HyphenFilter
+  ****************************************************************************/
+
+ #define HyphenFilt(filter) ((HyphenFilter *)(filter))
+
+ static TokenStream *hf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(HyphenFilter));
+     return new_ts;
+ }
+
+ static Token *hf_next(TokenStream *ts)
+ {
+     HyphenFilter *hf = HyphenFilt(ts);
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = hf->tk;
+
+     if (hf->pos < hf->len) {
+         const int pos = hf->pos;
+         const int text_len = strlen(hf->text + pos);
+         strcpy(tk->text, hf->text + pos);
+         tk->pos_inc = ((pos != 0) ? 1 : 0);
+         tk->start = hf->start + pos;
+         tk->end = tk->start + text_len;
+         hf->pos += text_len + 1;
+         tk->len = text_len;
+         return tk;
+     }
+     else {
+         char *p;
+         bool seen_hyphen = false;
+         bool seen_other_punc = false;
+         hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
+         if (NULL == tk) return NULL;
+         p = tk->text + 1;
+         while (*p) {
+             if (*p == '-') {
+                 seen_hyphen = true;
+             }
+             else if (!isalpha(*p)) {
+                 seen_other_punc = true;
+                 break;
+             }
+             p++;
+         }
+         if (seen_hyphen && !seen_other_punc) {
+             char *q = hf->text;
+             char *r = tk->text;
+             p = tk->text;
+             while (*p) {
+                 if (*p == '-') {
+                     *q = '\0';
+                 }
+                 else {
+                     *r = *q = *p;
+                     r++;
+                 }
+                 q++;
+                 p++;
+             }
+             *r = *q = '\0';
+             hf->start = tk->start;
+             hf->pos = 0;
+             hf->len = q - hf->text;
+             tk->len = r - tk->text;
+         }
+     }
+     return tk;
+ }
+
+ TokenStream *hyphen_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(HyphenFilter, sub_ts);
+     ts->next = &hf_next;
+     ts->clone_i = &hf_clone_i;
+     return ts;
+ }
+
+ /****************************************************************************
+  * LowerCaseFilter
+  ****************************************************************************/
+
+ Token *mb_lcf_next(TokenStream *ts)
+ {
+     wchar_t wbuf[MAX_WORD_SIZE + 1], *wchr;
+     Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
+     int x;
+     wbuf[MAX_WORD_SIZE] = 0;
+
+     if (tk == NULL) {
+         return tk;
+     }
+
+     if ((x = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE)) <= 0) return tk;
+     wchr = wbuf;
+     while (*wchr != 0) {
+         *wchr = towlower(*wchr);
+         wchr++;
+     }
+     tk->len = wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
+     if (tk->len <= 0) {
+         strcpy(tk->text, "BAD_DATA");
+         tk->len = 8;
+     }
+     tk->text[tk->len] = '\0';
+     return tk;
+ }
+
+ TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(TokenFilter, sub_ts);
+     ts->next = &mb_lcf_next;
+     return ts;
+ }
+
+ Token *lcf_next(TokenStream *ts)
+ {
+     int i = 0;
+     Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
+     if (tk == NULL) {
+         return tk;
+     }
+     while (tk->text[i] != '\0') {
+         tk->text[i] = tolower(tk->text[i]);
+         i++;
+     }
+     return tk;
+ }
+
+ TokenStream *lowercase_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(TokenFilter, sub_ts);
+     ts->next = &lcf_next;
+     return ts;
+ }
+
+ /****************************************************************************
+  * StemFilter
+  ****************************************************************************/
+
+ #define StemFilt(filter) ((StemFilter *)(filter))
+
+ void stemf_destroy_i(TokenStream *ts)
+ {
+     sb_stemmer_delete(StemFilt(ts)->stemmer);
+     free(StemFilt(ts)->algorithm);
+     free(StemFilt(ts)->charenc);
+     filter_destroy_i(ts);
+ }
+
+ Token *stemf_next(TokenStream *ts)
+ {
+     int len;
+     const sb_symbol *stemmed;
+     struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = tf->sub_ts->next(tf->sub_ts);
+     if (tk == NULL) {
+         return tk;
+     }
+     stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
+     len = sb_stemmer_length(stemmer);
+     if (len >= MAX_WORD_SIZE) {
+         len = MAX_WORD_SIZE - 1;
+     }
+
+     memcpy(tk->text, stemmed, len);
+     tk->text[len] = '\0';
+     tk->len = len;
+     return tk;
+ }
+
+ TokenStream *stemf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
+     StemFilter *stemf = StemFilt(new_ts);
+     StemFilter *orig_stemf = StemFilt(orig_ts);
+     stemf->stemmer =
+         sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
+     stemf->algorithm =
+         orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
+     stemf->charenc =
+         orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
+     return new_ts;
+ }
+
+ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
+                              const char *charenc)
+ {
+     TokenStream *tf = tf_new(StemFilter, ts);
+
+     StemFilt(tf)->stemmer = sb_stemmer_new(algorithm, charenc);
+     StemFilt(tf)->algorithm = algorithm ? estrdup(algorithm) : NULL;
+     StemFilt(tf)->charenc = charenc ? estrdup(charenc) : NULL;
+
+     tf->next = &stemf_next;
+     tf->destroy_i = &stemf_destroy_i;
+     tf->clone_i = &stemf_clone_i;
+     return tf;
+ }
+
+ /****************************************************************************
+  *
+  * Analyzers
+  *
+  ****************************************************************************/
+
+ /****************************************************************************
+  * Standard
+  ****************************************************************************/
+
+ Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
+                                                bool lowercase)
+ {
+     TokenStream *ts = standard_tokenizer_new();
+     if (lowercase) {
+         ts = lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *standard_analyzer_new_with_words(const char **words,
+                                            bool lowercase)
+ {
+     TokenStream *ts = standard_tokenizer_new();
+     if (lowercase) {
+         ts = lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
+                                                   int len, bool lowercase)
+ {
+     TokenStream *ts = mb_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_standard_analyzer_new_with_words(const char **words,
+                                               bool lowercase)
+ {
+     TokenStream *ts = mb_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *standard_analyzer_new(bool lowercase)
+ {
+     return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                             lowercase);
+ }
+
+ Analyzer *mb_standard_analyzer_new(bool lowercase)
+ {
+     return mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                                lowercase);
+ }
+
+ /****************************************************************************
+  *
+  * PerFieldAnalyzer
+  *
+  ****************************************************************************/
+
+ #define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
+
+ void pfa_destroy_i(Analyzer *self)
+ {
+     h_destroy(PFA(self)->dict);
+
+     a_deref(PFA(self)->default_a);
+     free(self);
+ }
+
+ TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
+ {
+     Analyzer *a = h_get(PFA(self)->dict, field);
+     if (a == NULL) {
+         a = PFA(self)->default_a;
+     }
+     return a_get_ts(a, field, text);
+ }
+
+ void pfa_sub_a_destroy_i(void *p)
+ {
+     Analyzer *a = (Analyzer *) p;
+     a_deref(a);
+ }
+
+ void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
+ {
+     h_set(PFA(self)->dict, estrdup(field), analyzer);
+ }
+
+ Analyzer *per_field_analyzer_new(Analyzer *default_a)
+ {
+     Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
+
+     PFA(a)->default_a = default_a;
+     PFA(a)->dict = h_new_str(&free, &pfa_sub_a_destroy_i);
+
+     a->destroy_i = &pfa_destroy_i;
+     a->get_ts = pfa_get_ts;
+     a->ref_cnt = 1;
+
+     return a;
+ }
+
+ #ifdef ALONE
+ int main(int argc, char **argv)
+ {
+     char buf[10000];
+     Analyzer *a = standard_analyzer_new(true);
+     TokenStream *ts;
+     Token *tk;
+     while (fgets(buf, 9999, stdin) != NULL) {
+         ts = a_get_ts(a, "hello", buf);
+         while ((tk = ts->next(ts)) != NULL) {
+             printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
+         }
+         printf("\n");
+         ts_deref(ts);
+     }
+     return 0;
+ }
+ #endif