np-ferret 0.11.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (275) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/Makefile +218 -0
  9. data/ext/analysis.c +1584 -0
  10. data/ext/analysis.h +219 -0
  11. data/ext/analysis.o +0 -0
  12. data/ext/api.c +69 -0
  13. data/ext/api.h +27 -0
  14. data/ext/api.o +0 -0
  15. data/ext/array.c +123 -0
  16. data/ext/array.h +53 -0
  17. data/ext/array.o +0 -0
  18. data/ext/bitvector.c +540 -0
  19. data/ext/bitvector.h +272 -0
  20. data/ext/bitvector.o +0 -0
  21. data/ext/compound_io.c +383 -0
  22. data/ext/compound_io.o +0 -0
  23. data/ext/config.h +42 -0
  24. data/ext/document.c +156 -0
  25. data/ext/document.h +53 -0
  26. data/ext/document.o +0 -0
  27. data/ext/except.c +120 -0
  28. data/ext/except.h +168 -0
  29. data/ext/except.o +0 -0
  30. data/ext/extconf.rb +14 -0
  31. data/ext/ferret.c +402 -0
  32. data/ext/ferret.h +91 -0
  33. data/ext/ferret.o +0 -0
  34. data/ext/ferret_ext.bundle +0 -0
  35. data/ext/filter.c +156 -0
  36. data/ext/filter.o +0 -0
  37. data/ext/fs_store.c +484 -0
  38. data/ext/fs_store.o +0 -0
  39. data/ext/global.c +418 -0
  40. data/ext/global.h +117 -0
  41. data/ext/global.o +0 -0
  42. data/ext/hash.c +598 -0
  43. data/ext/hash.h +475 -0
  44. data/ext/hash.o +0 -0
  45. data/ext/hashset.c +170 -0
  46. data/ext/hashset.h +187 -0
  47. data/ext/hashset.o +0 -0
  48. data/ext/header.h +58 -0
  49. data/ext/helper.c +62 -0
  50. data/ext/helper.h +13 -0
  51. data/ext/helper.o +0 -0
  52. data/ext/inc/lang.h +48 -0
  53. data/ext/inc/threading.h +31 -0
  54. data/ext/index.c +6510 -0
  55. data/ext/index.h +964 -0
  56. data/ext/index.o +0 -0
  57. data/ext/lang.h +66 -0
  58. data/ext/libstemmer.c +92 -0
  59. data/ext/libstemmer.h +79 -0
  60. data/ext/libstemmer.o +0 -0
  61. data/ext/mempool.c +87 -0
  62. data/ext/mempool.h +35 -0
  63. data/ext/mempool.o +0 -0
  64. data/ext/modules.h +162 -0
  65. data/ext/multimapper.c +310 -0
  66. data/ext/multimapper.h +51 -0
  67. data/ext/multimapper.o +0 -0
  68. data/ext/posh.c +1006 -0
  69. data/ext/posh.h +1007 -0
  70. data/ext/posh.o +0 -0
  71. data/ext/priorityqueue.c +151 -0
  72. data/ext/priorityqueue.h +143 -0
  73. data/ext/priorityqueue.o +0 -0
  74. data/ext/q_boolean.c +1608 -0
  75. data/ext/q_boolean.o +0 -0
  76. data/ext/q_const_score.c +165 -0
  77. data/ext/q_const_score.o +0 -0
  78. data/ext/q_filtered_query.c +209 -0
  79. data/ext/q_filtered_query.o +0 -0
  80. data/ext/q_fuzzy.c +335 -0
  81. data/ext/q_fuzzy.o +0 -0
  82. data/ext/q_match_all.c +148 -0
  83. data/ext/q_match_all.o +0 -0
  84. data/ext/q_multi_term.c +677 -0
  85. data/ext/q_multi_term.o +0 -0
  86. data/ext/q_parser.c +2825 -0
  87. data/ext/q_parser.o +0 -0
  88. data/ext/q_phrase.c +1126 -0
  89. data/ext/q_phrase.o +0 -0
  90. data/ext/q_prefix.c +100 -0
  91. data/ext/q_prefix.o +0 -0
  92. data/ext/q_range.c +356 -0
  93. data/ext/q_range.o +0 -0
  94. data/ext/q_span.c +2402 -0
  95. data/ext/q_span.o +0 -0
  96. data/ext/q_term.c +337 -0
  97. data/ext/q_term.o +0 -0
  98. data/ext/q_wildcard.c +171 -0
  99. data/ext/q_wildcard.o +0 -0
  100. data/ext/r_analysis.c +2636 -0
  101. data/ext/r_analysis.o +0 -0
  102. data/ext/r_index.c +3509 -0
  103. data/ext/r_index.o +0 -0
  104. data/ext/r_qparser.c +585 -0
  105. data/ext/r_qparser.o +0 -0
  106. data/ext/r_search.c +4240 -0
  107. data/ext/r_search.o +0 -0
  108. data/ext/r_store.c +513 -0
  109. data/ext/r_store.o +0 -0
  110. data/ext/r_utils.c +963 -0
  111. data/ext/r_utils.o +0 -0
  112. data/ext/ram_store.c +471 -0
  113. data/ext/ram_store.o +0 -0
  114. data/ext/search.c +1743 -0
  115. data/ext/search.h +885 -0
  116. data/ext/search.o +0 -0
  117. data/ext/similarity.c +150 -0
  118. data/ext/similarity.h +82 -0
  119. data/ext/similarity.o +0 -0
  120. data/ext/sort.c +985 -0
  121. data/ext/sort.o +0 -0
  122. data/ext/stem_ISO_8859_1_danish.c +338 -0
  123. data/ext/stem_ISO_8859_1_danish.h +16 -0
  124. data/ext/stem_ISO_8859_1_danish.o +0 -0
  125. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  126. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.o +0 -0
  128. data/ext/stem_ISO_8859_1_english.c +1156 -0
  129. data/ext/stem_ISO_8859_1_english.h +16 -0
  130. data/ext/stem_ISO_8859_1_english.o +0 -0
  131. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  132. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  133. data/ext/stem_ISO_8859_1_finnish.o +0 -0
  134. data/ext/stem_ISO_8859_1_french.c +1276 -0
  135. data/ext/stem_ISO_8859_1_french.h +16 -0
  136. data/ext/stem_ISO_8859_1_french.o +0 -0
  137. data/ext/stem_ISO_8859_1_german.c +512 -0
  138. data/ext/stem_ISO_8859_1_german.h +16 -0
  139. data/ext/stem_ISO_8859_1_german.o +0 -0
  140. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  141. data/ext/stem_ISO_8859_1_italian.h +16 -0
  142. data/ext/stem_ISO_8859_1_italian.o +0 -0
  143. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  144. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  145. data/ext/stem_ISO_8859_1_norwegian.o +0 -0
  146. data/ext/stem_ISO_8859_1_porter.c +776 -0
  147. data/ext/stem_ISO_8859_1_porter.h +16 -0
  148. data/ext/stem_ISO_8859_1_porter.o +0 -0
  149. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  150. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  151. data/ext/stem_ISO_8859_1_portuguese.o +0 -0
  152. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  153. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  154. data/ext/stem_ISO_8859_1_spanish.o +0 -0
  155. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  156. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  157. data/ext/stem_ISO_8859_1_swedish.o +0 -0
  158. data/ext/stem_KOI8_R_russian.c +701 -0
  159. data/ext/stem_KOI8_R_russian.h +16 -0
  160. data/ext/stem_KOI8_R_russian.o +0 -0
  161. data/ext/stem_UTF_8_danish.c +344 -0
  162. data/ext/stem_UTF_8_danish.h +16 -0
  163. data/ext/stem_UTF_8_danish.o +0 -0
  164. data/ext/stem_UTF_8_dutch.c +653 -0
  165. data/ext/stem_UTF_8_dutch.h +16 -0
  166. data/ext/stem_UTF_8_dutch.o +0 -0
  167. data/ext/stem_UTF_8_english.c +1176 -0
  168. data/ext/stem_UTF_8_english.h +16 -0
  169. data/ext/stem_UTF_8_english.o +0 -0
  170. data/ext/stem_UTF_8_finnish.c +808 -0
  171. data/ext/stem_UTF_8_finnish.h +16 -0
  172. data/ext/stem_UTF_8_finnish.o +0 -0
  173. data/ext/stem_UTF_8_french.c +1296 -0
  174. data/ext/stem_UTF_8_french.h +16 -0
  175. data/ext/stem_UTF_8_french.o +0 -0
  176. data/ext/stem_UTF_8_german.c +526 -0
  177. data/ext/stem_UTF_8_german.h +16 -0
  178. data/ext/stem_UTF_8_german.o +0 -0
  179. data/ext/stem_UTF_8_italian.c +1113 -0
  180. data/ext/stem_UTF_8_italian.h +16 -0
  181. data/ext/stem_UTF_8_italian.o +0 -0
  182. data/ext/stem_UTF_8_norwegian.c +302 -0
  183. data/ext/stem_UTF_8_norwegian.h +16 -0
  184. data/ext/stem_UTF_8_norwegian.o +0 -0
  185. data/ext/stem_UTF_8_porter.c +794 -0
  186. data/ext/stem_UTF_8_porter.h +16 -0
  187. data/ext/stem_UTF_8_porter.o +0 -0
  188. data/ext/stem_UTF_8_portuguese.c +1055 -0
  189. data/ext/stem_UTF_8_portuguese.h +16 -0
  190. data/ext/stem_UTF_8_portuguese.o +0 -0
  191. data/ext/stem_UTF_8_russian.c +709 -0
  192. data/ext/stem_UTF_8_russian.h +16 -0
  193. data/ext/stem_UTF_8_russian.o +0 -0
  194. data/ext/stem_UTF_8_spanish.c +1137 -0
  195. data/ext/stem_UTF_8_spanish.h +16 -0
  196. data/ext/stem_UTF_8_spanish.o +0 -0
  197. data/ext/stem_UTF_8_swedish.c +313 -0
  198. data/ext/stem_UTF_8_swedish.h +16 -0
  199. data/ext/stem_UTF_8_swedish.o +0 -0
  200. data/ext/stopwords.c +401 -0
  201. data/ext/stopwords.o +0 -0
  202. data/ext/store.c +692 -0
  203. data/ext/store.h +777 -0
  204. data/ext/store.o +0 -0
  205. data/ext/term_vectors.c +352 -0
  206. data/ext/term_vectors.o +0 -0
  207. data/ext/threading.h +31 -0
  208. data/ext/utilities.c +446 -0
  209. data/ext/utilities.o +0 -0
  210. data/ext/win32.h +54 -0
  211. data/ferret.gemspec +39 -0
  212. data/lib/ferret.rb +29 -0
  213. data/lib/ferret/browser.rb +246 -0
  214. data/lib/ferret/browser/s/global.js +192 -0
  215. data/lib/ferret/browser/s/style.css +148 -0
  216. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  217. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  218. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  219. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  220. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  221. data/lib/ferret/browser/views/layout.rhtml +22 -0
  222. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  223. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  224. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  225. data/lib/ferret/browser/webrick.rb +14 -0
  226. data/lib/ferret/document.rb +130 -0
  227. data/lib/ferret/field_infos.rb +44 -0
  228. data/lib/ferret/index.rb +786 -0
  229. data/lib/ferret/number_tools.rb +157 -0
  230. data/lib/ferret_ext.bundle +0 -0
  231. data/lib/ferret_version.rb +3 -0
  232. data/pkg/ferret-0.11.6.gem +0 -0
  233. data/pkg/ferret-0.11.6.tgz +0 -0
  234. data/pkg/ferret-0.11.6.zip +0 -0
  235. data/setup.rb +1555 -0
  236. data/test/test_all.rb +5 -0
  237. data/test/test_helper.rb +24 -0
  238. data/test/threading/number_to_spoken.rb +132 -0
  239. data/test/threading/thread_safety_index_test.rb +79 -0
  240. data/test/threading/thread_safety_read_write_test.rb +76 -0
  241. data/test/threading/thread_safety_test.rb +133 -0
  242. data/test/unit/analysis/tc_analyzer.rb +548 -0
  243. data/test/unit/analysis/tc_token_stream.rb +646 -0
  244. data/test/unit/index/tc_index.rb +762 -0
  245. data/test/unit/index/tc_index_reader.rb +699 -0
  246. data/test/unit/index/tc_index_writer.rb +437 -0
  247. data/test/unit/index/th_doc.rb +315 -0
  248. data/test/unit/largefile/tc_largefile.rb +46 -0
  249. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  250. data/test/unit/search/tc_filter.rb +135 -0
  251. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  252. data/test/unit/search/tc_index_searcher.rb +61 -0
  253. data/test/unit/search/tc_multi_searcher.rb +128 -0
  254. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  255. data/test/unit/search/tc_search_and_sort.rb +179 -0
  256. data/test/unit/search/tc_sort.rb +49 -0
  257. data/test/unit/search/tc_sort_field.rb +27 -0
  258. data/test/unit/search/tc_spans.rb +190 -0
  259. data/test/unit/search/tm_searcher.rb +384 -0
  260. data/test/unit/store/tc_fs_store.rb +77 -0
  261. data/test/unit/store/tc_ram_store.rb +35 -0
  262. data/test/unit/store/tm_store.rb +34 -0
  263. data/test/unit/store/tm_store_lock.rb +68 -0
  264. data/test/unit/tc_document.rb +81 -0
  265. data/test/unit/ts_analysis.rb +2 -0
  266. data/test/unit/ts_index.rb +2 -0
  267. data/test/unit/ts_largefile.rb +4 -0
  268. data/test/unit/ts_query_parser.rb +2 -0
  269. data/test/unit/ts_search.rb +2 -0
  270. data/test/unit/ts_store.rb +2 -0
  271. data/test/unit/ts_utils.rb +2 -0
  272. data/test/unit/utils/tc_bit_vector.rb +295 -0
  273. data/test/unit/utils/tc_number_tools.rb +117 -0
  274. data/test/unit/utils/tc_priority_queue.rb +106 -0
  275. metadata +392 -0
data/TODO ADDED
@@ -0,0 +1,17 @@
1
+ = TODO
2
+
3
+ * user defined sorting
4
+ * add field compression
5
+ * Fix highlighting to work for compressed fields
6
+ * Fix highlighting to work for external fields
7
+ * Add Ferret::Index::Index
8
+ * Fix:
9
+ > Working Query: field1:value1 AND NOT field2:value2
10
+ > Failing Query: field1:value1 AND ( NOT field2:value2 )
11
+
12
+ = Done
13
+ * Add string Sort descriptor
14
+ * fix memory bug
15
+ * add MultiReader interface
16
+ * add lexicographical sort (byte sort)
17
+ * Add highlighting
data/TUTORIAL ADDED
@@ -0,0 +1,231 @@
1
+ = Quick Introduction to Ferret
2
+
3
+ The simplest way to use Ferret is through the Ferret::Index::Index class.
4
+ This is now aliased by Ferret::I for quick and easy access. Start by including
5
+ the Ferret module.
6
+
7
+ require 'ferret'
8
+ include Ferret
9
+
10
+ === Creating an index
11
+
12
+ Creating an in-memory index is very simple;
13
+
14
+ index = Index::Index.new()
15
+
16
+ To create a persistent index;
17
+
18
+ index = Index::Index.new(:path => '/path/to/index')
19
+
20
+ Both of these methods create new Indexes with the StandardAnalyzer. An
21
+ analyzer is what you use to divide the input data up into tokens which you can
22
+ search for later. If you'd like to use a different analyzer you can specify it
23
+ here, eg;
24
+
25
+ index = Index::Index.new(:path => '/path/to/index',
26
+ :analyzer => Analysis::WhiteSpaceAnalyzer.new)
27
+
28
+ For more options when creating an Index refer to Ferret::Index::Index.
29
+
30
+ === Adding Documents
31
+
32
+ To add a document you can simply add a string or an array of strings. This will
33
+ store all the strings in the "" (ie empty string) field (unless you specify the
34
+ default field when you create the index).
35
+
36
+ index << "This is a new document to be indexed"
37
+ index << ["And here", "is another", "new document", "to be indexed"]
38
+
39
+ But these are pretty simple documents. If this is all you want to index you
40
+ could probably just use SimpleSearch. So let's give our documents some fields;
41
+
42
+ index << {:title => "Programming Ruby", :content => "blah blah blah"}
43
+ index << {:title => "Programming Ruby", :content => "yada yada yada"}
44
+
45
+ Note the way that all field-names are Symbols. Although Strings will work,
46
+ this is a best-practice in Ferret. Or if you are indexing data stored in a
47
+ database, you'll probably want to store the id;
48
+
49
+ index << {:id => row.id, :title => row.title, :date => row.date}
50
+
51
+ So far we have been storing and tokenizing all of the input data along with
52
+ term vectors. If we want to change this we need to change the way we setup the
53
+ index. You must create a FieldInfos object describing the index:
54
+
55
+ field_infos = FieldInfos.new(:store => :no,
56
+ :index => :untokenized_omit_norms,
57
+ :term_vector => :no)
58
+
59
+ The values that you set FieldInfos to have will be used by default by all
60
+ fields. If you want to change the properties for specific fields, you need to
61
+ add a FieldInfo to field_infos.
62
+
63
+ field_infos.add_field(:title, :store => :yes, :index => :yes, :boost => 10.0)
64
+ field_infos.add_field(:content, :store => :yes,
65
+ :index => :yes,
66
+ :term_vector => :with_positions_offsets)
67
+
68
+ If you need to add a field to an already open index you do so like this:
69
+
70
+ index.field_infos.add_field(:new_field, :store => :yes)
71
+
72
+ === Searching
73
+
74
+ Now that we have data in our index, how do we actually use this index to
75
+ search the data? The Index offers two search methods, Index#search and
76
+ Index#search_each. The first method returns a Ferret::Search::TopDocs object.
77
+ The second we'll show here. Let's say we wanted to find all documents with the
78
+ phrase "quick brown fox" in the content field. We'd write;
79
+
80
+ index.search_each('content:"quick brown fox"') do |id, score|
81
+ puts "Document #{id} found with a score of #{score}"
82
+ end
83
+
84
+ But "fast" has a pretty similar meaning to "quick" and we don't mind if the
85
+ fox is a little red. Also, the phrase could be in the title so we'll search
86
+ there as well. So we could expand our search like this;
87
+
88
+ index.search_each('title|content:"quick|fast brown|red fox"') do |id, score|
89
+ puts "Document #{id} found with a score of #{score}"
90
+ end
91
+
92
+ What if we want to find all documents entered on or after 5th of September,
93
+ 2005 with the words "ruby" or "rails" in any field? We could type something like;
94
+
95
+ index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |id, score|
96
+ puts "Document #{index[id][:title]} found with a score of #{score}"
97
+ end
98
+
99
+ Ferret has quite a complex query language. To find out more about Ferret's
100
+ query language, see Ferret::QueryParser. You can also construct even more
101
+ complex queries like Ferret::Search::Spans by hand. See Ferret::Search::Query
102
+ for more information.
103
+
104
+ === Highlighting
105
+
106
+ Ferret now has a super-fast highlighting method. See
107
+ Ferret::Index::Index#highlight. Here is an example of how you would use it
108
+ when printing to the console:
109
+
110
+ index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |id, score|
111
+ puts "Document #{index[id][:title]} found with a score of #{score}"
112
+ highlights = index.highlight("content:(ruby OR rails)", id,
113
+ :field => :content,
114
+ :pre_tag => "\033[36m",
115
+ :post_tag => "\033[m")
116
+ puts highlights
117
+ end
118
+
119
+ And if you want to highlight a whole document, set :excerpt_length to :all:
120
+
121
+ puts index.highlight(query, doc_id,
122
+ :field => :content,
123
+ :pre_tag => "\033[36m",
124
+ :post_tag => "\033[m",
125
+ :excerpt_length => :all)
126
+
127
+ === Accessing Documents
128
+
129
+ You may have noticed that when we run a search we only get the document id
130
+ back. By itself this isn't much use to us. Getting the data from the index is
131
+ very straightforward. For example, if we want the :title field from the 3rd
132
+ document, type;
133
+
134
+ index[2][:title]
135
+
136
+ Documents are lazy loading so if you try this:
137
+
138
+ puts index[2]
139
+
140
+ You will always get an empty Hash. To load all fields, call the load method:
141
+
142
+ puts index[2].load
143
+
144
+ NOTE: documents are indexed from 0. You can also use array-like index
145
+ parameters to access index. For example
146
+
147
+ index[1..4]
148
+ index[10, 10]
149
+ index[-5]
150
+
151
+ The default field is :id (although you can change this with index's
152
+ :default_create_field parameter);
153
+
154
+ index << "This is a document"
155
+ index[0][:id]
156
+
157
+ Let's go back to the database example above. If we store all of our documents
158
+ with an id then we can access that field using the id. As long as we called
159
+ our id field :id we can do this
160
+
161
+ index["89721347"]["title"]
162
+
163
+ Pretty simple, huh? You should note though that if there is more than one
164
+ document with the same *id* or *key* then only the first one will be returned
165
+ so it is probably better that you ensure the key is unique somehow. By setting
166
+ Index's :key attribute to :id, Ferret will do this automatically for you. It
167
+ can even handle multiple field primary keys. For example, you could set
168
+ :key to [:id, :model] and Ferret would keep the documents unique for that pair
169
+ of fields.
170
+
171
+ === Modifying and Deleting Documents
172
+
173
+ What if we want to change the data in the index? Ferret doesn't actually let
174
+ you change the data once it is in the index. But you can delete documents so
175
+ the standard way to modify data is to delete it and re-add it again with the
176
+ modifications made. It is important to note that when doing this the documents
177
+ will get a new document number so you should be careful not to use a document
178
+ number after the document has been deleted. Here is an example of modifying a
179
+ document;
180
+
181
+ index << {:title => "Programing Rbuy", :content => "blah blah blah"}
182
+ doc_id = nil
183
+ index.search_each('title:"Programing Rbuy"') {|id, score| doc_id = id}
184
+ return unless doc_id
185
+ doc = index[doc_id]
186
+ index.delete(doc_id)
187
+
188
+ # modify doc. It is just a Hash after all
189
+ doc[:title] = "Programming Ruby"
190
+
191
+ index << doc
192
+
193
+ If you set the :key parameter as described in the last section there is no
194
+ need to delete the document. It will be automatically deleted when you add
195
+ another document with the same key.
196
+
197
+ Also, we can use the id field, as above, to delete documents. This time though
198
+ every document that matches the id will be deleted. Again, it is probably a
199
+ good idea if you somehow ensure that your *ids* are kept unique.
200
+
201
+ id = "23453422"
202
+ index.delete(id)
203
+
204
+ === Onwards
205
+
206
+ This is just a small sampling of what Ferret allows you to do. Ferret, like
207
+ Lucene, is designed to be extended, and allows you to construct your own query
208
+ types, analyzers, and so on. Going onwards you should check out the following
209
+ documentation:
210
+
211
+ * Ferret::Analysis: for more information on how the data is processed when it
212
+ is tokenized. There are a number of things you can do with your data such as
213
+ adding stop lists or perhaps a porter stemmer. There are also a number of
214
+ analyzers already available and it is almost trivial to create a new one
215
+ with a simple regular expression.
216
+
217
+ * Ferret::Search: for more information on querying the index. There are a
218
+ number of already available queries and it's unlikely you'll need to create
219
+ your own. You may however want to take advantage of the sorting or filtering
220
+ abilities of Ferret to present your data the best way you see fit.
221
+
222
+ * Ferret::QueryParser: if you want to find out more about what you can do with
223
+ Ferret's Query Parser, this is the place to look. The query parser is one
224
+ area that could use a bit of work so please send your suggestions.
225
+
226
+ * Ferret::Index: for more advanced access to the index you'll probably want to
227
+ use the Ferret::Index::IndexWriter and Ferret::Index::IndexReader. This is
228
+ the place to look for more information on them.
229
+
230
+ * Ferret::Store: This is the module used to access the actual index storage
231
+ and won't be of much interest to most people.
@@ -0,0 +1,79 @@
#!/usr/bin/env ruby
#
# ferret-browser -- start a small web server for browsing a Ferret index.
#
# Usage: ferret-browser [options] /path/to/index
#
# Exactly one positional argument (the index path) is required; run with
# --help for the list of options.

# Put the lib directory that sits beside this script's directory on the load
# path. File.dirname is required here: File.basename yields the script's own
# file name, which made '../lib' resolve against the current working
# directory instead of the script's location.
$: << File.expand_path(File.join(File.dirname(__FILE__), '../lib'))
require 'ferret'
require 'ferret/browser'

require 'optparse'
require 'ostruct'

SERVER_OPTIONS = ['webrick']
conf = OpenStruct.new(:host => '0.0.0.0', :port => 3301)

# The block parameter is named `parser` so that it does not shadow the outer
# `opts` variable it is being assigned to.
opts = OptionParser.new do |parser|
  parser.banner = "Usage: #{File.basename($0)} /path/to/index"
  parser.separator ""
  parser.separator "Specific Options:"

  # Option values are captured with ordinary block parameters; attribute
  # targets such as `|conf.host|` are a syntax error on Ruby 1.9 and later.
  parser.on("-h", "--host HOSTNAME",
            "Host for web server to bind to (default is all IPs)") { |host| conf.host = host }
  # DecimalInteger keeps conf.port an Integer, matching the 3301 default,
  # and rejects non-numeric input at parse time.
  parser.on("-p", "--port NUM", OptionParser::DecimalInteger,
            "Port for web server (defaults to #{conf.port})") { |port| conf.port = port }
  parser.on("-s", "--server NAME",
            "Server to force (#{SERVER_OPTIONS.join(', ')}).") { |s| conf.server = s.to_sym }

  parser.separator ""
  parser.separator "Common options:"

  parser.on_tail("-?", "--help", "Show this message") do
    puts parser
    exit
  end

  parser.on_tail("-v", "--version", "Show version") do
    puts Ferret::VERSION
    exit
  end
end

# parse! strips the recognised options from ARGV, leaving only the index path.
opts.parse! ARGV
if ARGV.length != 1
  puts opts
  exit
end
@path = ARGV[0]

# Load the Ferret index. Failures are reported in colour and end the process
# with a non-zero status so that callers can detect them.
begin
  @reader = Ferret::Index::IndexReader.new(@path)
rescue Ferret::FileNotFoundError
  puts "\033[31mCannot start Ferret. No index exists at \"\033[m" +
       "\033[33m#{@path}\033[m\033[31m\".\033[m"
  exit 1
rescue Exception => e
  # The trailing escape must be \033[m (ESC) to reset the terminal colour;
  # the previous \031 left the terminal in yellow.
  puts "\033[31mCannot start Ferret.\n\033[m\033[33m#{e.to_s}\033[m"
  exit 1
end

# Fall back to WEBrick when no server was requested with --server.
unless conf.server
  conf.server = :webrick
end

case conf.server.to_s
when 'webrick'
  require 'webrick/httpserver'
  require 'ferret/browser/webrick'

  # Mount the static assets under /s and the browser handler at the root.
  s = WEBrick::HTTPServer.new(:BindAddress => conf.host, :Port => conf.port)
  s.mount "/s", WEBrick::HTTPServlet::FileHandler, Ferret::Browser::Controller::STATIC_DIR, true
  s.mount "/", WEBrick::FerretBrowserHandler, @reader, @path

  # Serve until interrupted; Ctrl-C shuts the server down.
  trap(:INT) do
    s.shutdown
  end
  s.start
else
  raise "server #{conf.server} not known. Must be one of [#{SERVER_OPTIONS.join(', ')}]"
end
data/ext/Makefile ADDED
@@ -0,0 +1,218 @@

SHELL = /bin/sh

# Verbosity switch: V=0 is quiet, V=1 is verbose. Other values don't work.
V = 0

# Q is the recipe-line prefix: it expands to "@" (do not echo the command)
# when V=0 and to nothing when V=1. Q1 is the intermediate step that strips
# a trailing "1" from V.
Q1 = $(V:1=)
Q = $(Q1:0=@)

# ECHO prints progress messages: "@echo" when V=0. When V=1 it becomes
# "@" followed by $(NULLCMD), so the message is suppressed.
n = $(NULLCMD)
ECHO1 = $(V:1=@$(n))
ECHO = $(ECHO1:0=@echo)

#### Start of system configuration section. ####

# --- Source and Ruby header locations ------------------------------------
# All assignments in this section are recursive ("="), so a variable may
# refer to another one that is only defined further down.
srcdir = .
topdir = /Users/ehanson/.rbenv/versions/1.9.3-p392/include/ruby-1.9.1
hdrdir = /Users/ehanson/.rbenv/versions/1.9.3-p392/include/ruby-1.9.1
arch_hdrdir = /Users/ehanson/.rbenv/versions/1.9.3-p392/include/ruby-1.9.1/$(arch)
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby

# --- Installation directory layout ---------------------------------------
# Everything hangs off $(prefix), which honours DESTDIR for staged installs.
prefix = $(DESTDIR)/Users/ehanson/.rbenv/versions/1.9.3-p392
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
exec_prefix = $(prefix)
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
sitehdrdir = $(rubyhdrdir)/site_ruby
rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
vendordir = $(rubylibprefix)/vendor_ruby
sitedir = $(rubylibprefix)/site_ruby
ridir = $(datarootdir)/$(RI_BASE_NAME)
mandir = $(datarootdir)/man
localedir = $(datarootdir)/locale
libdir = $(exec_prefix)/lib
psdir = $(docdir)
pdfdir = $(docdir)
dvidir = $(docdir)
htmldir = $(docdir)
infodir = $(datarootdir)/info
docdir = $(datarootdir)/doc/$(PACKAGE)
oldincludedir = $(DESTDIR)/usr/include
includedir = $(prefix)/include
localstatedir = $(prefix)/var
sharedstatedir = $(prefix)/com
sysconfdir = $(prefix)/etc
datadir = $(datarootdir)
datarootdir = $(prefix)/share
libexecdir = $(exec_prefix)/libexec
sbindir = $(exec_prefix)/sbin
bindir = $(exec_prefix)/bin
rubylibdir = $(rubylibprefix)/$(ruby_version)
archdir = $(rubylibdir)/$(arch)
sitelibdir = $(sitedir)/$(ruby_version)
sitearchdir = $(sitelibdir)/$(sitearch)
vendorlibdir = $(vendordir)/$(ruby_version)
vendorarchdir = $(vendorlibdir)/$(sitearch)

# Shell no-op command.
NULLCMD = :

# --- Compilers and the Ruby library --------------------------------------
CC = gcc
CXX = g++
LIBRUBY = $(LIBRUBY_A)
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
LIBRUBYARG_SHARED =
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
# $(empty) expands to nothing; appending it after "-o " keeps the trailing
# space as part of OUTFLAG/COUTFLAG, so "$(COUTFLAG)$@" becomes "-o target".
empty =
OUTFLAG = -o $(empty)
COUTFLAG = -o $(empty)

# --- Compiler, preprocessor and linker flags -----------------------------
RUBY_EXTCONF_H =
cflags = $(optflags) $(debugflags) $(warnflags)
optflags = -O3
debugflags = -ggdb
warnflags = -Wall -Wextra -Wno-unused-parameter -Wno-parentheses \
            -Wno-long-long -Wno-missing-field-initializers -Wpointer-arith \
            -Wwrite-strings -Wdeclaration-after-statement \
            -Wshorten-64-to-32 -Wimplicit-function-declaration
CFLAGS = -fno-common -O3 -Wno-error=shorten-64-to-32 -pipe -D_FILE_OFFSET_BITS=64 $(ARCH_FLAG)
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
DEFS =
CPPFLAGS = -I'/Users/ehanson/.rbenv/versions/1.9.3-p392/include' -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
CXXFLAGS = $(CFLAGS) $(cxxflags)
ldflags = -L. -L'/Users/ehanson/.rbenv/versions/1.9.3-p392/lib' -L/usr/local/lib
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
ARCH_FLAG =
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
LDSHARED = $(CC) -dynamic -bundle
LDSHAREDXX = $(CXX) -dynamic -bundle
AR = ar
EXEEXT =

# --- Ruby identity and helper commands -----------------------------------
RUBY_BASE_NAME = ruby
RUBY_INSTALL_NAME = ruby
RUBY_SO_NAME = ruby
arch = x86_64-darwin12.2.1
sitearch = $(arch)
ruby_version = 1.9.1
ruby = /Users/ehanson/.rbenv/versions/1.9.3-p392/bin/ruby
RUBY = $(ruby)
RM = rm -f
RM_RF = $(RUBY) -run -e rm -- -rf
RMDIRS = rmdir -p
MAKEDIRS = mkdir -p
INSTALL = /usr/bin/install -c
INSTALL_PROG = $(INSTALL) -m 0755
INSTALL_DATA = $(INSTALL) -m 644
COPY = cp
# Used as "$(TOUCH) file": the redirection of "exit" creates/truncates file.
TOUCH = exit >

#### End of system configuration section. ####

preload =

# Library search path, as a plain list and as linker -L flags.
libpath = . $(libdir)
LIBPATH = -L. -L$(libdir)
DEFFILE =

# Extra files/directories removed by the clean and distclean targets.
CLEANFILES = mkmf.log
DISTCLEANFILES =
DISTCLEANDIRS =

extout =
extout_prefix =
target_prefix =
LOCAL_LIBS =
LIBS = -lpthread -ldl -lobjc

# C sources of the extension. The backslash-newlines collapse to single
# spaces, so the value is one space-separated list.
SRCS = analysis.c api.c array.c bitvector.c compound_io.c document.c \
       except.c ferret.c filter.c fs_store.c global.c hash.c hashset.c \
       helper.c index.c libstemmer.c mempool.c multimapper.c posh.c \
       priorityqueue.c q_boolean.c q_const_score.c q_filtered_query.c \
       q_fuzzy.c q_match_all.c q_multi_term.c q_parser.c q_phrase.c \
       q_prefix.c q_range.c q_span.c q_term.c q_wildcard.c r_analysis.c \
       r_index.c r_qparser.c r_search.c r_store.c r_utils.c ram_store.c \
       search.c similarity.c sort.c \
       stem_ISO_8859_1_danish.c stem_ISO_8859_1_dutch.c \
       stem_ISO_8859_1_english.c stem_ISO_8859_1_finnish.c \
       stem_ISO_8859_1_french.c stem_ISO_8859_1_german.c \
       stem_ISO_8859_1_italian.c stem_ISO_8859_1_norwegian.c \
       stem_ISO_8859_1_porter.c stem_ISO_8859_1_portuguese.c \
       stem_ISO_8859_1_spanish.c stem_ISO_8859_1_swedish.c \
       stem_KOI8_R_russian.c \
       stem_UTF_8_danish.c stem_UTF_8_dutch.c stem_UTF_8_english.c \
       stem_UTF_8_finnish.c stem_UTF_8_french.c stem_UTF_8_german.c \
       stem_UTF_8_italian.c stem_UTF_8_norwegian.c stem_UTF_8_porter.c \
       stem_UTF_8_portuguese.c stem_UTF_8_russian.c stem_UTF_8_spanish.c \
       stem_UTF_8_swedish.c \
       stopwords.c store.c term_vectors.c utilities.c

# One object file per source file. Deriving the list from SRCS keeps the two
# from drifting apart; it yields the same names, in the same order, as the
# hand-written list it replaces.
OBJS = $(SRCS:.c=.o)

# Name of the extension and of the loadable bundle built from OBJS.
TARGET = ferret_ext
DLLIB = $(TARGET).bundle
EXTSTATIC =
STATIC_LIB =

# Installation destinations. RUBYLIBDIR and RUBYARCHDIR are absolute paths
# into one specific gem installation.
BINDIR = $(bindir)
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
RUBYLIBDIR = /Users/ehanson/.rbenv/versions/1.9.3-p392/gemsets/ferret/gems/sdsykes-ferret-0.11.6.19/lib$(target_prefix)
RUBYARCHDIR = /Users/ehanson/.rbenv/versions/1.9.3-p392/gemsets/ferret/gems/sdsykes-ferret-0.11.6.19/lib$(target_prefix)
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)

# Build products removed by "clean".
TARGET_SO = $(DLLIB)
CLEANLIBS = $(TARGET).bundle
CLEANOBJS = *.o *.bak

# Default goal: build the loadable bundle. "static" builds $(STATIC_LIB).
all: $(DLLIB)
static: $(STATIC_LIB)

# Every target below that names an action rather than a file is declared
# phony, so a stray file called e.g. "distclean" cannot mask it.
.PHONY: all install static install-so install-rb
.PHONY: clean clean-so clean-rb clean-static clean-rb-default
.PHONY: distclean distclean-so distclean-rb distclean-rb-default realclean
.PHONY: pre-install-rb pre-install-rb-default install-rb-default
.PHONY: site-install site-install-so site-install-rb

# The clean-*/distclean-* hooks are double-colon rules so that further
# recipes can be attached to them (clean-static gets one further down).
clean-static::
clean-rb-default::
clean-rb::
clean-so::
# Remove build products; the leading "-" ignores errors from $(RM).
clean: clean-so clean-static clean-rb-default clean-rb
	-$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time

distclean-rb-default::
distclean-rb::
distclean-so::
# Everything "clean" removes, plus the Makefile itself and configure residue.
distclean: clean distclean-so distclean-rb-default distclean-rb
	@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
	@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
	@-$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true

realclean: distclean
install: install-so install-rb

# Install the bundle into $(RUBYARCHDIR), creating the directory if needed.
install-so: $(RUBYARCHDIR)/$(DLLIB)
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
	-$(Q)$(MAKEDIRS) $(@D)
	$(INSTALL_PROG) $(DLLIB) $(@D)
clean-static::
	-$(Q)$(RM) $(STATIC_LIB)
install-rb: pre-install-rb install-rb-default
install-rb-default: pre-install-rb-default
pre-install-rb: Makefile
pre-install-rb-default: Makefile
pre-install-rb-default:
	$(ECHO) installing default ferret_ext libraries
# Stamp file: create $(RUBYARCHDIR) and then the stamp itself.
./.RUBYARCHDIR.time:
	$(Q) $(MAKEDIRS) $(RUBYARCHDIR)
	$(Q) $(TOUCH) $@

site-install: site-install-so site-install-rb
site-install-so: install-so
site-install-rb: install-rb

.SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o

# Compile command lines shared by the suffix rules below. They are recursive
# variables, so $@ and $< are expanded when the recipe that uses them runs.
compile_cxx = $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
compile_c = $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<

# Sources compiled with the C++ compiler: .cc .mm .cxx .cpp .C
.cc.o:
	$(ECHO) compiling $<
	$(Q) $(compile_cxx)

.mm.o:
	$(ECHO) compiling $<
	$(Q) $(compile_cxx)

.cxx.o:
	$(ECHO) compiling $<
	$(Q) $(compile_cxx)

.cpp.o:
	$(ECHO) compiling $<
	$(Q) $(compile_cxx)

.C.o:
	$(ECHO) compiling $<
	$(Q) $(compile_cxx)

# Sources compiled with the C compiler: .c .m
.c.o:
	$(ECHO) compiling $<
	$(Q) $(compile_c)

.m.o:
	$(ECHO) compiling $<
	$(Q) $(compile_c)

# Link all objects into the bundle. The Makefile is a prerequisite, so a
# change to it triggers a relink. Any existing bundle is removed first;
# the "-" prefix ignores a failure of that removal.
$(DLLIB): $(OBJS) Makefile
	$(ECHO) linking shared-object $@
	-$(Q)$(RM) $@
	$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)



# Every object depends on the core Ruby headers.
$(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h