np-ferret 0.11.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +24 -0
- data/MIT-LICENSE +20 -0
- data/README +102 -0
- data/Rakefile +338 -0
- data/TODO +17 -0
- data/TUTORIAL +231 -0
- data/bin/ferret-browser +79 -0
- data/ext/Makefile +218 -0
- data/ext/analysis.c +1584 -0
- data/ext/analysis.h +219 -0
- data/ext/analysis.o +0 -0
- data/ext/api.c +69 -0
- data/ext/api.h +27 -0
- data/ext/api.o +0 -0
- data/ext/array.c +123 -0
- data/ext/array.h +53 -0
- data/ext/array.o +0 -0
- data/ext/bitvector.c +540 -0
- data/ext/bitvector.h +272 -0
- data/ext/bitvector.o +0 -0
- data/ext/compound_io.c +383 -0
- data/ext/compound_io.o +0 -0
- data/ext/config.h +42 -0
- data/ext/document.c +156 -0
- data/ext/document.h +53 -0
- data/ext/document.o +0 -0
- data/ext/except.c +120 -0
- data/ext/except.h +168 -0
- data/ext/except.o +0 -0
- data/ext/extconf.rb +14 -0
- data/ext/ferret.c +402 -0
- data/ext/ferret.h +91 -0
- data/ext/ferret.o +0 -0
- data/ext/ferret_ext.bundle +0 -0
- data/ext/filter.c +156 -0
- data/ext/filter.o +0 -0
- data/ext/fs_store.c +484 -0
- data/ext/fs_store.o +0 -0
- data/ext/global.c +418 -0
- data/ext/global.h +117 -0
- data/ext/global.o +0 -0
- data/ext/hash.c +598 -0
- data/ext/hash.h +475 -0
- data/ext/hash.o +0 -0
- data/ext/hashset.c +170 -0
- data/ext/hashset.h +187 -0
- data/ext/hashset.o +0 -0
- data/ext/header.h +58 -0
- data/ext/helper.c +62 -0
- data/ext/helper.h +13 -0
- data/ext/helper.o +0 -0
- data/ext/inc/lang.h +48 -0
- data/ext/inc/threading.h +31 -0
- data/ext/index.c +6510 -0
- data/ext/index.h +964 -0
- data/ext/index.o +0 -0
- data/ext/lang.h +66 -0
- data/ext/libstemmer.c +92 -0
- data/ext/libstemmer.h +79 -0
- data/ext/libstemmer.o +0 -0
- data/ext/mempool.c +87 -0
- data/ext/mempool.h +35 -0
- data/ext/mempool.o +0 -0
- data/ext/modules.h +162 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/multimapper.o +0 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/posh.o +0 -0
- data/ext/priorityqueue.c +151 -0
- data/ext/priorityqueue.h +143 -0
- data/ext/priorityqueue.o +0 -0
- data/ext/q_boolean.c +1608 -0
- data/ext/q_boolean.o +0 -0
- data/ext/q_const_score.c +165 -0
- data/ext/q_const_score.o +0 -0
- data/ext/q_filtered_query.c +209 -0
- data/ext/q_filtered_query.o +0 -0
- data/ext/q_fuzzy.c +335 -0
- data/ext/q_fuzzy.o +0 -0
- data/ext/q_match_all.c +148 -0
- data/ext/q_match_all.o +0 -0
- data/ext/q_multi_term.c +677 -0
- data/ext/q_multi_term.o +0 -0
- data/ext/q_parser.c +2825 -0
- data/ext/q_parser.o +0 -0
- data/ext/q_phrase.c +1126 -0
- data/ext/q_phrase.o +0 -0
- data/ext/q_prefix.c +100 -0
- data/ext/q_prefix.o +0 -0
- data/ext/q_range.c +356 -0
- data/ext/q_range.o +0 -0
- data/ext/q_span.c +2402 -0
- data/ext/q_span.o +0 -0
- data/ext/q_term.c +337 -0
- data/ext/q_term.o +0 -0
- data/ext/q_wildcard.c +171 -0
- data/ext/q_wildcard.o +0 -0
- data/ext/r_analysis.c +2636 -0
- data/ext/r_analysis.o +0 -0
- data/ext/r_index.c +3509 -0
- data/ext/r_index.o +0 -0
- data/ext/r_qparser.c +585 -0
- data/ext/r_qparser.o +0 -0
- data/ext/r_search.c +4240 -0
- data/ext/r_search.o +0 -0
- data/ext/r_store.c +513 -0
- data/ext/r_store.o +0 -0
- data/ext/r_utils.c +963 -0
- data/ext/r_utils.o +0 -0
- data/ext/ram_store.c +471 -0
- data/ext/ram_store.o +0 -0
- data/ext/search.c +1743 -0
- data/ext/search.h +885 -0
- data/ext/search.o +0 -0
- data/ext/similarity.c +150 -0
- data/ext/similarity.h +82 -0
- data/ext/similarity.o +0 -0
- data/ext/sort.c +985 -0
- data/ext/sort.o +0 -0
- data/ext/stem_ISO_8859_1_danish.c +338 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_danish.o +0 -0
- data/ext/stem_ISO_8859_1_dutch.c +635 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.o +0 -0
- data/ext/stem_ISO_8859_1_english.c +1156 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_english.o +0 -0
- data/ext/stem_ISO_8859_1_finnish.c +792 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.o +0 -0
- data/ext/stem_ISO_8859_1_french.c +1276 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_french.o +0 -0
- data/ext/stem_ISO_8859_1_german.c +512 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_german.o +0 -0
- data/ext/stem_ISO_8859_1_italian.c +1091 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_italian.o +0 -0
- data/ext/stem_ISO_8859_1_norwegian.c +296 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.o +0 -0
- data/ext/stem_ISO_8859_1_porter.c +776 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_porter.o +0 -0
- data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.o +0 -0
- data/ext/stem_ISO_8859_1_spanish.c +1119 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.o +0 -0
- data/ext/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.o +0 -0
- data/ext/stem_KOI8_R_russian.c +701 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_KOI8_R_russian.o +0 -0
- data/ext/stem_UTF_8_danish.c +344 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_danish.o +0 -0
- data/ext/stem_UTF_8_dutch.c +653 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_dutch.o +0 -0
- data/ext/stem_UTF_8_english.c +1176 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_english.o +0 -0
- data/ext/stem_UTF_8_finnish.c +808 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_finnish.o +0 -0
- data/ext/stem_UTF_8_french.c +1296 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_french.o +0 -0
- data/ext/stem_UTF_8_german.c +526 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_german.o +0 -0
- data/ext/stem_UTF_8_italian.c +1113 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_italian.o +0 -0
- data/ext/stem_UTF_8_norwegian.c +302 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_norwegian.o +0 -0
- data/ext/stem_UTF_8_porter.c +794 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_porter.o +0 -0
- data/ext/stem_UTF_8_portuguese.c +1055 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_portuguese.o +0 -0
- data/ext/stem_UTF_8_russian.c +709 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_russian.o +0 -0
- data/ext/stem_UTF_8_spanish.c +1137 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_spanish.o +0 -0
- data/ext/stem_UTF_8_swedish.c +313 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stem_UTF_8_swedish.o +0 -0
- data/ext/stopwords.c +401 -0
- data/ext/stopwords.o +0 -0
- data/ext/store.c +692 -0
- data/ext/store.h +777 -0
- data/ext/store.o +0 -0
- data/ext/term_vectors.c +352 -0
- data/ext/term_vectors.o +0 -0
- data/ext/threading.h +31 -0
- data/ext/utilities.c +446 -0
- data/ext/utilities.o +0 -0
- data/ext/win32.h +54 -0
- data/ferret.gemspec +39 -0
- data/lib/ferret.rb +29 -0
- data/lib/ferret/browser.rb +246 -0
- data/lib/ferret/browser/s/global.js +192 -0
- data/lib/ferret/browser/s/style.css +148 -0
- data/lib/ferret/browser/views/document/list.rhtml +49 -0
- data/lib/ferret/browser/views/document/show.rhtml +27 -0
- data/lib/ferret/browser/views/error/index.rhtml +7 -0
- data/lib/ferret/browser/views/help/index.rhtml +8 -0
- data/lib/ferret/browser/views/home/index.rhtml +29 -0
- data/lib/ferret/browser/views/layout.rhtml +22 -0
- data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
- data/lib/ferret/browser/views/term/index.rhtml +199 -0
- data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
- data/lib/ferret/browser/webrick.rb +14 -0
- data/lib/ferret/document.rb +130 -0
- data/lib/ferret/field_infos.rb +44 -0
- data/lib/ferret/index.rb +786 -0
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_ext.bundle +0 -0
- data/lib/ferret_version.rb +3 -0
- data/pkg/ferret-0.11.6.gem +0 -0
- data/pkg/ferret-0.11.6.tgz +0 -0
- data/pkg/ferret-0.11.6.zip +0 -0
- data/setup.rb +1555 -0
- data/test/test_all.rb +5 -0
- data/test/test_helper.rb +24 -0
- data/test/threading/number_to_spoken.rb +132 -0
- data/test/threading/thread_safety_index_test.rb +79 -0
- data/test/threading/thread_safety_read_write_test.rb +76 -0
- data/test/threading/thread_safety_test.rb +133 -0
- data/test/unit/analysis/tc_analyzer.rb +548 -0
- data/test/unit/analysis/tc_token_stream.rb +646 -0
- data/test/unit/index/tc_index.rb +762 -0
- data/test/unit/index/tc_index_reader.rb +699 -0
- data/test/unit/index/tc_index_writer.rb +437 -0
- data/test/unit/index/th_doc.rb +315 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +238 -0
- data/test/unit/search/tc_filter.rb +135 -0
- data/test/unit/search/tc_fuzzy_query.rb +147 -0
- data/test/unit/search/tc_index_searcher.rb +61 -0
- data/test/unit/search/tc_multi_searcher.rb +128 -0
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tc_search_and_sort.rb +179 -0
- data/test/unit/search/tc_sort.rb +49 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +190 -0
- data/test/unit/search/tm_searcher.rb +384 -0
- data/test/unit/store/tc_fs_store.rb +77 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +34 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +2 -0
- data/test/unit/ts_index.rb +2 -0
- data/test/unit/ts_largefile.rb +4 -0
- data/test/unit/ts_query_parser.rb +2 -0
- data/test/unit/ts_search.rb +2 -0
- data/test/unit/ts_store.rb +2 -0
- data/test/unit/ts_utils.rb +2 -0
- data/test/unit/utils/tc_bit_vector.rb +295 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +392 -0
data/TODO
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
= TODO
|
2
|
+
|
3
|
+
* user defined sorting
|
4
|
+
* add field compression
|
5
|
+
* Fix highlighting to work for compressed fields
|
6
|
+
* Fix highlighting to work for external fields
|
7
|
+
* Add Ferret::Index::Index
|
8
|
+
* Fix:
|
9
|
+
> Working Query: field1:value1 AND NOT field2:value2
|
10
|
+
> Failing Query: field1:value1 AND ( NOT field2:value2 )
|
11
|
+
|
12
|
+
= Done
|
13
|
+
* Add string Sort descriptor
|
14
|
+
* fix memory bug
|
15
|
+
* add MultiReader interface
|
16
|
+
* add lexicographical sort (byte sort)
|
17
|
+
* Add highlighting
|
data/TUTORIAL
ADDED
@@ -0,0 +1,231 @@
|
|
1
|
+
= Quick Introduction to Ferret
|
2
|
+
|
3
|
+
The simplest way to use Ferret is through the Ferret::Index::Index class.
|
4
|
+
This is now aliased by Ferret::I for quick and easy access. Start by including
|
5
|
+
the Ferret module.
|
6
|
+
|
7
|
+
require 'ferret'
|
8
|
+
include Ferret
|
9
|
+
|
10
|
+
=== Creating an index
|
11
|
+
|
12
|
+
Creating an in-memory index is very simple;
|
13
|
+
|
14
|
+
index = Index::Index.new()
|
15
|
+
|
16
|
+
To create a persistent index;
|
17
|
+
|
18
|
+
index = Index::Index.new(:path => '/path/to/index')
|
19
|
+
|
20
|
+
Both of these methods create new Indexes with the StandardAnalyzer. An
|
21
|
+
analyzer is what you use to divide the input data up into tokens which you can
|
22
|
+
search for later. If you'd like to use a different analyzer you can specify it
|
23
|
+
here, eg;
|
24
|
+
|
25
|
+
index = Index::Index.new(:path => '/path/to/index',
|
26
|
+
:analyzer => Analysis::WhiteSpaceAnalyzer.new)
|
27
|
+
|
28
|
+
For more options when creating an Index refer to Ferret::Index::Index.
|
29
|
+
|
30
|
+
=== Adding Documents
|
31
|
+
|
32
|
+
To add a document you can simply add a string or an array of strings. This will
|
33
|
+
store all the strings in the "" (ie empty string) field (unless you specify the
|
34
|
+
default field when you create the index).
|
35
|
+
|
36
|
+
index << "This is a new document to be indexed"
|
37
|
+
index << ["And here", "is another", "new document", "to be indexed"]
|
38
|
+
|
39
|
+
But these are pretty simple documents. If this is all you want to index you
|
40
|
+
could probably just use SimpleSearch. So let's give our documents some fields;
|
41
|
+
|
42
|
+
index << {:title => "Programming Ruby", :content => "blah blah blah"}
|
43
|
+
index << {:title => "Programming Ruby", :content => "yada yada yada"}
|
44
|
+
|
45
|
+
Note the way that all field-names are Symbols. Although Strings will work,
|
46
|
+
this is a best-practice in Ferret. Or if you are indexing data stored in a
|
47
|
+
database, you'll probably want to store the id;
|
48
|
+
|
49
|
+
index << {:id => row.id, :title => row.title, :date => row.date}
|
50
|
+
|
51
|
+
So far we have been storing and tokenizing all of the input data along with
|
52
|
+
term vectors. If we want to change this we need to change the way we setup the
|
53
|
+
index. You must create a FieldInfos object describing the index:
|
54
|
+
|
55
|
+
field_infos = FieldInfos.new(:store => :no,
|
56
|
+
:index => :untokenized_omit_norms,
|
57
|
+
:term_vector => :no)
|
58
|
+
|
59
|
+
The values that you set FieldInfos to have will be used by default by all
|
60
|
+
fields. If you want to change the properties for specific fields, you need to
|
61
|
+
add a FieldInfo to field_infos.
|
62
|
+
|
63
|
+
field_infos.add_field(:title, :store => :yes, :index => :yes, :boost => 10.0)
|
64
|
+
field_infos.add_field(:content, :store => :yes,
|
65
|
+
:index => :yes,
|
66
|
+
:term_vector => :with_positions_offsets)
|
67
|
+
|
68
|
+
If you need to add a field to an already open index you do so like this:
|
69
|
+
|
70
|
+
index.field_infos.add_field(:new_field, :store => :yes)
|
71
|
+
|
72
|
+
=== Searching
|
73
|
+
|
74
|
+
Now that we have data in our index, how do we actually use this index to
|
75
|
+
search the data? The Index offers two search methods, Index#search and
|
76
|
+
Index#search_each. The first method returns a Ferret::Index::TopDocs object.
|
77
|
+
The second we'll show here. Let's say we wanted to find all documents with the
|
78
|
+
phrase "quick brown fox" in the content field. We'd write;
|
79
|
+
|
80
|
+
index.search_each('content:"quick brown fox"') do |id, score|
|
81
|
+
puts "Document #{id} found with a score of #{score}"
|
82
|
+
end
|
83
|
+
|
84
|
+
But "fast" has a pretty similar meaning to "quick" and we don't mind if the
|
85
|
+
fox is a little red. Also, the phrase could be in the title so we'll search
|
86
|
+
there as well. So we could expand our search like this;
|
87
|
+
|
88
|
+
index.search_each('title|content:"quick|fast brown|red fox"') do |id, score|
|
89
|
+
puts "Document #{id} found with a score of #{score}"
|
90
|
+
end
|
91
|
+
|
92
|
+
What if we want to find all documents entered on or after 5th of September,
|
93
|
+
2005 with the words "ruby" or "rails" in any field? We could type something like;
|
94
|
+
|
95
|
+
index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |id, score|
|
96
|
+
puts "Document #{index[id][:title]} found with a score of #{score}"
|
97
|
+
end
|
98
|
+
|
99
|
+
Ferret has quite a complex query language. To find out more about Ferret's
|
100
|
+
query language, see Ferret::QueryParser. You can also construct even more
|
101
|
+
complex queries like Ferret::Search::Spans by hand. See Ferret::Search::Query
|
102
|
+
for more information.
|
103
|
+
|
104
|
+
=== Highlighting
|
105
|
+
|
106
|
+
Ferret now has a super-fast highlighting method. See
|
107
|
+
Ferret::Index::Index#highlight. Here is an example of how you would use it
|
108
|
+
when printing to the console:
|
109
|
+
|
110
|
+
index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |id, score|
|
111
|
+
puts "Document #{index[id][:title]} found with a score of #{score}"
|
112
|
+
highlights = index.highlight("content:(ruby OR rails)", id,
|
113
|
+
:field => :content,
|
114
|
+
:pre_tag => "\033[36m",
|
115
|
+
:post_tag => "\033[m")
|
116
|
+
puts highlights
|
117
|
+
end
|
118
|
+
|
119
|
+
And if you want to highlight a whole document, set :excerpt_length to :all:
|
120
|
+
|
121
|
+
puts index.highlight(query, doc_id,
|
122
|
+
:field => :content,
|
123
|
+
:pre_tag => "\033[36m",
|
124
|
+
:post_tag => "\033[m",
|
125
|
+
:excerpt_length => :all)
|
126
|
+
|
127
|
+
=== Accessing Documents
|
128
|
+
|
129
|
+
You may have noticed that when we run a search we only get the document id
|
130
|
+
back. By itself this isn't much use to us. Getting the data from the index is
|
131
|
+
very straightforward. For example if we want the :title field from the 3rd
|
132
|
+
document, type;
|
133
|
+
|
134
|
+
index[2][:title]
|
135
|
+
|
136
|
+
Documents are lazy loading so if you try this:
|
137
|
+
|
138
|
+
puts index[2]
|
139
|
+
|
140
|
+
You will always get an empty Hash. To load all fields, call the load method:
|
141
|
+
|
142
|
+
puts index[2].load
|
143
|
+
|
144
|
+
NOTE: documents are indexed from 0. You can also use array-like index
|
145
|
+
parameters to access index. For example
|
146
|
+
|
147
|
+
index[1..4]
|
148
|
+
index[10, 10]
|
149
|
+
index[-5]
|
150
|
+
|
151
|
+
The default field is :id (although you can change this with index's
|
152
|
+
:default_create_field parameter);
|
153
|
+
|
154
|
+
index << "This is a document"
|
155
|
+
index[0][:id]
|
156
|
+
|
157
|
+
Let's go back to the database example above. If we store all of our documents
|
158
|
+
with an id then we can access that field using the id. As long as we called
|
159
|
+
our id field :id we can do this
|
160
|
+
|
161
|
+
index["89721347"]["title"]
|
162
|
+
|
163
|
+
Pretty simple huh? You should note though that if there are more than one
|
164
|
+
document with the same *id* or *key* then only the first one will be returned
|
165
|
+
so it is probably better that you ensure the key is unique somehow. By setting
|
166
|
+
Index's :key attribute to :id, Ferret will do this automatically for you. It
|
167
|
+
can even handle multiple field primary keys. For example, you could set
|
168
|
+
:key to [:id, :model] and Ferret would keep the documents unique for that pair
|
169
|
+
of fields.
|
170
|
+
|
171
|
+
=== Modifying and Deleting Documents
|
172
|
+
|
173
|
+
What if we want to change the data in the index? Ferret doesn't actually let
|
174
|
+
you change the data once it is in the index. But you can delete documents so
|
175
|
+
the standard way to modify data is to delete it and re-add it again with the
|
176
|
+
modifications made. It is important to note that when doing this the documents
|
177
|
+
will get a new document number so you should be careful not to use a document
|
178
|
+
number after the document has been deleted. Here is an example of modifying a
|
179
|
+
document;
|
180
|
+
|
181
|
+
index << {:title => "Programing Rbuy", :content => "blah blah blah"}
|
182
|
+
doc_id = nil
|
183
|
+
index.search_each('title:"Programing Rbuy"') {|id, score| doc_id = id}
|
184
|
+
return unless doc_id
|
185
|
+
doc = index[doc_id]
|
186
|
+
index.delete(doc_id)
|
187
|
+
|
188
|
+
# modify doc. It is just a Hash after all
|
189
|
+
doc[:title] = "Programming Ruby"
|
190
|
+
|
191
|
+
index << doc
|
192
|
+
|
193
|
+
If you set the :key parameter as described in the last section there is no
|
194
|
+
need to delete the document. It will be automatically deleted when you add
|
195
|
+
another document with the same key.
|
196
|
+
|
197
|
+
Also, we can use the id field, as above, to delete documents. This time though
|
198
|
+
every document that matches the id will be deleted. Again, it is probably a
|
199
|
+
good idea if you somehow ensure that your *ids* are kept unique.
|
200
|
+
|
201
|
+
id = "23453422"
|
202
|
+
index.delete(id)
|
203
|
+
|
204
|
+
=== Onwards
|
205
|
+
|
206
|
+
This is just a small sampling of what Ferret allows you to do. Ferret, like
|
207
|
+
Lucene, is designed to be extended, and allows you to construct your own query
|
208
|
+
types, analyzers, and so on. Going onwards you should check out the following
|
209
|
+
documentation:
|
210
|
+
|
211
|
+
* Ferret::Analysis: for more information on how the data is processed when it
|
212
|
+
is tokenized. There are a number of things you can do with your data such as
|
213
|
+
adding stop lists or perhaps a porter stemmer. There are also a number of
|
214
|
+
analyzers already available and it is almost trivial to create a new one
|
215
|
+
with a simple regular expression.
|
216
|
+
|
217
|
+
* Ferret::Search: for more information on querying the index. There are a
|
218
|
+
number of already available queries and it's unlikely you'll need to create
|
219
|
+
your own. You may however want to take advantage of the sorting or filtering
|
220
|
+
abilities of Ferret to present your data the best way you see fit.
|
221
|
+
|
222
|
+
* Ferret::QueryParser: if you want to find out more about what you can do with
|
223
|
+
Ferret's Query Parser, this is the place to look. The query parser is one
|
224
|
+
area that could use a bit of work so please send your suggestions.
|
225
|
+
|
226
|
+
* Ferret::Index: for more advanced access to the index you'll probably want to
|
227
|
+
use the Ferret::Index::IndexWriter and Ferret::Index::IndexReader. This is
|
228
|
+
the place to look for more information on them.
|
229
|
+
|
230
|
+
* Ferret::Store: This is the module used to access the actual index storage
|
231
|
+
and won't be of much interest to most people.
|
data/bin/ferret-browser
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Put the gem's lib/ directory (a sibling of this script's directory) on the load path.
$: << File.expand_path(File.join(File.dirname(__FILE__), '../lib'))
|
4
|
+
require 'ferret'
|
5
|
+
require 'ferret/browser'
|
6
|
+
|
7
|
+
require 'optparse'
|
8
|
+
require 'ostruct'
|
9
|
+
|
10
|
+
SERVER_OPTIONS = ['webrick']
|
11
|
+
conf = OpenStruct.new(:host => '0.0.0.0', :port => 3301)
|
12
|
+
|
13
|
+
opts = OptionParser.new do |opts|
|
14
|
+
opts.banner = "Usage: #{File.basename($0)} /path/to/index"
|
15
|
+
opts.separator ""
|
16
|
+
opts.separator "Specific Options:"
|
17
|
+
|
18
|
+
opts.on("-h", "--host HOSTNAME",
|
19
|
+
"Host for web server to bind to (default is all IPs)") { |conf.host| }
|
20
|
+
opts.on("-p", "--port NUM",
|
21
|
+
"Port for web server (defaults to #{conf.port})") { |conf.port| }
|
22
|
+
opts.on("-s", "--server NAME",
|
23
|
+
"Server to force (#{SERVER_OPTIONS.join(', ')}).") { |s| conf.server = s.to_sym }
|
24
|
+
|
25
|
+
opts.separator ""
|
26
|
+
opts.separator "Common options:"
|
27
|
+
|
28
|
+
opts.on_tail("-?", "--help", "Show this message") do
|
29
|
+
puts opts
|
30
|
+
exit
|
31
|
+
end
|
32
|
+
|
33
|
+
opts.on_tail("-v", "--version", "Show version") do
|
34
|
+
puts Ferret::VERSION
|
35
|
+
exit
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.parse! ARGV
|
40
|
+
if ARGV.length != 1
|
41
|
+
puts opts
|
42
|
+
exit
|
43
|
+
end
|
44
|
+
@path = ARGV[0]
|
45
|
+
|
46
|
+
# Load the Ferret index
|
47
|
+
begin
|
48
|
+
@reader = Ferret::Index::IndexReader.new(@path)
|
49
|
+
rescue Ferret::FileNotFoundError => e
|
50
|
+
puts "\033[31mCannot start Ferret. No index exists at \"\033[m" +
|
51
|
+
"\033[33m#{@path}\033[m\033[31m\".\033[m"
|
52
|
+
exit
|
53
|
+
rescue Exception => e
|
54
|
+
puts "\033[31mCannot start Ferret.\n\033[m\033[33m#{e.to_s}\033[m" # \033 is ESC; the trailing ESC[m resets the colour
|
55
|
+
exit
|
56
|
+
end
|
57
|
+
|
58
|
+
unless conf.server
|
59
|
+
conf.server = :webrick
|
60
|
+
end
|
61
|
+
|
62
|
+
case conf.server.to_s
|
63
|
+
when 'webrick'
|
64
|
+
require 'webrick/httpserver'
|
65
|
+
require 'ferret/browser/webrick'
|
66
|
+
|
67
|
+
# Mount the root
|
68
|
+
s = WEBrick::HTTPServer.new(:BindAddress => conf.host, :Port => conf.port)
|
69
|
+
s.mount "/s", WEBrick::HTTPServlet::FileHandler, Ferret::Browser::Controller::STATIC_DIR, true
|
70
|
+
s.mount "/", WEBrick::FerretBrowserHandler, @reader, @path
|
71
|
+
|
72
|
+
# Server up
|
73
|
+
trap(:INT) do
|
74
|
+
s.shutdown
|
75
|
+
end
|
76
|
+
s.start
|
77
|
+
else
|
78
|
+
raise "server #{conf.server} not known. Must be one of [#{SERVER_OPTIONS.join(', ')}]"
|
79
|
+
end
|
data/ext/Makefile
ADDED
@@ -0,0 +1,218 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
Q1 = $(V:1=)
|
7
|
+
Q = $(Q1:0=@)
|
8
|
+
n=$(NULLCMD)
|
9
|
+
ECHO1 = $(V:1=@$n)
|
10
|
+
ECHO = $(ECHO1:0=@echo)
|
11
|
+
|
12
|
+
#### Start of system configuration section. ####
|
13
|
+
|
14
|
+
srcdir = .
|
15
|
+
topdir = /Users/ehanson/.rbenv/versions/1.9.3-p392/include/ruby-1.9.1
|
16
|
+
hdrdir = /Users/ehanson/.rbenv/versions/1.9.3-p392/include/ruby-1.9.1
|
17
|
+
arch_hdrdir = /Users/ehanson/.rbenv/versions/1.9.3-p392/include/ruby-1.9.1/$(arch)
|
18
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
19
|
+
prefix = $(DESTDIR)/Users/ehanson/.rbenv/versions/1.9.3-p392
|
20
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
21
|
+
exec_prefix = $(prefix)
|
22
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
23
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
24
|
+
rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
|
25
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
26
|
+
sitedir = $(rubylibprefix)/site_ruby
|
27
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
28
|
+
mandir = $(datarootdir)/man
|
29
|
+
localedir = $(datarootdir)/locale
|
30
|
+
libdir = $(exec_prefix)/lib
|
31
|
+
psdir = $(docdir)
|
32
|
+
pdfdir = $(docdir)
|
33
|
+
dvidir = $(docdir)
|
34
|
+
htmldir = $(docdir)
|
35
|
+
infodir = $(datarootdir)/info
|
36
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
37
|
+
oldincludedir = $(DESTDIR)/usr/include
|
38
|
+
includedir = $(prefix)/include
|
39
|
+
localstatedir = $(prefix)/var
|
40
|
+
sharedstatedir = $(prefix)/com
|
41
|
+
sysconfdir = $(prefix)/etc
|
42
|
+
datadir = $(datarootdir)
|
43
|
+
datarootdir = $(prefix)/share
|
44
|
+
libexecdir = $(exec_prefix)/libexec
|
45
|
+
sbindir = $(exec_prefix)/sbin
|
46
|
+
bindir = $(exec_prefix)/bin
|
47
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
48
|
+
archdir = $(rubylibdir)/$(arch)
|
49
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
50
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
51
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
52
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
53
|
+
|
54
|
+
NULLCMD = :
|
55
|
+
|
56
|
+
CC = gcc
|
57
|
+
CXX = g++
|
58
|
+
LIBRUBY = $(LIBRUBY_A)
|
59
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
60
|
+
LIBRUBYARG_SHARED =
|
61
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
|
62
|
+
empty =
|
63
|
+
OUTFLAG = -o $(empty)
|
64
|
+
COUTFLAG = -o $(empty)
|
65
|
+
|
66
|
+
RUBY_EXTCONF_H =
|
67
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
68
|
+
optflags = -O3
|
69
|
+
debugflags = -ggdb
|
70
|
+
warnflags = -Wall -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration
|
71
|
+
CFLAGS = -fno-common -O3 -Wno-error=shorten-64-to-32 -pipe -D_FILE_OFFSET_BITS=64 $(ARCH_FLAG)
|
72
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
73
|
+
DEFS =
|
74
|
+
CPPFLAGS = -I'/Users/ehanson/.rbenv/versions/1.9.3-p392/include' -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
|
75
|
+
CXXFLAGS = $(CFLAGS) $(cxxflags)
|
76
|
+
ldflags = -L. -L'/Users/ehanson/.rbenv/versions/1.9.3-p392/lib' -L/usr/local/lib
|
77
|
+
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
|
78
|
+
ARCH_FLAG =
|
79
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
80
|
+
LDSHARED = $(CC) -dynamic -bundle
|
81
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
82
|
+
AR = ar
|
83
|
+
EXEEXT =
|
84
|
+
|
85
|
+
RUBY_BASE_NAME = ruby
|
86
|
+
RUBY_INSTALL_NAME = ruby
|
87
|
+
RUBY_SO_NAME = ruby
|
88
|
+
arch = x86_64-darwin12.2.1
|
89
|
+
sitearch = $(arch)
|
90
|
+
ruby_version = 1.9.1
|
91
|
+
ruby = /Users/ehanson/.rbenv/versions/1.9.3-p392/bin/ruby
|
92
|
+
RUBY = $(ruby)
|
93
|
+
RM = rm -f
|
94
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
95
|
+
RMDIRS = rmdir -p
|
96
|
+
MAKEDIRS = mkdir -p
|
97
|
+
INSTALL = /usr/bin/install -c
|
98
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
99
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
100
|
+
COPY = cp
|
101
|
+
TOUCH = exit >
|
102
|
+
|
103
|
+
#### End of system configuration section. ####
|
104
|
+
|
105
|
+
preload =
|
106
|
+
|
107
|
+
libpath = . $(libdir)
|
108
|
+
LIBPATH = -L. -L$(libdir)
|
109
|
+
DEFFILE =
|
110
|
+
|
111
|
+
CLEANFILES = mkmf.log
|
112
|
+
DISTCLEANFILES =
|
113
|
+
DISTCLEANDIRS =
|
114
|
+
|
115
|
+
extout =
|
116
|
+
extout_prefix =
|
117
|
+
target_prefix =
|
118
|
+
LOCAL_LIBS =
|
119
|
+
LIBS = -lpthread -ldl -lobjc
|
120
|
+
SRCS = analysis.c api.c array.c bitvector.c compound_io.c document.c except.c ferret.c filter.c fs_store.c global.c hash.c hashset.c helper.c index.c libstemmer.c mempool.c multimapper.c posh.c priorityqueue.c q_boolean.c q_const_score.c q_filtered_query.c q_fuzzy.c q_match_all.c q_multi_term.c q_parser.c q_phrase.c q_prefix.c q_range.c q_span.c q_term.c q_wildcard.c r_analysis.c r_index.c r_qparser.c r_search.c r_store.c r_utils.c ram_store.c search.c similarity.c sort.c stem_ISO_8859_1_danish.c stem_ISO_8859_1_dutch.c stem_ISO_8859_1_english.c stem_ISO_8859_1_finnish.c stem_ISO_8859_1_french.c stem_ISO_8859_1_german.c stem_ISO_8859_1_italian.c stem_ISO_8859_1_norwegian.c stem_ISO_8859_1_porter.c stem_ISO_8859_1_portuguese.c stem_ISO_8859_1_spanish.c stem_ISO_8859_1_swedish.c stem_KOI8_R_russian.c stem_UTF_8_danish.c stem_UTF_8_dutch.c stem_UTF_8_english.c stem_UTF_8_finnish.c stem_UTF_8_french.c stem_UTF_8_german.c stem_UTF_8_italian.c stem_UTF_8_norwegian.c stem_UTF_8_porter.c stem_UTF_8_portuguese.c stem_UTF_8_russian.c stem_UTF_8_spanish.c stem_UTF_8_swedish.c stopwords.c store.c term_vectors.c utilities.c
|
121
|
+
OBJS = analysis.o api.o array.o bitvector.o compound_io.o document.o except.o ferret.o filter.o fs_store.o global.o hash.o hashset.o helper.o index.o libstemmer.o mempool.o multimapper.o posh.o priorityqueue.o q_boolean.o q_const_score.o q_filtered_query.o q_fuzzy.o q_match_all.o q_multi_term.o q_parser.o q_phrase.o q_prefix.o q_range.o q_span.o q_term.o q_wildcard.o r_analysis.o r_index.o r_qparser.o r_search.o r_store.o r_utils.o ram_store.o search.o similarity.o sort.o stem_ISO_8859_1_danish.o stem_ISO_8859_1_dutch.o stem_ISO_8859_1_english.o stem_ISO_8859_1_finnish.o stem_ISO_8859_1_french.o stem_ISO_8859_1_german.o stem_ISO_8859_1_italian.o stem_ISO_8859_1_norwegian.o stem_ISO_8859_1_porter.o stem_ISO_8859_1_portuguese.o stem_ISO_8859_1_spanish.o stem_ISO_8859_1_swedish.o stem_KOI8_R_russian.o stem_UTF_8_danish.o stem_UTF_8_dutch.o stem_UTF_8_english.o stem_UTF_8_finnish.o stem_UTF_8_french.o stem_UTF_8_german.o stem_UTF_8_italian.o stem_UTF_8_norwegian.o stem_UTF_8_porter.o stem_UTF_8_portuguese.o stem_UTF_8_russian.o stem_UTF_8_spanish.o stem_UTF_8_swedish.o stopwords.o store.o term_vectors.o utilities.o
|
122
|
+
TARGET = ferret_ext
|
123
|
+
DLLIB = $(TARGET).bundle
|
124
|
+
EXTSTATIC =
|
125
|
+
STATIC_LIB =
|
126
|
+
|
127
|
+
BINDIR = $(bindir)
|
128
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
129
|
+
RUBYLIBDIR = /Users/ehanson/.rbenv/versions/1.9.3-p392/gemsets/ferret/gems/sdsykes-ferret-0.11.6.19/lib$(target_prefix)
|
130
|
+
RUBYARCHDIR = /Users/ehanson/.rbenv/versions/1.9.3-p392/gemsets/ferret/gems/sdsykes-ferret-0.11.6.19/lib$(target_prefix)
|
131
|
+
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
132
|
+
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
133
|
+
|
134
|
+
TARGET_SO = $(DLLIB)
|
135
|
+
CLEANLIBS = $(TARGET).bundle
|
136
|
+
CLEANOBJS = *.o *.bak
|
137
|
+
|
138
|
+
all: $(DLLIB)
|
139
|
+
static: $(STATIC_LIB)
|
140
|
+
.PHONY: all install static install-so install-rb
|
141
|
+
.PHONY: clean clean-so clean-rb
|
142
|
+
|
143
|
+
clean-static::
|
144
|
+
clean-rb-default::
|
145
|
+
clean-rb::
|
146
|
+
clean-so::
|
147
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
148
|
+
-$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
149
|
+
|
150
|
+
distclean-rb-default::
|
151
|
+
distclean-rb::
|
152
|
+
distclean-so::
|
153
|
+
distclean: clean distclean-so distclean-rb-default distclean-rb
|
154
|
+
@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
155
|
+
@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
156
|
+
@-$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
157
|
+
|
158
|
+
realclean: distclean
|
159
|
+
install: install-so install-rb
|
160
|
+
|
161
|
+
install-so: $(RUBYARCHDIR)/$(DLLIB)
|
162
|
+
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
|
163
|
+
-$(Q)$(MAKEDIRS) $(@D)
|
164
|
+
$(INSTALL_PROG) $(DLLIB) $(@D)
|
165
|
+
clean-static::
|
166
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
167
|
+
install-rb: pre-install-rb install-rb-default
|
168
|
+
install-rb-default: pre-install-rb-default
|
169
|
+
pre-install-rb: Makefile
|
170
|
+
pre-install-rb-default: Makefile
|
171
|
+
pre-install-rb-default:
|
172
|
+
$(ECHO) installing default ferret_ext libraries
|
173
|
+
./.RUBYARCHDIR.time:
|
174
|
+
$(Q) $(MAKEDIRS) $(RUBYARCHDIR)
|
175
|
+
$(Q) $(TOUCH) $@
|
176
|
+
|
177
|
+
site-install: site-install-so site-install-rb
|
178
|
+
site-install-so: install-so
|
179
|
+
site-install-rb: install-rb
|
180
|
+
|
181
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
|
182
|
+
|
183
|
+
.cc.o:
|
184
|
+
$(ECHO) compiling $(<)
|
185
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
186
|
+
|
187
|
+
.mm.o:
|
188
|
+
$(ECHO) compiling $(<)
|
189
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
190
|
+
|
191
|
+
.cxx.o:
|
192
|
+
$(ECHO) compiling $(<)
|
193
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
194
|
+
|
195
|
+
.cpp.o:
|
196
|
+
$(ECHO) compiling $(<)
|
197
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
198
|
+
|
199
|
+
.C.o:
|
200
|
+
$(ECHO) compiling $(<)
|
201
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
202
|
+
|
203
|
+
.c.o:
|
204
|
+
$(ECHO) compiling $(<)
|
205
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
206
|
+
|
207
|
+
.m.o:
|
208
|
+
$(ECHO) compiling $(<)
|
209
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
210
|
+
|
211
|
+
$(DLLIB): $(OBJS) Makefile
|
212
|
+
$(ECHO) linking shared-object $(DLLIB)
|
213
|
+
-$(Q)$(RM) $(@)
|
214
|
+
$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
215
|
+
|
216
|
+
|
217
|
+
|
218
|
+
$(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
|