watson-acts_as_ferret 0.4.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README +104 -0
- data/acts_as_ferret.gemspec +58 -0
- data/bin/aaf_install +29 -0
- data/config/ferret_server.yml +24 -0
- data/doc/README.win32 +23 -0
- data/doc/demo/README +154 -0
- data/doc/demo/README_DEMO +23 -0
- data/doc/demo/Rakefile +10 -0
- data/doc/demo/app/controllers/admin/backend_controller.rb +14 -0
- data/doc/demo/app/controllers/admin_area_controller.rb +4 -0
- data/doc/demo/app/controllers/application.rb +5 -0
- data/doc/demo/app/controllers/contents_controller.rb +49 -0
- data/doc/demo/app/controllers/searches_controller.rb +8 -0
- data/doc/demo/app/helpers/admin/backend_helper.rb +2 -0
- data/doc/demo/app/helpers/application_helper.rb +3 -0
- data/doc/demo/app/helpers/content_helper.rb +2 -0
- data/doc/demo/app/helpers/search_helper.rb +2 -0
- data/doc/demo/app/models/comment.rb +48 -0
- data/doc/demo/app/models/content.rb +12 -0
- data/doc/demo/app/models/content_base.rb +28 -0
- data/doc/demo/app/models/search.rb +19 -0
- data/doc/demo/app/models/shared_index1.rb +3 -0
- data/doc/demo/app/models/shared_index2.rb +3 -0
- data/doc/demo/app/models/special_content.rb +3 -0
- data/doc/demo/app/models/stats.rb +20 -0
- data/doc/demo/app/views/admin/backend/search.rhtml +18 -0
- data/doc/demo/app/views/contents/_form.rhtml +10 -0
- data/doc/demo/app/views/contents/edit.rhtml +9 -0
- data/doc/demo/app/views/contents/index.rhtml +24 -0
- data/doc/demo/app/views/contents/new.rhtml +8 -0
- data/doc/demo/app/views/contents/show.rhtml +8 -0
- data/doc/demo/app/views/layouts/application.html.erb +17 -0
- data/doc/demo/app/views/searches/_content.html.erb +2 -0
- data/doc/demo/app/views/searches/search.html.erb +20 -0
- data/doc/demo/config/boot.rb +109 -0
- data/doc/demo/config/database.yml +38 -0
- data/doc/demo/config/environment.rb +69 -0
- data/doc/demo/config/environments/development.rb +16 -0
- data/doc/demo/config/environments/production.rb +19 -0
- data/doc/demo/config/environments/test.rb +21 -0
- data/doc/demo/config/ferret_server.yml +18 -0
- data/doc/demo/config/lighttpd.conf +40 -0
- data/doc/demo/config/routes.rb +9 -0
- data/doc/demo/db/development_structure.sql +15 -0
- data/doc/demo/db/migrate/001_initial_migration.rb +18 -0
- data/doc/demo/db/migrate/002_add_type_to_contents.rb +9 -0
- data/doc/demo/db/migrate/003_create_shared_index1s.rb +11 -0
- data/doc/demo/db/migrate/004_create_shared_index2s.rb +11 -0
- data/doc/demo/db/migrate/005_special_field.rb +9 -0
- data/doc/demo/db/migrate/006_create_stats.rb +15 -0
- data/doc/demo/db/schema.sql +18 -0
- data/doc/demo/db/schema.sqlite +14 -0
- data/doc/demo/doc/README_FOR_APP +2 -0
- data/doc/demo/doc/howto.txt +70 -0
- data/doc/demo/public/404.html +8 -0
- data/doc/demo/public/500.html +8 -0
- data/doc/demo/public/dispatch.cgi +10 -0
- data/doc/demo/public/dispatch.fcgi +24 -0
- data/doc/demo/public/dispatch.rb +10 -0
- data/doc/demo/public/favicon.ico +0 -0
- data/doc/demo/public/images/rails.png +0 -0
- data/doc/demo/public/index.html +277 -0
- data/doc/demo/public/robots.txt +1 -0
- data/doc/demo/public/stylesheets/scaffold.css +74 -0
- data/doc/demo/script/about +3 -0
- data/doc/demo/script/breakpointer +3 -0
- data/doc/demo/script/console +3 -0
- data/doc/demo/script/destroy +3 -0
- data/doc/demo/script/ferret_server +10 -0
- data/doc/demo/script/generate +3 -0
- data/doc/demo/script/performance/benchmarker +3 -0
- data/doc/demo/script/performance/profiler +3 -0
- data/doc/demo/script/plugin +3 -0
- data/doc/demo/script/process/inspector +3 -0
- data/doc/demo/script/process/reaper +3 -0
- data/doc/demo/script/process/spawner +3 -0
- data/doc/demo/script/process/spinner +3 -0
- data/doc/demo/script/runner +3 -0
- data/doc/demo/script/server +3 -0
- data/doc/demo/test/fixtures/comments.yml +12 -0
- data/doc/demo/test/fixtures/contents.yml +13 -0
- data/doc/demo/test/fixtures/remote_contents.yml +9 -0
- data/doc/demo/test/fixtures/shared_index1s.yml +7 -0
- data/doc/demo/test/fixtures/shared_index2s.yml +7 -0
- data/doc/demo/test/functional/admin/backend_controller_test.rb +35 -0
- data/doc/demo/test/functional/contents_controller_test.rb +81 -0
- data/doc/demo/test/functional/searches_controller_test.rb +71 -0
- data/doc/demo/test/smoke/drb_smoke_test.rb +321 -0
- data/doc/demo/test/smoke/process_stats.rb +21 -0
- data/doc/demo/test/test_helper.rb +30 -0
- data/doc/demo/test/unit/comment_test.rb +217 -0
- data/doc/demo/test/unit/content_test.rb +705 -0
- data/doc/demo/test/unit/ferret_result_test.rb +24 -0
- data/doc/demo/test/unit/multi_index_test.rb +329 -0
- data/doc/demo/test/unit/remote_index_test.rb +23 -0
- data/doc/demo/test/unit/shared_index1_test.rb +108 -0
- data/doc/demo/test/unit/shared_index2_test.rb +13 -0
- data/doc/demo/test/unit/sort_test.rb +21 -0
- data/doc/demo/test/unit/special_content_test.rb +25 -0
- data/doc/demo/vendor/plugins/will_paginate/LICENSE +18 -0
- data/doc/demo/vendor/plugins/will_paginate/README +108 -0
- data/doc/demo/vendor/plugins/will_paginate/Rakefile +23 -0
- data/doc/demo/vendor/plugins/will_paginate/init.rb +21 -0
- data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/collection.rb +45 -0
- data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/core_ext.rb +44 -0
- data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/finder.rb +159 -0
- data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/view_helpers.rb +95 -0
- data/doc/demo/vendor/plugins/will_paginate/test/array_pagination_test.rb +23 -0
- data/doc/demo/vendor/plugins/will_paginate/test/boot.rb +27 -0
- data/doc/demo/vendor/plugins/will_paginate/test/console +10 -0
- data/doc/demo/vendor/plugins/will_paginate/test/finder_test.rb +219 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/admin.rb +3 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/companies.yml +24 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/company.rb +23 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/developer.rb +11 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/developers_projects.yml +13 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/project.rb +4 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/projects.yml +7 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/replies.yml +20 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/reply.rb +5 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/schema.sql +44 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/topic.rb +19 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/topics.yml +30 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/user.rb +2 -0
- data/doc/demo/vendor/plugins/will_paginate/test/fixtures/users.yml +35 -0
- data/doc/demo/vendor/plugins/will_paginate/test/helper.rb +42 -0
- data/doc/demo/vendor/plugins/will_paginate/test/lib/activerecord_test_connector.rb +64 -0
- data/doc/demo/vendor/plugins/will_paginate/test/lib/load_fixtures.rb +10 -0
- data/doc/demo/vendor/plugins/will_paginate/test/pagination_test.rb +136 -0
- data/doc/monit-example +22 -0
- data/init.rb +24 -0
- data/install.rb +18 -0
- data/lib/act_methods.rb +147 -0
- data/lib/acts_as_ferret.rb +593 -0
- data/lib/ar_mysql_auto_reconnect_patch.rb +41 -0
- data/lib/blank_slate.rb +54 -0
- data/lib/bulk_indexer.rb +56 -0
- data/lib/class_methods.rb +279 -0
- data/lib/ferret_extensions.rb +192 -0
- data/lib/ferret_find_methods.rb +142 -0
- data/lib/ferret_result.rb +58 -0
- data/lib/ferret_server.rb +238 -0
- data/lib/index.rb +99 -0
- data/lib/instance_methods.rb +172 -0
- data/lib/local_index.rb +202 -0
- data/lib/more_like_this.rb +217 -0
- data/lib/multi_index.rb +133 -0
- data/lib/rdig_adapter.rb +149 -0
- data/lib/remote_functions.rb +43 -0
- data/lib/remote_index.rb +54 -0
- data/lib/remote_multi_index.rb +20 -0
- data/lib/search_results.rb +50 -0
- data/lib/server_manager.rb +71 -0
- data/lib/unix_daemon.rb +86 -0
- data/lib/without_ar.rb +52 -0
- data/recipes/aaf_recipes.rb +116 -0
- data/script/ferret_daemon +94 -0
- data/script/ferret_server +12 -0
- data/script/ferret_service +178 -0
- data/tasks/ferret.rake +39 -0
- metadata +246 -0
@@ -0,0 +1,172 @@
|
|
1
|
+
module ActsAsFerret #:nodoc:
|
2
|
+
|
3
|
+
module InstanceMethods
|
4
|
+
include ResultAttributes
|
5
|
+
|
6
|
+
# Returns an array of strings with the matches highlighted. The +query+ can
|
7
|
+
# either be a String or a Ferret::Search::Query object.
|
8
|
+
#
|
9
|
+
# === Options
|
10
|
+
#
|
11
|
+
# field:: field to take the content from. This field has
|
12
|
+
# to have its content stored in the index
|
13
|
+
# (:store => :yes in your call to aaf). If not
|
14
|
+
# given, all stored fields are searched, and the
|
15
|
+
# highlighted content found in all of them is returned.
|
16
|
+
# set :highlight => :no in the field options to
|
17
|
+
# avoid highlighting of contents from a :stored field.
|
18
|
+
# excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
|
19
|
+
# terms will be in the centre of the excerpt.
|
20
|
+
# num_excerpts:: Default: 2. Number of excerpts to return.
|
21
|
+
# pre_tag:: Default: "<em>". Tag to place to the left of the
|
22
|
+
# match.
|
23
|
+
# post_tag:: Default: "</em>". This tag should close the
|
24
|
+
# +:pre_tag+.
|
25
|
+
# ellipsis:: Default: "...". This is the string that is appended
|
26
|
+
# at the beginning and end of excerpts (unless the
|
27
|
+
# excerpt hits the start or end of the field). You'll
|
28
|
+
# probably want to change this to a Unicode ellipsis
|
29
|
+
# character.
|
30
|
+
def highlight(query, options = {})
  # the class-level highlighter is told which record to use via its ferret key
  record_key = ferret_key
  self.class.highlight(record_key, query, options)
end
|
33
|
+
|
34
|
+
# re-enable ferret indexing for this instance after a call to #disable_ferret
|
35
|
+
def enable_ferret
  # nil is the "not disabled" state that ferret_enabled? checks for
  @ferret_disabled = nil
end
# older name kept for compatibility
alias ferret_enable enable_ferret
|
39
|
+
|
40
|
+
# returns true if ferret indexing is enabled for this record.
|
41
|
+
#
|
42
|
+
# The optional is_bulk_index parameter will be true if the method is called
|
43
|
+
# by rebuild_index or bulk_index, and false otherwise.
|
44
|
+
#
|
45
|
+
# If is_bulk_index is true, the class level ferret_enabled state will be
|
46
|
+
# ignored by this method (per-instance ferret_enabled checks however will
|
47
|
+
# take place, so if you override this method to forbid indexing of certain
|
48
|
+
# records you're still safe).
|
49
|
+
def ferret_enabled?(is_bulk_index = false)
  # a per-instance disable always wins
  return false unless @ferret_disabled.nil?

  # bulk indexing bypasses the class-level switch
  class_level = is_bulk_index || self.class.ferret_enabled?
  return class_level unless class_level

  # finally consult the optional :if condition from the aaf configuration
  condition = aaf_configuration[:if]
  condition.nil? || condition.call(self)
end
|
52
|
+
|
53
|
+
# Returns the analyzer to use when adding this record to the index.
|
54
|
+
#
|
55
|
+
# Override to return a specific analyzer for any record that is to be
|
56
|
+
# indexed, i.e. specify a different analyzer based on language. Returns nil
|
57
|
+
# by default so the global analyzer (specified with the acts_as_ferret
|
58
|
+
# call) is used.
|
59
|
+
def ferret_analyzer
  # no record-specific analyzer by default
  nil
end
|
62
|
+
|
63
|
+
# Disable Ferret for this record for a specified amount of time. ::once will
|
64
|
+
# disable Ferret for the next call to #save (this is the default), ::always
|
65
|
+
# will do so for all subsequent calls.
|
66
|
+
#
|
67
|
+
# Note that this will turn off only the create and update hooks, but not the
|
68
|
+
# destroy hook. I think that's reasonable, if you think the opposite, please
|
69
|
+
# tell me.
|
70
|
+
#
|
71
|
+
# To manually trigger reindexing of a record after you're finished modifying
|
72
|
+
# it, you can call #ferret_update directly instead of #save (remember to
|
73
|
+
# enable ferret again before).
|
74
|
+
#
|
75
|
+
# When given a block, this will be executed without any ferret indexing of
|
76
|
+
# this object taking place. The optional argument in this case can be used
|
77
|
+
# to indicate if the object should be indexed after executing the block
|
78
|
+
# (::index_when_finished). Automatic Ferret indexing of this object will be
|
79
|
+
# turned on after the block has been executed. If passed ::index_when_true,
|
80
|
+
# the index will only be updated if the block evaluated not to false or nil.
|
81
|
+
#
|
82
|
+
def disable_ferret(option = :once)
  if block_given?
    @ferret_disabled = :always
    begin
      result = yield
    ensure
      # re-enable even when the block raises, otherwise the record would stay
      # excluded from indexing for the rest of its lifetime
      ferret_enable
    end
    # optionally index once after the block, depending on the option given
    ferret_update if option == :index_when_finished || (option == :index_when_true && result)
    result
  elsif [:once, :always].include?(option)
    @ferret_disabled = option
  else
    raise ArgumentError, "Invalid Argument #{option}"
  end
end
|
95
|
+
|
96
|
+
# add to index
|
97
|
+
def ferret_create
  unless ferret_enabled?
    # a :once disable is used up by this skipped save
    ferret_enable if @ferret_disabled == :once
    return true # signal success to AR
  end

  logger.debug "ferret_create/update: #{ferret_key}"
  self.class.aaf_index << self
  true # signal success to AR
end
# updating a record uses the same code path as creating it
alias :ferret_update :ferret_create
|
107
|
+
|
108
|
+
|
109
|
+
# remove from index
|
110
|
+
def ferret_destroy
  key = ferret_key
  logger.debug "ferret_destroy: #{key}"
  begin
    self.class.aaf_index.remove ferret_key
  rescue StandardError => e
    # a failed removal is only logged; destroying the record must still succeed
    logger.warn("Could not find indexed value for this object: #{e}\n#{e.backtrace}")
  end
  true # signal success to AR
end
|
119
|
+
|
120
|
+
# Key identifying this record in the index: "<class name>-<primary key value>".
# Returns nil for records that have not been saved yet.
def ferret_key
  return if new_record?

  model = self.class
  "#{model.name}-#{send(model.primary_key)}"
end
|
123
|
+
|
124
|
+
# turn this instance into a ferret document (which basically is a hash of
|
125
|
+
# fieldname => value pairs)
|
126
|
+
def to_doc
  logger.debug "creating doc for class: #{ferret_key}"
  doc = Ferret::Document.new

  # store the id and class name of each item, and the unique key used for
  # identifying the record even in multi-class indexes.
  doc[:key] = ferret_key
  doc[:id] = id.to_s
  doc[:class_name] = self.class.name

  # every configured field contributes the result of its <field>_to_ferret
  # method, unless the field is flagged :ignore
  aaf_configuration[:defined_fields].each_pair do |field, config|
    next if config[:ignore]
    doc[field] = send("#{field}_to_ferret")
  end

  # optional document boost, taken from the instance method named by :boost
  boost_method = aaf_configuration[:boost]
  if boost_method
    if respond_to?(boost_method)
      boost = send(boost_method)
      doc.boost = boost.to_i if boost
    else
      logger.error "boost option should point to an instance method: #{boost_method}"
    end
  end

  doc
end
|
149
|
+
|
150
|
+
# Asks this class's index for the document number stored under this record's ferret key.
def document_number
  index = self.class.aaf_index
  index.document_number(ferret_key)
end
|
153
|
+
|
154
|
+
# Asks this class's index for a query built from this record's ferret key.
def query_for_record
  index = self.class.aaf_index
  index.query_for_record(ferret_key)
end
|
157
|
+
|
158
|
+
# Returns the indexable content for a field.
#
# The value is read through the method named +via+ when the record responds to
# it, otherwise from the instance variable of that name. Arrays are converted
# element-wise to strings, everything else with to_s. When +dynamic_boost+
# names a method returning a truthy value, the content is wrapped in a
# Ferret::Field carrying that value (as integer) as boost.
# Note: +field+ itself is only used as the default for +via+.
def content_for_field_name(field, via = field, dynamic_boost = nil)
  raw = respond_to?(via) ? send(via) : instance_variable_get("@#{via}")
  field_data = raw.is_a?(Array) ? raw.map { |element| element.to_s } : raw.to_s

  boost_value = dynamic_boost && send(dynamic_boost)
  return field_data unless boost_value

  boosted = Ferret::Field.new(field_data)
  boosted.boost = boost_value.to_i
  boosted
end
|
168
|
+
|
169
|
+
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
data/lib/local_index.rb
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
module ActsAsFerret
|
2
|
+
class LocalIndex < AbstractIndex
|
3
|
+
include MoreLikeThis::IndexMethods
|
4
|
+
|
5
|
+
# Sets up the index via the superclass, then ensures the index exists on disk.
def initialize(index_name)
  super(index_name)
  ensure_index_exists
end
|
9
|
+
|
10
|
+
# Closes the current index instance and returns a freshly obtained one.
def reopen!
  index_path = index_definition[:ferret][:path]
  logger.debug "reopening index at #{index_path}"
  close
  ferret_index
end
|
15
|
+
|
16
|
+
# The 'real' Ferret Index instance
|
17
|
+
def ferret_index
  ensure_index_exists
  # created once and memoized; batch size and logger are re-applied on every access
  index = (@ferret_index ||= Ferret::Index::Index.new(index_definition[:ferret]))
  index.batch_size = index_definition[:reindex_batch_size]
  index.logger = logger
  index
end
|
24
|
+
|
25
|
+
# Checks for the presence of a segments file in the index directory
|
26
|
+
# Rebuilds the index if none exists.
|
27
|
+
def ensure_index_exists
  index_dir = index_definition[:index_dir]
  # a segments file marks an existing index; nothing to do then
  return if File.file?("#{index_dir}/segments")

  ActsAsFerret::ensure_directory(index_dir)
  rebuild_index
end
|
34
|
+
|
35
|
+
# Closes the underlying index instance
|
36
|
+
def close
  index = @ferret_index
  index.close if index
rescue StandardError
  # is raised when index already closed; nothing left to do
ensure
  # always drop the reference so the next ferret_index call creates a new instance
  @ferret_index = nil
end
|
43
|
+
|
44
|
+
# rebuilds the index from all records of the model classes associated with this index
|
45
|
+
def rebuild_index
  models = index_definition[:registered_models]
  logger.debug "rebuild index with models: #{models.inspect}"
  close

  # build with a separate index instance: :create => true,
  # auto flushing off, and the field infos derived from the index definition
  settings = index_definition[:ferret].dup
  settings.update(:auto_flush  => false,
                  :field_infos => ActsAsFerret::field_infos(index_definition),
                  :create      => true)
  index = Ferret::Index::Index.new(settings)
  index.batch_size = index_definition[:reindex_batch_size]
  index.logger = logger
  index.index_models models

  reopen!
end
|
57
|
+
|
58
|
+
# Bulk-indexes the records with the given ids; +class_name+ is resolved to a
# class via constantize before being handed to the Ferret index.
def bulk_index(class_name, ids, options)
  index = ferret_index
  index.bulk_index(class_name.constantize, ids, options)
end
|
61
|
+
|
62
|
+
# Parses the given query string into a Ferret Query object.
|
63
|
+
def process_query(query, options = {})
  # anything that is not a String is returned untouched
  return query unless String === query

  ferret_index.synchronize do
    # without a per-query analyzer the index's own query parsing is used
    return ferret_index.process_query(query) unless options[:analyzer]

    # use per-query analyzer: build a dedicated parser from the index's
    # options overlaid with the ones given
    parser_options = ferret_index.instance_variable_get('@options').merge(options)
    qp = Ferret::QueryParser.new parser_options
    reader = ferret_index.reader
    unless options[:all_fields] || options[:fields]
      qp.fields = reader.fields
    end
    unless options[:tokenized_fields]
      qp.tokenized_fields = reader.tokenized_fields
    end
    return qp.parse(query)
  end
end
|
80
|
+
|
81
|
+
# Total number of hits for the given query.
|
82
|
+
def total_hits(query, options = {})
  index = ferret_index
  parsed = process_query(query, options)
  # run the search only to read its hit count
  index.search(parsed, options).total_hits
end
|
85
|
+
|
86
|
+
# Returns the object to search with: the Ferret index itself.
def searcher
  ferret_index
end
|
89
|
+
|
90
|
+
|
91
|
+
######################################
|
92
|
+
# methods working on a single record
|
93
|
+
# called from instance_methods, here to simplify interfacing with the
|
94
|
+
# remote ferret server
|
95
|
+
# TODO having to pass id and class_name around like this isn't nice
|
96
|
+
######################################
|
97
|
+
|
98
|
+
# add record to index
|
99
|
+
# record may be the full AR object, a Ferret document instance or a Hash
|
100
|
+
def add(record, analyzer = nil)
  case record
  when Hash, Ferret::Document
    # already a document; index it with the analyzer passed in
  else
    # full record: it supplies both its own analyzer and its document form
    analyzer = record.ferret_analyzer
    record = record.to_doc
  end
  ferret_index.add_document(record, analyzer)
end
alias << add
|
108
|
+
|
109
|
+
# delete record from index
|
110
|
+
def remove(key)
  # the ferret key is handed straight to the index's delete
  ferret_index.delete(key)
end
|
113
|
+
|
114
|
+
# highlight search terms for the record with the given key.
|
115
|
+
def highlight(key, query, options = {})
  logger.debug("highlight: #{key} query: #{query}")
  # Work on a copy: neither the defaults nor the per-field :field override
  # may leak back into the hash owned by the caller.
  opts = { :num_excerpts => 2, :pre_tag => '<em>', :post_tag => '</em>' }.merge(options)
  highlights = []
  ferret_index.synchronize do
    doc_num = document_number(key)

    if opts[:field]
      highlights << ferret_index.highlight(query, doc_num, opts)
    else
      query = process_query(query) # process only once
      # no field given: collect excerpts from every field that is stored
      # and not excluded from highlighting
      index_definition[:ferret_fields].each_pair do |field, config|
        next if config[:store] == :no || config[:highlight] == :no
        highlights << ferret_index.highlight(query, doc_num, opts.merge(:field => field))
      end
    end
  end
  # fields without a match yield nil; drop those and cap at :num_excerpts
  highlights.compact.flatten[0..opts[:num_excerpts]-1]
end
|
135
|
+
|
136
|
+
# retrieves the ferret document number of the record with the given key.
|
137
|
+
def document_number(key)
  number = ferret_index.doc_number(key)
  return number unless number.nil?

  # the index has no document stored under this key
  raise "cannot determine document number for record #{key}"
end
|
144
|
+
|
145
|
+
# build a ferret query matching only the record with the given key
|
146
|
+
# the class name only needs to be given in case of a shared index configuration
|
147
|
+
def query_for_record(key)
  # a single term query on the :key field, with the key converted to a string
  Ferret::Search::TermQuery.new(:key, key.to_s)
end
|
159
|
+
|
160
|
+
|
161
|
+
# retrieves stored fields from index definition in case the fields to retrieve
|
162
|
+
# haven't been specified with the :lazy option
|
163
|
+
def determine_stored_fields(options = {})
  stored_fields = options[:lazy]
  # an explicit Array is used as given; any other truthy value selects every
  # field declared with :store => :yes
  if stored_fields && !stored_fields.is_a?(Array)
    stored = index_definition[:ferret_fields].select { |_name, config| config[:store] == :yes }
    stored_fields = stored.map { |name, _config| name }
  end
  logger.debug "stored_fields: #{stored_fields.inspect}"
  stored_fields
end
|
171
|
+
|
172
|
+
# loads data for fields declared as :lazy from the Ferret document
|
173
|
+
def extract_stored_fields(doc, stored_fields)
  data = {}
  return data if stored_fields.nil?

  logger.debug "extracting stored fields #{stored_fields.inspect} from document #{doc[:class_name]} / #{doc[:id]}"
  fields = index_definition[:ferret_fields]
  stored_fields.each do |field|
    field_cfg = fields[field]
    # fields missing from the index definition are skipped; values are
    # keyed by the field's :via setting
    data[field_cfg[:via]] = doc[field] if field_cfg
  end
  logger.debug "done: #{data.inspect}"
  data
end
|
187
|
+
|
188
|
+
protected
|
189
|
+
|
190
|
+
# returns a MultiIndex instance operating on a MultiReader
|
191
|
+
#def multi_index(model_classes)
|
192
|
+
# model_classes.map!(&:constantize) if String === model_classes.first
|
193
|
+
# model_classes.sort! { |a, b| a.name <=> b.name }
|
194
|
+
# key = model_classes.inject("") { |s, clazz| s + clazz.name }
|
195
|
+
# multi_config = index_definition[:ferret].dup
|
196
|
+
# multi_config.delete :default_field # we don't want the default field list of *this* class for multi_searching
|
197
|
+
# ActsAsFerret::multi_indexes[key] ||= MultiIndex.new(model_classes, multi_config)
|
198
|
+
#end
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
end
|
@@ -0,0 +1,217 @@
|
|
1
|
+
module ActsAsFerret #:nodoc:
|
2
|
+
|
3
|
+
module MoreLikeThis
|
4
|
+
|
5
|
+
module InstanceMethods
|
6
|
+
|
7
|
+
# returns other instances of this class, which have similar contents
|
8
|
+
# like this one. Basically works like this: find out n most interesting
|
9
|
+
# (i.e. characteristic) terms from this document, and then build a
|
10
|
+
# query from those which is run against the whole index. Which terms
|
11
|
+
# are interesting is decided on various criteria which can be
|
12
|
+
# influenced by the given options.
|
13
|
+
#
|
14
|
+
# The algorithm used here is a quite straight port of the MoreLikeThis class
|
15
|
+
# from Apache Lucene.
|
16
|
+
#
|
17
|
+
# options are:
|
18
|
+
# :field_names : Array of field names to use for similarity search (mandatory)
|
19
|
+
# :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
|
20
|
+
# :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
|
21
|
+
# :min_word_length => 0, # Ignore words shorter than this length (longer words tend to
|
22
|
+
# be more characteristic for the document they occur in).
|
23
|
+
# :max_word_length => 0, # Ignore words if greater than this len.
|
24
|
+
# :max_query_terms => 25, # maximum number of terms in the query built
|
25
|
+
# :max_num_tokens => 5000, # maximum number of tokens to examine in a single field
|
26
|
+
# :boost => false, # when true, a boost according to the relative score of
|
27
|
+
# a term is applied to this Term's TermQuery.
|
28
|
+
# :similarity => 'DefaultAAFSimilarity' # the similarity implementation to use (the default
|
29
|
+
# equals Ferret's internal similarity implementation)
|
30
|
+
# :analyzer => 'Ferret::Analysis::StandardAnalyzer' # class name of the analyzer to use
|
31
|
+
# :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
|
32
|
+
# ferret_options : Ferret options handed over to find_with_ferret (i.e. for limits and sorting)
|
33
|
+
# ar_options : options handed over to find_with_ferret for AR scoping
|
34
|
+
def more_like_this(options = {}, ferret_options = {}, ar_options = {})
  defaults = {
    :field_names     => nil,   # Default field names
    :min_term_freq   => 2,     # Ignore terms with less than this frequency in the source doc.
    :min_doc_freq    => 5,     # Ignore words which do not occur in at least this many docs
    :min_word_length => 0,     # Ignore words if less than this len. Default is not to ignore any words.
    :max_word_length => 0,     # Ignore words if greater than this len. Default is not to ignore any words.
    :max_query_terms => 25,    # maximum number of terms in the query built
    :max_num_tokens  => 5000,  # maximum number of tokens to analyze when analyzing contents
    :boost           => false,
    :similarity      => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity', # class name of the similarity implementation to use
    :analyzer        => 'Ferret::Analysis::StandardAnalyzer',               # class name of the analyzer to use
    :append_to_query => nil,
    :base_class      => self.class # base class to use for querying, useful in STI scenarios
  }
  options = defaults.merge(options)

  # the index receives the base class by name rather than as a class object
  clazz = options[:base_class]
  options[:base_class] = clazz.name

  query = clazz.aaf_index.build_more_like_this_query(ferret_key, id, options)
  hook = options[:append_to_query]
  hook.call(query) if hook
  clazz.find_with_ferret(query, ferret_options, ar_options)
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
module IndexMethods
|
62
|
+
|
63
|
+
# TODO to allow morelikethis for unsaved records, we have to give the
|
64
|
+
# unsaved record's data to this method. check how this will work out
|
65
|
+
# via drb...
|
66
|
+
def build_more_like_this_query(key, id, options)
  # replace the configured class names with fresh instances (options is modified in place)
  options[:similarity] = options[:similarity].constantize.new
  options[:analyzer]   = options[:analyzer].constantize.new

  ferret_index.synchronize do # avoid that concurrent writes close our reader
    ferret_index.send(:ensure_reader_open)
    reader = ferret_index.send(:reader)
    terms = retrieve_terms(key, id, reader, options)
    queue = create_queue(terms, reader, options)
    create_query(key, queue, options)
  end
end
|
76
|
+
|
77
|
+
protected
|
78
|
+
|
79
|
+
# Builds the boolean query for a more-like-this search.
#
# key::            ferret key of the source record; it is excluded from the results
# priority_queue:: array of items responding to word/field/score, ordered by
#                  ascending score (items are popped from the end)
# options::        :boost (apply relative boosts) and :max_query_terms
#                  (nil or 0 means no limit)
#
# Returns the Ferret::Search::BooleanQuery.
def create_query(key, priority_queue, options={})
  query = Ferret::Search::BooleanQuery.new
  # to_i makes a missing :max_query_terms (e.g. with the default options)
  # behave like 0, i.e. "no limit", instead of failing on nil > 0
  max_terms = options[:max_query_terms].to_i
  qterms = 0
  best_score = nil
  while (cur = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)

    if options[:boost]
      # boost term according to relative score; the first popped item
      # sets the reference score
      best_score ||= cur.score
      # to_f avoids integer division when a similarity returns integers
      term_query.boost = cur.score.to_f / best_score
    end

    begin
      query.add_query(term_query, :should)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      break
    end

    qterms += 1
    break if max_terms > 0 && qterms >= max_terms
  end
  # exclude the original record
  query.add_query(query_for_record(key), :must_not)
  query
end
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
# creates a term/term_frequency map for terms from the fields
|
108
|
+
# given in options[:field_names]
|
109
|
+
def retrieve_terms(key, id, reader, options)
  raise "more_like_this atm only works on saved records" if key.nil?
  doc_num = begin
    document_number(key)
  rescue StandardError
    nil # lookup failed; the contents are then taken from the record itself
  end
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  record = nil
  field_names.each do |field|
    term_freq_vector = reader.term_vector(doc_num, field) if doc_num
    if term_freq_vector
      # use stored term vector
      term_freq_vector.terms.each do |term|
        term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
      end
    else
      # no term vector stored, but we may have stored the contents in the index
      # -> extract terms from there
      content = doc_num ? reader[doc_num][field] : nil
      unless content
        # no term vector, no stored content, so try content from the record
        # (loaded at most once for all fields)
        record ||= options[:base_class].constantize.find(id)
        content = record.content_for_field_name(field.to_s)
      end
      # (a leftover debugging `puts` used to sit here; besides writing to
      # stdout it failed with a nil document when doc_num was nil)
      token_count = 0
      ts = options[:analyzer].token_stream(field, content)
      while (token = ts.next)
        break if (token_count += 1) > max_num_tokens
        next if noise_word?(token.text, options)
        term_freq_map[token.text] += 1
      end
    end
  end
  term_freq_map
end
|
154
|
+
|
155
|
+
# create an ordered(by score) list of word,fieldname,score
|
156
|
+
# structures
|
157
|
+
def create_queue(term_freq_map, reader, options)
  similarity = options[:similarity]
  num_docs   = reader.num_docs
  min_tf     = options[:min_term_freq]
  min_df     = options[:min_doc_freq]

  items = []
  term_freq_map.each_pair do |word, tf|
    # filter out words that don't occur enough times in the source
    next if min_tf && tf < min_tf

    # pick the field in which the word has the largest document frequency
    # (the first field wins when no field contains it at all)
    top_field = options[:field_names].first
    doc_freq = 0
    options[:field_names].each do |field_name|
      freq = reader.doc_freq(field_name, word)
      next unless freq > doc_freq
      top_field = field_name
      doc_freq = freq
    end

    # filter out words that don't occur in enough docs
    next if min_df && doc_freq < min_df
    next if doc_freq == 0 # index update problem ?

    score = tf * similarity.idf(doc_freq, num_docs)
    items << FrequencyQueueItem.new(word, top_field, score)
  end

  # ascending by score
  items.sort! { |a, b| a.score <=> b.score }
  items
end
|
188
|
+
|
189
|
+
# Decides whether a term should be ignored when collecting characteristic terms.
#
# A term is noise when it is shorter than options[:min_word_length], longer
# than options[:max_word_length] (a limit of 0 or nil disables that check),
# or contained in the options[:stop_words] collection (if one is given).
def noise_word?(text, options)
  len = text.length
  # to_i lets a nil limit behave like 0 (= no limit) instead of raising
  min_len = options[:min_word_length].to_i
  max_len = options[:max_word_length].to_i
  return true if min_len > 0 && len < min_len
  return true if max_len > 0 && len > max_len

  # look the word up in the stop word list itself, not in the options hash
  stop_words = options[:stop_words]
  stop_words ? stop_words.include?(text) : false
end
|
197
|
+
|
198
|
+
end
|
199
|
+
|
200
|
+
# Default similarity used by more_like_this; only provides the idf factor.
class DefaultAAFSimilarity
  # Returns log(num_docs / (doc_freq + 1)) + 1, or 0.0 when num_docs is 0.
  def idf(doc_freq, num_docs)
    return 0.0 if num_docs == 0

    ratio = num_docs.to_f / (doc_freq + 1)
    Math.log(ratio) + 1.0
  end
end
|
206
|
+
|
207
|
+
|
208
|
+
# Read-only value holder for one candidate term: the word, the field chosen
# for it and its score.
class FrequencyQueueItem
  attr_reader :word, :field, :score

  def initialize(word, field, score)
    @word, @field, @score = word, field, score
  end
end
|
214
|
+
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|