watson-acts_as_ferret 0.4.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (162) hide show
  1. data/LICENSE +20 -0
  2. data/README +104 -0
  3. data/acts_as_ferret.gemspec +58 -0
  4. data/bin/aaf_install +29 -0
  5. data/config/ferret_server.yml +24 -0
  6. data/doc/README.win32 +23 -0
  7. data/doc/demo/README +154 -0
  8. data/doc/demo/README_DEMO +23 -0
  9. data/doc/demo/Rakefile +10 -0
  10. data/doc/demo/app/controllers/admin/backend_controller.rb +14 -0
  11. data/doc/demo/app/controllers/admin_area_controller.rb +4 -0
  12. data/doc/demo/app/controllers/application.rb +5 -0
  13. data/doc/demo/app/controllers/contents_controller.rb +49 -0
  14. data/doc/demo/app/controllers/searches_controller.rb +8 -0
  15. data/doc/demo/app/helpers/admin/backend_helper.rb +2 -0
  16. data/doc/demo/app/helpers/application_helper.rb +3 -0
  17. data/doc/demo/app/helpers/content_helper.rb +2 -0
  18. data/doc/demo/app/helpers/search_helper.rb +2 -0
  19. data/doc/demo/app/models/comment.rb +48 -0
  20. data/doc/demo/app/models/content.rb +12 -0
  21. data/doc/demo/app/models/content_base.rb +28 -0
  22. data/doc/demo/app/models/search.rb +19 -0
  23. data/doc/demo/app/models/shared_index1.rb +3 -0
  24. data/doc/demo/app/models/shared_index2.rb +3 -0
  25. data/doc/demo/app/models/special_content.rb +3 -0
  26. data/doc/demo/app/models/stats.rb +20 -0
  27. data/doc/demo/app/views/admin/backend/search.rhtml +18 -0
  28. data/doc/demo/app/views/contents/_form.rhtml +10 -0
  29. data/doc/demo/app/views/contents/edit.rhtml +9 -0
  30. data/doc/demo/app/views/contents/index.rhtml +24 -0
  31. data/doc/demo/app/views/contents/new.rhtml +8 -0
  32. data/doc/demo/app/views/contents/show.rhtml +8 -0
  33. data/doc/demo/app/views/layouts/application.html.erb +17 -0
  34. data/doc/demo/app/views/searches/_content.html.erb +2 -0
  35. data/doc/demo/app/views/searches/search.html.erb +20 -0
  36. data/doc/demo/config/boot.rb +109 -0
  37. data/doc/demo/config/database.yml +38 -0
  38. data/doc/demo/config/environment.rb +69 -0
  39. data/doc/demo/config/environments/development.rb +16 -0
  40. data/doc/demo/config/environments/production.rb +19 -0
  41. data/doc/demo/config/environments/test.rb +21 -0
  42. data/doc/demo/config/ferret_server.yml +18 -0
  43. data/doc/demo/config/lighttpd.conf +40 -0
  44. data/doc/demo/config/routes.rb +9 -0
  45. data/doc/demo/db/development_structure.sql +15 -0
  46. data/doc/demo/db/migrate/001_initial_migration.rb +18 -0
  47. data/doc/demo/db/migrate/002_add_type_to_contents.rb +9 -0
  48. data/doc/demo/db/migrate/003_create_shared_index1s.rb +11 -0
  49. data/doc/demo/db/migrate/004_create_shared_index2s.rb +11 -0
  50. data/doc/demo/db/migrate/005_special_field.rb +9 -0
  51. data/doc/demo/db/migrate/006_create_stats.rb +15 -0
  52. data/doc/demo/db/schema.sql +18 -0
  53. data/doc/demo/db/schema.sqlite +14 -0
  54. data/doc/demo/doc/README_FOR_APP +2 -0
  55. data/doc/demo/doc/howto.txt +70 -0
  56. data/doc/demo/public/404.html +8 -0
  57. data/doc/demo/public/500.html +8 -0
  58. data/doc/demo/public/dispatch.cgi +10 -0
  59. data/doc/demo/public/dispatch.fcgi +24 -0
  60. data/doc/demo/public/dispatch.rb +10 -0
  61. data/doc/demo/public/favicon.ico +0 -0
  62. data/doc/demo/public/images/rails.png +0 -0
  63. data/doc/demo/public/index.html +277 -0
  64. data/doc/demo/public/robots.txt +1 -0
  65. data/doc/demo/public/stylesheets/scaffold.css +74 -0
  66. data/doc/demo/script/about +3 -0
  67. data/doc/demo/script/breakpointer +3 -0
  68. data/doc/demo/script/console +3 -0
  69. data/doc/demo/script/destroy +3 -0
  70. data/doc/demo/script/ferret_server +10 -0
  71. data/doc/demo/script/generate +3 -0
  72. data/doc/demo/script/performance/benchmarker +3 -0
  73. data/doc/demo/script/performance/profiler +3 -0
  74. data/doc/demo/script/plugin +3 -0
  75. data/doc/demo/script/process/inspector +3 -0
  76. data/doc/demo/script/process/reaper +3 -0
  77. data/doc/demo/script/process/spawner +3 -0
  78. data/doc/demo/script/process/spinner +3 -0
  79. data/doc/demo/script/runner +3 -0
  80. data/doc/demo/script/server +3 -0
  81. data/doc/demo/test/fixtures/comments.yml +12 -0
  82. data/doc/demo/test/fixtures/contents.yml +13 -0
  83. data/doc/demo/test/fixtures/remote_contents.yml +9 -0
  84. data/doc/demo/test/fixtures/shared_index1s.yml +7 -0
  85. data/doc/demo/test/fixtures/shared_index2s.yml +7 -0
  86. data/doc/demo/test/functional/admin/backend_controller_test.rb +35 -0
  87. data/doc/demo/test/functional/contents_controller_test.rb +81 -0
  88. data/doc/demo/test/functional/searches_controller_test.rb +71 -0
  89. data/doc/demo/test/smoke/drb_smoke_test.rb +321 -0
  90. data/doc/demo/test/smoke/process_stats.rb +21 -0
  91. data/doc/demo/test/test_helper.rb +30 -0
  92. data/doc/demo/test/unit/comment_test.rb +217 -0
  93. data/doc/demo/test/unit/content_test.rb +705 -0
  94. data/doc/demo/test/unit/ferret_result_test.rb +24 -0
  95. data/doc/demo/test/unit/multi_index_test.rb +329 -0
  96. data/doc/demo/test/unit/remote_index_test.rb +23 -0
  97. data/doc/demo/test/unit/shared_index1_test.rb +108 -0
  98. data/doc/demo/test/unit/shared_index2_test.rb +13 -0
  99. data/doc/demo/test/unit/sort_test.rb +21 -0
  100. data/doc/demo/test/unit/special_content_test.rb +25 -0
  101. data/doc/demo/vendor/plugins/will_paginate/LICENSE +18 -0
  102. data/doc/demo/vendor/plugins/will_paginate/README +108 -0
  103. data/doc/demo/vendor/plugins/will_paginate/Rakefile +23 -0
  104. data/doc/demo/vendor/plugins/will_paginate/init.rb +21 -0
  105. data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/collection.rb +45 -0
  106. data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/core_ext.rb +44 -0
  107. data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/finder.rb +159 -0
  108. data/doc/demo/vendor/plugins/will_paginate/lib/will_paginate/view_helpers.rb +95 -0
  109. data/doc/demo/vendor/plugins/will_paginate/test/array_pagination_test.rb +23 -0
  110. data/doc/demo/vendor/plugins/will_paginate/test/boot.rb +27 -0
  111. data/doc/demo/vendor/plugins/will_paginate/test/console +10 -0
  112. data/doc/demo/vendor/plugins/will_paginate/test/finder_test.rb +219 -0
  113. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/admin.rb +3 -0
  114. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/companies.yml +24 -0
  115. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/company.rb +23 -0
  116. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/developer.rb +11 -0
  117. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/developers_projects.yml +13 -0
  118. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/project.rb +4 -0
  119. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/projects.yml +7 -0
  120. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/replies.yml +20 -0
  121. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/reply.rb +5 -0
  122. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/schema.sql +44 -0
  123. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/topic.rb +19 -0
  124. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/topics.yml +30 -0
  125. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/user.rb +2 -0
  126. data/doc/demo/vendor/plugins/will_paginate/test/fixtures/users.yml +35 -0
  127. data/doc/demo/vendor/plugins/will_paginate/test/helper.rb +42 -0
  128. data/doc/demo/vendor/plugins/will_paginate/test/lib/activerecord_test_connector.rb +64 -0
  129. data/doc/demo/vendor/plugins/will_paginate/test/lib/load_fixtures.rb +10 -0
  130. data/doc/demo/vendor/plugins/will_paginate/test/pagination_test.rb +136 -0
  131. data/doc/monit-example +22 -0
  132. data/init.rb +24 -0
  133. data/install.rb +18 -0
  134. data/lib/act_methods.rb +147 -0
  135. data/lib/acts_as_ferret.rb +593 -0
  136. data/lib/ar_mysql_auto_reconnect_patch.rb +41 -0
  137. data/lib/blank_slate.rb +54 -0
  138. data/lib/bulk_indexer.rb +56 -0
  139. data/lib/class_methods.rb +279 -0
  140. data/lib/ferret_extensions.rb +192 -0
  141. data/lib/ferret_find_methods.rb +142 -0
  142. data/lib/ferret_result.rb +58 -0
  143. data/lib/ferret_server.rb +238 -0
  144. data/lib/index.rb +99 -0
  145. data/lib/instance_methods.rb +172 -0
  146. data/lib/local_index.rb +202 -0
  147. data/lib/more_like_this.rb +217 -0
  148. data/lib/multi_index.rb +133 -0
  149. data/lib/rdig_adapter.rb +149 -0
  150. data/lib/remote_functions.rb +43 -0
  151. data/lib/remote_index.rb +54 -0
  152. data/lib/remote_multi_index.rb +20 -0
  153. data/lib/search_results.rb +50 -0
  154. data/lib/server_manager.rb +71 -0
  155. data/lib/unix_daemon.rb +86 -0
  156. data/lib/without_ar.rb +52 -0
  157. data/recipes/aaf_recipes.rb +116 -0
  158. data/script/ferret_daemon +94 -0
  159. data/script/ferret_server +12 -0
  160. data/script/ferret_service +178 -0
  161. data/tasks/ferret.rake +39 -0
  162. metadata +246 -0
@@ -0,0 +1,172 @@
1
+ module ActsAsFerret #:nodoc:
2
+
3
+ module InstanceMethods
4
+ include ResultAttributes
5
+
6
+ # Returns an array of strings with the matches highlighted. The +query+ can
7
+ # either be a String or a Ferret::Search::Query object.
8
+ #
9
+ # === Options
10
+ #
11
+ # field:: field to take the content from. This field has
12
+ # to have its content stored in the index
13
+ # (:store => :yes in your call to aaf). If not
14
+ # given, all stored fields are searched, and the
15
+ # highlighted content found in all of them is returned.
16
+ # set :highlight => :no in the field options to
17
+ # avoid highlighting of contents from a :stored field.
18
+ # excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
19
+ # terms will be in the centre of the excerpt.
20
+ # num_excerpts:: Default: 2. Number of excerpts to return.
21
+ # pre_tag:: Default: "<em>". Tag to place to the left of the
22
+ # match.
23
+ # post_tag:: Default: "</em>". This tag should close the
24
+ # +:pre_tag+.
25
+ # ellipsis:: Default: "...". This is the string that is appended
26
+ # at the beginning and end of excerpts (unless the
27
+ # excerpt hits the start or end of the field). You'll
28
+ # probably want to change this to a Unicode ellipsis
29
+ # character.
30
+ def highlight(query, options = {})
31
+ self.class.highlight(self.ferret_key, query, options)
32
+ end
33
+
34
+ # re-enable ferret indexing for this instance after a call to #disable_ferret
35
+ def enable_ferret
36
+ @ferret_disabled = nil
37
+ end
38
+ alias ferret_enable enable_ferret # compatibility
39
+
40
+ # returns true if ferret indexing is enabled for this record.
41
+ #
42
+ # The optional is_bulk_index parameter will be true if the method is called
43
+ # by rebuild_index or bulk_index, and false otherwise.
44
+ #
45
+ # If is_bulk_index is true, the class level ferret_enabled state will be
46
+ # ignored by this method (per-instance ferret_enabled checks however will
47
+ # take place, so if you override this method to forbid indexing of certain
48
+ # records you're still safe).
49
+ def ferret_enabled?(is_bulk_index = false)
50
+ @ferret_disabled.nil? && (is_bulk_index || self.class.ferret_enabled?) && (aaf_configuration[:if].nil? || aaf_configuration[:if].call(self))
51
+ end
52
+
53
+ # Returns the analyzer to use when adding this record to the index.
54
+ #
55
+ # Override to return a specific analyzer for any record that is to be
56
+ # indexed, i.e. specify a different analyzer based on language. Returns nil
57
+ # by default so the global analyzer (specified with the acts_as_ferret
58
+ # call) is used.
59
+ def ferret_analyzer
60
+ nil
61
+ end
62
+
63
+ # Disable Ferret for this record for a specified amount of time. ::once will
64
+ # disable Ferret for the next call to #save (this is the default), ::always
65
+ # will do so for all subsequent calls.
66
+ #
67
+ # Note that this will turn off only the create and update hooks, but not the
68
+ # destroy hook. I think that's reasonable, if you think the opposite, please
69
+ # tell me.
70
+ #
71
+ # To manually trigger reindexing of a record after you're finished modifying
72
+ # it, you can call #ferret_update directly instead of #save (remember to
73
+ # enable ferret again before).
74
+ #
75
+ # When given a block, this will be executed without any ferret indexing of
76
+ # this object taking place. The optional argument in this case can be used
77
+ # to indicate if the object should be indexed after executing the block
78
+ # (::index_when_finished). Automatic Ferret indexing of this object will be
79
+ # turned on after the block has been executed. If passed ::index_when_true,
80
+ # the index will only be updated if the block evaluated to neither false nor nil.
81
+ #
82
+ def disable_ferret(option = :once)
83
+ if block_given?
84
+ @ferret_disabled = :always
85
+ result = yield
86
+ ferret_enable
87
+ ferret_update if option == :index_when_finished || (option == :index_when_true && result)
88
+ result
89
+ elsif [:once, :always].include?(option)
90
+ @ferret_disabled = option
91
+ else
92
+ raise ArgumentError.new("Invalid Argument #{option}")
93
+ end
94
+ end
95
+
96
+ # add to index
97
+ def ferret_create
98
+ if ferret_enabled?
99
+ logger.debug "ferret_create/update: #{self.ferret_key}"
100
+ self.class.aaf_index << self
101
+ else
102
+ ferret_enable if @ferret_disabled == :once
103
+ end
104
+ true # signal success to AR
105
+ end
106
+ alias :ferret_update :ferret_create
107
+
108
+
109
+ # remove from index
110
+ def ferret_destroy
111
+ logger.debug "ferret_destroy: #{self.ferret_key}"
112
+ begin
113
+ self.class.aaf_index.remove self.ferret_key
114
+ rescue
115
+ logger.warn("Could not find indexed value for this object: #{$!}\n#{$!.backtrace}")
116
+ end
117
+ true # signal success to AR
118
+ end
119
+
120
+ def ferret_key
121
+ "#{self.class.name}-#{self.send self.class.primary_key}" unless new_record?
122
+ end
123
+
124
+ # turn this instance into a ferret document (which basically is a hash of
125
+ # fieldname => value pairs)
126
+ def to_doc
127
+ logger.debug "creating doc for class: #{self.ferret_key}"
128
+ Ferret::Document.new.tap do |doc|
129
+ # store the id and class name of each item, and the unique key used for identifying the record
130
+ # even in multi-class indexes.
131
+ doc[:key] = self.ferret_key
132
+ doc[:id] = self.id.to_s
133
+ doc[:class_name] = self.class.name
134
+
135
+ # iterate through the fields and add them to the document
136
+ aaf_configuration[:defined_fields].each_pair do |field, config|
137
+ doc[field] = self.send("#{field}_to_ferret") unless config[:ignore]
138
+ end
139
+ if aaf_configuration[:boost]
140
+ if self.respond_to?(aaf_configuration[:boost])
141
+ boost = self.send aaf_configuration[:boost]
142
+ doc.boost = boost.to_i if boost
143
+ else
144
+ logger.error "boost option should point to an instance method: #{aaf_configuration[:boost]}"
145
+ end
146
+ end
147
+ end
148
+ end
149
+
150
+ def document_number
151
+ self.class.aaf_index.document_number(self.ferret_key)
152
+ end
153
+
154
+ def query_for_record
155
+ self.class.aaf_index.query_for_record(self.ferret_key)
156
+ end
157
+
158
+ def content_for_field_name(field, via = field, dynamic_boost = nil)
159
+ field_data = (respond_to?(via) ? send(via) : instance_variable_get("@#{via}"))
160
+ field_data = (field_data.is_a?(Array) ? field_data.map{|d| d.to_s} : field_data.to_s)
161
+ # field_data = self.send(via) || self.instance_variable_get("@#{via}")
162
+ if (dynamic_boost && boost_value = self.send(dynamic_boost))
163
+ field_data = Ferret::Field.new(field_data)
164
+ field_data.boost = boost_value.to_i
165
+ end
166
+ field_data
167
+ end
168
+
169
+
170
+ end
171
+
172
+ end
@@ -0,0 +1,202 @@
1
+ module ActsAsFerret
2
+ class LocalIndex < AbstractIndex
3
+ include MoreLikeThis::IndexMethods
4
+
5
+ def initialize(index_name)
6
+ super
7
+ ensure_index_exists
8
+ end
9
+
10
+ def reopen!
11
+ logger.debug "reopening index at #{index_definition[:ferret][:path]}"
12
+ close
13
+ ferret_index
14
+ end
15
+
16
+ # The 'real' Ferret Index instance
17
+ def ferret_index
18
+ ensure_index_exists
19
+ (@ferret_index ||= Ferret::Index::Index.new(index_definition[:ferret])).tap do |idx|
20
+ idx.batch_size = index_definition[:reindex_batch_size]
21
+ idx.logger = logger
22
+ end
23
+ end
24
+
25
+ # Checks for the presence of a segments file in the index directory
26
+ # Rebuilds the index if none exists.
27
+ def ensure_index_exists
28
+ #logger.debug "LocalIndex: ensure_index_exists at #{index_definition[:index_dir]}"
29
+ unless File.file? "#{index_definition[:index_dir]}/segments"
30
+ ActsAsFerret::ensure_directory(index_definition[:index_dir])
31
+ rebuild_index
32
+ end
33
+ end
34
+
35
+ # Closes the underlying index instance
36
+ def close
37
+ @ferret_index.close if @ferret_index
38
+ rescue StandardError
39
+ # is raised when index already closed
40
+ ensure
41
+ @ferret_index = nil
42
+ end
43
+
44
+ # rebuilds the index from all records of the model classes associated with this index
45
+ def rebuild_index
46
+ models = index_definition[:registered_models]
47
+ logger.debug "rebuild index with models: #{models.inspect}"
48
+ close
49
+ index = Ferret::Index::Index.new(index_definition[:ferret].dup.update(:auto_flush => false,
50
+ :field_infos => ActsAsFerret::field_infos(index_definition),
51
+ :create => true))
52
+ index.batch_size = index_definition[:reindex_batch_size]
53
+ index.logger = logger
54
+ index.index_models models
55
+ reopen!
56
+ end
57
+
58
+ def bulk_index(class_name, ids, options)
59
+ ferret_index.bulk_index(class_name.constantize, ids, options)
60
+ end
61
+
62
+ # Parses the given query string into a Ferret Query object.
63
+ def process_query(query, options = {})
64
+ return query unless String === query
65
+ ferret_index.synchronize do
66
+ if options[:analyzer]
67
+ # use per-query analyzer if present
68
+ qp = Ferret::QueryParser.new ferret_index.instance_variable_get('@options').merge(options)
69
+ reader = ferret_index.reader
70
+ qp.fields =
71
+ reader.fields unless options[:all_fields] || options[:fields]
72
+ qp.tokenized_fields =
73
+ reader.tokenized_fields unless options[:tokenized_fields]
74
+ return qp.parse query
75
+ else
76
+ return ferret_index.process_query(query)
77
+ end
78
+ end
79
+ end
80
+
81
+ # Total number of hits for the given query.
82
+ def total_hits(query, options = {})
83
+ ferret_index.search(process_query(query, options), options).total_hits
84
+ end
85
+
86
+ def searcher
87
+ ferret_index
88
+ end
89
+
90
+
91
+ ######################################
92
+ # methods working on a single record
93
+ # called from instance_methods, here to simplify interfacing with the
94
+ # remote ferret server
95
+ # TODO having to pass id and class_name around like this isn't nice
96
+ ######################################
97
+
98
+ # add record to index
99
+ # record may be the full AR object, a Ferret document instance or a Hash
100
+ def add(record, analyzer = nil)
101
+ unless Hash === record || Ferret::Document === record
102
+ analyzer = record.ferret_analyzer
103
+ record = record.to_doc
104
+ end
105
+ ferret_index.add_document(record, analyzer)
106
+ end
107
+ alias << add
108
+
109
+ # delete record from index
110
+ def remove(key)
111
+ ferret_index.delete key
112
+ end
113
+
114
+ # highlight search terms for the record with the given key.
115
+ def highlight(key, query, options = {})
116
+ logger.debug("highlight: #{key} query: #{query}")
117
+ options.reverse_merge! :num_excerpts => 2, :pre_tag => '<em>', :post_tag => '</em>'
118
+ highlights = []
119
+ ferret_index.synchronize do
120
+ doc_num = document_number(key)
121
+
122
+ if options[:field]
123
+ highlights << ferret_index.highlight(query, doc_num, options)
124
+ else
125
+ query = process_query(query) # process only once
126
+ index_definition[:ferret_fields].each_pair do |field, config|
127
+ next if config[:store] == :no || config[:highlight] == :no
128
+ options[:field] = field
129
+ highlights << ferret_index.highlight(query, doc_num, options)
130
+ end
131
+ end
132
+ end
133
+ return highlights.compact.flatten[0..options[:num_excerpts]-1]
134
+ end
135
+
136
+ # retrieves the ferret document number of the record with the given key.
137
+ def document_number(key)
138
+ docnum = ferret_index.doc_number(key)
139
+ # hits = ferret_index.search query_for_record(key)
140
+ # return hits.hits.first.doc if hits.total_hits == 1
141
+ raise "cannot determine document number for record #{key}" if docnum.nil?
142
+ docnum
143
+ end
144
+
145
+ # build a ferret query matching only the record with the given key
146
+ # the class name only needs to be given in case of a shared index configuration
147
+ def query_for_record(key)
148
+ return Ferret::Search::TermQuery.new(:key, key.to_s)
149
+ # if shared?
150
+ # raise InvalidArgumentError.new("shared index needs class_name argument") if class_name.nil?
151
+ # Ferret::Search::BooleanQuery.new.tap do |bq|
152
+ # bq.add_query(Ferret::Search::TermQuery.new(:id, id.to_s), :must)
153
+ # bq.add_query(Ferret::Search::TermQuery.new(:class_name, class_name), :must)
154
+ # end
155
+ # else
156
+ # Ferret::Search::TermQuery.new(:id, id.to_s)
157
+ # end
158
+ end
159
+
160
+
161
+ # retrieves stored fields from index definition in case the fields to retrieve
162
+ # haven't been specified with the :lazy option
163
+ def determine_stored_fields(options = {})
164
+ stored_fields = options[:lazy]
165
+ if stored_fields && !(Array === stored_fields)
166
+ stored_fields = index_definition[:ferret_fields].select { |field, config| config[:store] == :yes }.map(&:first)
167
+ end
168
+ logger.debug "stored_fields: #{stored_fields.inspect}"
169
+ return stored_fields
170
+ end
171
+
172
+ # loads data for fields declared as :lazy from the Ferret document
173
+ def extract_stored_fields(doc, stored_fields)
174
+ data = {}
175
+ unless stored_fields.nil?
176
+ logger.debug "extracting stored fields #{stored_fields.inspect} from document #{doc[:class_name]} / #{doc[:id]}"
177
+ fields = index_definition[:ferret_fields]
178
+ stored_fields.each do |field|
179
+ if field_cfg = fields[field]
180
+ data[field_cfg[:via]] = doc[field]
181
+ end
182
+ end
183
+ logger.debug "done: #{data.inspect}"
184
+ end
185
+ return data
186
+ end
187
+
188
+ protected
189
+
190
+ # returns a MultiIndex instance operating on a MultiReader
191
+ #def multi_index(model_classes)
192
+ # model_classes.map!(&:constantize) if String === model_classes.first
193
+ # model_classes.sort! { |a, b| a.name <=> b.name }
194
+ # key = model_classes.inject("") { |s, clazz| s + clazz.name }
195
+ # multi_config = index_definition[:ferret].dup
196
+ # multi_config.delete :default_field # we don't want the default field list of *this* class for multi_searching
197
+ # ActsAsFerret::multi_indexes[key] ||= MultiIndex.new(model_classes, multi_config)
198
+ #end
199
+
200
+ end
201
+
202
+ end
@@ -0,0 +1,217 @@
1
+ module ActsAsFerret #:nodoc:
2
+
3
+ module MoreLikeThis
4
+
5
+ module InstanceMethods
6
+
7
+ # returns other instances of this class, which have similar contents
8
+ # to this one. Basically works like this: find out the n most interesting
9
+ # (i.e. characteristic) terms from this document, and then build a
10
+ # query from those which is run against the whole index. Which terms
11
+ # are interesting is decided on various criteria which can be
12
+ # influenced by the given options.
13
+ #
14
+ # The algorithm used here is a fairly direct port of the MoreLikeThis class
15
+ # from Apache Lucene.
16
+ #
17
+ # options are:
18
+ # :field_names : Array of field names to use for similarity search (mandatory)
19
+ # :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
20
+ # :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
21
+ # :min_word_length => 0, # Ignore words shorter than this length (longer words tend to
22
+ # be more characteristic for the document they occur in).
23
+ # :max_word_length => 0, # Ignore words if greater than this len (0 disables the check).
24
+ # :max_query_terms => 25, # maximum number of terms in the query built
25
+ # :max_num_tokens => 5000, # maximum number of tokens to examine in a single field
26
+ # :boost => false, # when true, a boost according to the relative score of
27
+ # a term is applied to this Term's TermQuery.
28
+ # :similarity => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity' # the similarity implementation to use (the default
29
+ # equals Ferret's internal similarity implementation)
30
+ # :analyzer => 'Ferret::Analysis::StandardAnalyzer' # class name of the analyzer to use
31
+ # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
32
+ # ferret_options : Ferret options handed over to find_with_ferret (i.e. for limits and sorting)
33
+ # ar_options : options handed over to find_with_ferret for AR scoping
34
+ def more_like_this(options = {}, ferret_options = {}, ar_options = {})
35
+ options = {
36
+ :field_names => nil, # Default field names
37
+ :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
38
+ :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
39
+ :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
40
+ :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
41
+ :max_query_terms => 25, # maximum number of terms in the query built
42
+ :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
43
+ :boost => false,
44
+ :similarity => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity', # class name of the similarity implementation to use
45
+ :analyzer => 'Ferret::Analysis::StandardAnalyzer', # class name of the analyzer to use
46
+ :append_to_query => nil,
47
+ :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_with_ferret can be used to retrieve results from other classes, too
48
+ }.update(options)
49
+ #index.search_each('id:*') do |doc, score|
50
+ # puts "#{doc} == #{index[doc][:description]}"
51
+ #end
52
+ clazz = options[:base_class]
53
+ options[:base_class] = clazz.name
54
+ query = clazz.aaf_index.build_more_like_this_query(self.ferret_key, self.id, options)
55
+ options[:append_to_query].call(query) if options[:append_to_query]
56
+ clazz.find_with_ferret(query, ferret_options, ar_options)
57
+ end
58
+
59
+ end
60
+
61
+ module IndexMethods
62
+
63
+ # TODO to allow morelikethis for unsaved records, we have to give the
64
+ # unsaved record's data to this method. check how this will work out
65
+ # via drb...
66
+ def build_more_like_this_query(key, id, options)
67
+ [:similarity, :analyzer].each { |sym| options[sym] = options[sym].constantize.new }
68
+ ferret_index.synchronize do # avoid that concurrent writes close our reader
69
+ ferret_index.send(:ensure_reader_open)
70
+ reader = ferret_index.send(:reader)
71
+ term_freq_map = retrieve_terms(key, id, reader, options)
72
+ priority_queue = create_queue(term_freq_map, reader, options)
73
+ create_query(key, priority_queue, options)
74
+ end
75
+ end
76
+
77
+ protected
78
+
79
+ def create_query(key, priority_queue, options={})
80
+ query = Ferret::Search::BooleanQuery.new
81
+ qterms = 0
82
+ best_score = nil
83
+ while(cur = priority_queue.pop)
84
+ term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)
85
+
86
+ if options[:boost]
87
+ # boost term according to relative score
88
+ # TODO untested
89
+ best_score ||= cur.score
90
+ term_query.boost = cur.score / best_score
91
+ end
92
+ begin
93
+ query.add_query(term_query, :should)
94
+ rescue Ferret::Search::BooleanQuery::TooManyClauses
95
+ break
96
+ end
97
+ qterms += 1
98
+ break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
99
+ end
100
+ # exclude the original record
101
+ query.add_query(query_for_record(key), :must_not)
102
+ return query
103
+ end
104
+
105
+
106
+
107
+ # creates a term/term_frequency map for terms from the fields
108
+ # given in options[:field_names]
109
+ def retrieve_terms(key, id, reader, options)
110
+ raise "more_like_this atm only works on saved records" if key.nil?
111
+ document_number = document_number(key) rescue nil
112
+ field_names = options[:field_names]
113
+ max_num_tokens = options[:max_num_tokens]
114
+ term_freq_map = Hash.new(0)
115
+ doc = nil
116
+ record = nil
117
+ field_names.each do |field|
118
+ #puts "field: #{field}"
119
+ term_freq_vector = reader.term_vector(document_number, field) if document_number
120
+ #if false
121
+ if term_freq_vector
122
+ # use stored term vector
123
+ # puts 'using stored term vector'
124
+ term_freq_vector.terms.each do |term|
125
+ term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
126
+ end
127
+ else
128
+ # puts 'no stored term vector'
129
+ # no term vector stored, but we have stored the contents in the index
130
+ # -> extract terms from there
131
+ content = nil
132
+ if document_number
133
+ doc = reader[document_number]
134
+ content = doc[field]
135
+ end
136
+ unless content
137
+ # no term vector, no stored content, so try content from this instance
138
+ record ||= options[:base_class].constantize.find(id)
139
+ content = record.content_for_field_name(field.to_s)
140
+ end
141
+ puts "have doc: #{doc[:id]} with #{field} == #{content}"
142
+ token_count = 0
143
+
144
+ ts = options[:analyzer].token_stream(field, content)
145
+ while token = ts.next
146
+ break if (token_count+=1) > max_num_tokens
147
+ next if noise_word?(token.text, options)
148
+ term_freq_map[token.text] += 1
149
+ end
150
+ end
151
+ end
152
+ term_freq_map
153
+ end
154
+
155
+ # create an ordered(by score) list of word,fieldname,score
156
+ # structures
157
+ def create_queue(term_freq_map, reader, options)
158
+ pq = Array.new(term_freq_map.size)
159
+
160
+ similarity = options[:similarity]
161
+ num_docs = reader.num_docs
162
+ term_freq_map.each_pair do |word, tf|
163
+ # filter out words that don't occur enough times in the source
164
+ next if options[:min_term_freq] && tf < options[:min_term_freq]
165
+
166
+ # go through all the fields and find the largest document frequency
167
+ top_field = options[:field_names].first
168
+ doc_freq = 0
169
+ options[:field_names].each do |field_name|
170
+ freq = reader.doc_freq(field_name, word)
171
+ if freq > doc_freq
172
+ top_field = field_name
173
+ doc_freq = freq
174
+ end
175
+ end
176
+ # filter out words that don't occur in enough docs
177
+ next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
178
+ next if doc_freq == 0 # index update problem ?
179
+
180
+ idf = similarity.idf(doc_freq, num_docs)
181
+ score = tf * idf
182
+ pq << FrequencyQueueItem.new(word, top_field, score)
183
+ end
184
+ pq.compact!
185
+ pq.sort! { |a,b| a.score<=>b.score }
186
+ return pq
187
+ end
188
+
189
+ def noise_word?(text, options)
190
+ len = text.length
191
+ (
192
+ (options[:min_word_length] > 0 && len < options[:min_word_length]) ||
193
+ (options[:max_word_length] > 0 && len > options[:max_word_length]) ||
194
+ (options[:stop_words] && options.include?(text))
195
+ )
196
+ end
197
+
198
+ end
199
+
200
+ class DefaultAAFSimilarity
201
+ def idf(doc_freq, num_docs)
202
+ return 0.0 if num_docs == 0
203
+ return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
204
+ end
205
+ end
206
+
207
+
208
+ class FrequencyQueueItem
209
+ attr_reader :word, :field, :score
210
+ def initialize(word, field, score)
211
+ @word = word; @field = field; @score = score
212
+ end
213
+ end
214
+
215
+ end
216
+ end
217
+