acts_as_ferret 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,203 @@
1
+ module FerretMixin
2
+ module Acts #:nodoc:
3
+ module ARFerret #:nodoc:
4
+
5
+ module MoreLikeThis
6
+
7
# Minimal stand-in for Ferret's Similarity implementation, providing only
# the inverse document frequency (idf) computation needed by the
# more-like-this term scoring below.
class DefaultAAFSimilarity
  # Inverse document frequency of a term.
  #
  # doc_freq - number of documents the term occurs in
  # num_docs - total number of documents in the index
  #
  # Returns 0.0 for an empty index, otherwise log(N / (df + 1)) + 1.
  def idf(doc_freq, num_docs)
    return 0.0 if num_docs.zero?
    Math.log(num_docs.to_f / (doc_freq + 1)) + 1.0
  end
end
13
+
14
# Returns other instances of this class whose contents are similar to
# this one's. Works by extracting the n most characteristic terms from
# this document and building a query from them which is run against the
# whole index. A fairly straight port of Apache Lucene's MoreLikeThis.
#
# options:
#   :field_names     => nil,   # Array of field names to use (mandatory)
#   :min_term_freq   => 2,     # ignore terms rarer than this in the source doc
#   :min_doc_freq    => 5,     # ignore words occurring in fewer docs than this
#   :min_word_length => 0,     # ignore words shorter than this (0 = no limit)
#   :max_word_length => 0,     # ignore words longer than this (0 = no limit)
#   :max_query_terms => 25,    # maximum number of terms in the query built
#   :max_num_tokens  => 5000,  # maximum tokens to analyze per field
#   :boost           => false, # boost each TermQuery by its relative score
#   :similarity      => DefaultAAFSimilarity.new, # idf implementation
#   :analyzer        => Ferret::Analysis::StandardAnalyzer.new,
#   :append_to_query => nil,   # proc given the query for further tweaking,
#                              # e.g. to constrain STI searches to one class
#   :base_class      => self.class # class whose find_by_contents is used
#
# find_options are handed through to find_by_contents.
def more_like_this(options = {}, find_options = {})
  options = {
    :field_names => nil,
    :min_term_freq => 2,
    :min_doc_freq => 5,
    :min_word_length => 0,
    :max_word_length => 0,
    :max_query_terms => 25,
    :max_num_tokens => 5000,
    :boost => false,
    :similarity => DefaultAAFSimilarity.new,
    :analyzer => Ferret::Analysis::StandardAnalyzer.new,
    :append_to_query => nil,
    :base_class => self.class
  }.update(options)
  index = self.class.ferret_index
  index.synchronize do # avoid concurrent writes closing our reader
    index.send(:ensure_reader_open)
    reader = index.send(:reader)
    doc_number = self.document_number
    # BUGFIX: pass the local we just computed instead of calling the
    # document_number method a second time (the local was unused before).
    term_freq_map = retrieve_terms(doc_number, reader, options)
    priority_queue = create_queue(term_freq_map, reader, options)
    query = create_query(priority_queue, options)
    logger.debug "morelikethis-query: #{query}"
    options[:append_to_query].call(query) if options[:append_to_query]
    options[:base_class].find_by_contents(query, find_options)
  end
end
73
+
74
+
75
# Builds the boolean Ferret query used to find similar documents from
# the score-ordered list produced by create_queue. Stops adding terms
# once :max_query_terms is reached or the query refuses more clauses,
# and always excludes this very document via its :id field.
def create_query(priority_queue, options = {})
  query = Ferret::Search::BooleanQuery.new
  term_count = 0
  top_score = nil
  while (item = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(item.field, item.word)
    if options[:boost]
      # boost each term relative to the best-scoring one
      # TODO untested
      top_score ||= item.score
      term_query.boost = item.score / top_score
    end
    begin
      query.add_query(term_query, :should)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      break
    end
    term_count += 1
    break if options[:max_query_terms] > 0 && term_count >= options[:max_query_terms]
  end
  # exclude ourselves
  query.add_query(Ferret::Search::TermQuery.new(:id, self.id.to_s), :must_not)
  query
end
100
+
101
+
102
+
103
# Creates a term => frequency map for terms from the fields given in
# options[:field_names] of the document identified by doc_number.
# Prefers a stored term vector; when none is available, re-analyzes the
# stored field contents (falling back to this instance's attribute value
# when nothing was stored either).
def retrieve_terms(doc_number, reader, options)
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  doc = nil
  field_names.each do |field|
    # BUGFIX: use the doc_number parameter instead of calling the
    # document_number method again for every field.
    term_freq_vector = reader.term_vector(doc_number, field)
    if term_freq_vector
      # a term vector is stored for this field -- use it directly
      term_freq_vector.terms.each do |term|
        term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
      end
    else
      # no term vector stored, but we may have stored the contents in the
      # index -> extract terms from there (fetch the doc only once)
      doc ||= reader[doc_number]
      content = doc[field]
      # no term vector, no stored content: try content from this instance
      content ||= content_for_field_name(field.to_s)
      token_count = 0
      ts = options[:analyzer].token_stream(field, content)
      while token = ts.next
        break if (token_count += 1) > max_num_tokens
        next if noise_word?(token.text, options)
        term_freq_map[token.text] += 1
      end
    end
  end
  term_freq_map
end
143
+
144
# Creates a list of (word, field, score) items for the given
# term => frequency map, ordered by ascending score. For each term the
# field with the largest document frequency is recorded.
#
# Fix: the original pre-allocated the array with Array.new(size), which
# filled it with nils that << appended after and compact! then removed;
# a plain empty array does the same job without the nil shuffle.
def create_queue(term_freq_map, reader, options)
  queue = []
  similarity = options[:similarity]
  num_docs = reader.num_docs
  term_freq_map.each_pair do |word, tf|
    # filter out words that don't occur enough times in the source doc
    next if options[:min_term_freq] && tf < options[:min_term_freq]

    # go through all the fields and find the largest document frequency
    top_field = options[:field_names].first
    doc_freq = 0
    options[:field_names].each do |field_name|
      freq = reader.doc_freq(field_name, word)
      if freq > doc_freq
        top_field = field_name
        doc_freq = freq
      end
    end
    # filter out words that don't occur in enough docs
    next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
    next if doc_freq == 0 # index update problem ?

    score = tf * similarity.idf(doc_freq, num_docs)
    queue << FrequencyQueueItem.new(word, top_field, score)
  end
  queue.sort! { |a, b| a.score <=> b.score }
  queue
end
177
+
178
# Decides whether +text+ should be ignored when collecting interesting
# terms: too short, too long (0 disables either limit), or listed in the
# optional :stop_words array.
def noise_word?(text, options)
  len = text.length
  return true if options[:min_word_length] > 0 && len < options[:min_word_length]
  return true if options[:max_word_length] > 0 && len > options[:max_word_length]
  # BUGFIX: consult the stop word list itself -- the original called
  # options.include?(text), which checked the options *hash keys* and
  # therefore never matched any stop word.
  return true if options[:stop_words] && options[:stop_words].include?(text)
  false
end
186
+
187
# Fetches the content for the named field from this model instance,
# trying the attribute hash (self[field]) first, then an instance
# variable of the same name, and finally a reader method.
def content_for_field_name(field)
  value = self[field]
  value ||= instance_variable_get("@#{field}".to_sym)
  value || send(field.to_sym)
end
190
+
191
+ end
192
+
193
# Simple value object holding a candidate term together with the field
# it was found in and its relevance score.
class FrequencyQueueItem
  attr_reader :word, :field, :score

  def initialize(word, field, score)
    @word  = word
    @field = field
    @score = score
  end
end
199
+
200
+ end
201
+ end
202
+ end
203
+
@@ -0,0 +1,87 @@
1
+ module FerretMixin
2
+ module Acts #:nodoc:
3
+ module ARFerret #:nodoc:
4
+ # not threadsafe
5
+ class MultiIndex
6
+
7
# todo: check for necessary index rebuilds in this place, too
# idea - each class gets a create_reader method that does this
#
# Creates a multi-index spanning the Ferret indexes of the given model
# classes. The default search fields of all models are combined into a
# single :default_field list (overridable via options).
def initialize(model_classes, options = {})
  @model_classes = model_classes
  # Array() copes with models configuring either a single default field
  # or an array of them (the bare + raised a TypeError for scalars).
  default_fields = @model_classes.inject([]) do |fields, clazz|
    fields + Array(clazz.ferret_configuration[:default_field])
  end
  @options = {
    :default_field => default_fields
  }.update(options)
end
18
+
19
# Runs +query+ (a String or a Ferret query object) against the combined
# index and returns the raw Ferret search result.
def search(query, options = {})
  parsed = process_query(query)
  searcher.search(parsed, options)
end
25
+
26
# Like #search, but yields each hit to the given block instead of
# returning a result object.
def search_each(query, options = {}, &block)
  searcher.search_each(process_query(query), options, &block)
end
30
+
31
# Checks whether all our sub-readers are still up to date with their
# underlying indexes. Always false when no reader has been opened yet.
def latest?
  return false unless @reader
  # segfaults with 0.10.4 --> TODO report as bug @reader.latest?
  @sub_readers.all? { |sub_reader| sub_reader.latest? }
end
40
+
41
# Returns the Ferret searcher over the combined index, (re)opening the
# underlying readers first if any of them has gone stale.
def searcher
  ensure_searcher
  @searcher
end
45
+
46
# Retrieves the stored document with index +i+ from the combined index.
# Also available as #[].
def doc(i)
  searcher[i]
end
alias :[] :doc
50
+
51
# Lazily builds the query parser configured with this index's options
# (notably the combined :default_field list of all model classes).
def query_parser
  @query_parser = Ferret::QueryParser.new(@options) if @query_parser.nil?
  @query_parser
end
54
+
55
# Turns a query string into a Ferret query object via the query parser;
# anything that is not a String is passed through unchanged.
def process_query(query)
  query.is_a?(String) ? query_parser.parse(query) : query
end
59
+
60
# Closes the searcher and reader if they are open.
#
# Fix: the handles are cleared after closing so a repeated #close (or a
# later #latest? / refresh in ensure_searcher) does not operate on
# already-closed Ferret objects.
def close
  @searcher.close if @searcher
  @searcher = nil
  @reader.close if @reader
  @reader = nil
end
64
+
65
+ protected
66
+
67
# (Re)opens the multi reader and searcher unless all current
# sub-readers are still up to date.
#
# Fixes: rescue StandardError instead of Exception (the latter also
# swallowed SignalException/SystemExit), and drop readers that failed
# to open instead of passing nil on to IndexReader.new.
def ensure_searcher
  return if latest?
  @sub_readers = @model_classes.map { |clazz|
    begin
      Ferret::Index::IndexReader.new(clazz.class_index_dir)
    rescue StandardError
      puts "error opening #{clazz.class_index_dir}: #{$!}"
      nil
    end
  }.compact
  close
  @reader = Ferret::Index::IndexReader.new(@sub_readers)
  @searcher = Ferret::Search::Searcher.new(@reader)
end
82
+
83
+ end # of class MultiIndex
84
+
85
+ end
86
+ end
87
+ end
data/rakefile ADDED
@@ -0,0 +1,191 @@
1
# rakefile for acts_as_ferret.
# use to create a gem or generate rdoc api documentation.
#
# heavily based on the one from the acts_as_searchable plugin.

require 'rake'
require 'rake/rdoctask'
require 'rake/packagetask'
require 'rake/gempackagetask'
require 'rake/testtask'
require 'rake/contrib/rubyforgepublisher'

# Gem/package identity used by the packaging and upload tasks below.
PKG_NAME = 'acts_as_ferret'
PKG_VERSION = '0.3.1'
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
# RubyForge coordinates used by the :pdoc and :rubyforge_upload tasks.
RUBY_FORGE_PROJECT = 'actsasferret'
RUBY_FORGE_USER = 'jkraemer'

desc 'Default: run unit tests.'
task :default => :test

# Runs all *_test.rb files under test/ with lib/ on the load path.
# NOTE(review): the description says acts_as_searchable -- apparently
# left over from the plugin this rakefile is based on.
desc 'Test the acts_as_searchable plugin.'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end
28
+
29
# Generates the RDoc API documentation into html/, with README as the
# front page. An alternative rdoc template can be selected by setting
# the 'template' environment variable.
desc 'Generate documentation for the acts_as_ferret plugin.'
Rake::RDocTask.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'html'
  rdoc.title = "acts_as_ferret - Ferret based full text search for any ActiveRecord model"
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.options << '--main' << 'README'
  rdoc.rdoc_files.include('README', 'LICENSE')
  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
  rdoc.rdoc_files.include('lib/**/*.rb')
end
39
+
40
# Gem specification used by the package tasks below.
spec = Gem::Specification.new do |s|
  s.name = PKG_NAME
  s.version = PKG_VERSION
  s.platform = Gem::Platform::RUBY
  s.summary = "acts_as_ferret - Ferret based full text search for any ActiveRecord model"
  # Package every file except build artifacts, logs, pkg/, svn metadata,
  # backups and editor droppings.
  # NOTE(review): this glob still picks up vim swap files (the shipped
  # gem metadata lists .rakefile.swp etc.) -- consider adding /\.sw.$/
  # to the reject list.
  s.files = Dir.glob('**/*', File::FNM_DOTMATCH).reject do |f|
    [ /\.$/, /sqlite$/, /\.log$/, /^pkg/, /\.svn/,
      /\~$/, /\/\._/, /\/#/ ].any? {|regex| f =~ regex }
  end
  #s.files = FileList["{lib,test}/**/*"].to_a + %w(README MIT-LICENSE CHANGELOG)
  # s.files.delete ...
  s.require_path = 'lib'
  s.autorequire = 'acts_as_ferret'
  s.has_rdoc = true
  # s.test_files = Dir['test/**/*_test.rb']
  s.author = "Jens Kraemer"
  s.email = "jk@jkraemer.net"
  s.homepage = "http://projects.jkraemer.net/acts_as_ferret"
end

# Builds pkg/<name>-<version>.gem and the matching .tgz archive.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_tar = true
end
63
+
64
# Uploads the generated rdoc (see :rdoc) to the project's RubyForge
# web space.
desc "Publish the API documentation"
task :pdoc => [:rdoc] do
  Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
end

# Convenience task: publish docs and release files in one go.
desc 'Publish the gem and API docs'
task :publish => [:pdoc, :rubyforge_upload]
71
+
72
# Uploads the packaged .gem and .tgz (built by :package) to RubyForge's
# file release system by scripting the FRS web forms over HTTP:
# scrape group id -> log in -> scrape package id -> create a release
# with the first file, then attach the remaining files to it.
desc "Publish the release files to RubyForge."
task :rubyforge_upload => :package do
  files = %w(gem tgz).map { |ext| "pkg/#{PKG_FILE_NAME}.#{ext}" }

  if RUBY_FORGE_PROJECT then
    require 'net/http'
    require 'open-uri'

    # scrape the numeric group id from the public project page
    project_uri = "http://rubyforge.org/projects/#{RUBY_FORGE_PROJECT}/"
    project_data = open(project_uri) { |data| data.read }
    group_id = project_data[/[?&]group_id=(\d+)/, 1]
    raise "Couldn't get group id" unless group_id

    # This echos password to shell which is a bit sucky
    if ENV["RUBY_FORGE_PASSWORD"]
      password = ENV["RUBY_FORGE_PASSWORD"]
    else
      print "#{RUBY_FORGE_USER}@rubyforge.org's password: "
      password = STDIN.gets.chomp
    end

    # log in through the regular login form to obtain a session cookie
    login_response = Net::HTTP.start("rubyforge.org", 80) do |http|
      data = [
        "login=Login",
        "form_loginname=#{RUBY_FORGE_USER}",
        "form_pw=#{password}"
      ].join("&")

      headers = { 'Content-Type' => 'application/x-www-form-urlencoded' }

      http.post("/account/login.php", data, headers)
    end

    cookie = login_response["set-cookie"]
    raise "Login failed" unless cookie
    headers = { "Cookie" => cookie }

    # scrape the package id from the (now authenticated) FRS admin page
    release_uri = "http://rubyforge.org/frs/admin/?group_id=#{group_id}"
    release_data = open(release_uri, headers) { |data| data.read }
    package_id = release_data[/[?&]package_id=(\d+)/, 1]
    raise "Couldn't get package id" unless package_id

    # the first file creates the release; later files are added to the
    # release id returned by that first upload
    first_file = true
    release_id = ""

    files.each do |filename|
      basename = File.basename(filename)
      file_ext = File.extname(filename)
      file_data = File.open(filename, "rb") { |file| file.read }

      puts "Releasing #{basename}..."

      release_response = Net::HTTP.start("rubyforge.org", 80) do |http|
        release_date = Time.now.strftime("%Y-%m-%d %H:%M")
        # RubyForge file type ids; 9999 = "other"
        type_map = {
          ".zip" => "3000",
          ".tgz" => "3110",
          ".gz" => "3110",
          ".gem" => "1400"
        }; type_map.default = "9999"
        type = type_map[file_ext]
        boundary = "rubyqMY6QN9bp6e4kS21H4y0zxcvoor"

        # form fields: create-release form for the first file,
        # add-file-to-release form for the rest
        query_hash = if first_file then
          {
            "group_id" => group_id,
            "package_id" => package_id,
            "release_name" => PKG_FILE_NAME,
            "release_date" => release_date,
            "type_id" => type,
            "processor_id" => "8000", # Any
            "release_notes" => "",
            "release_changes" => "",
            "preformatted" => "1",
            "submit" => "1"
          }
        else
          {
            "group_id" => group_id,
            "release_id" => release_id,
            "package_id" => package_id,
            "step2" => "1",
            "type_id" => type,
            "processor_id" => "8000", # Any
            "submit" => "Add This File"
          }
        end

        # hand-rolled multipart/form-data body: file part first, then
        # one part per form field, CRLF-separated
        data = [
          "--" + boundary,
          "Content-Disposition: form-data; name=\"userfile\"; filename=\"#{basename}\"",
          "Content-Type: application/octet-stream",
          "Content-Transfer-Encoding: binary",
          "", file_data, "",
          query_hash.collect do |name, value|
            [ "--" + boundary,
              "Content-Disposition: form-data; name='#{name}'",
              "", value, "" ]
          end
        ].flatten.join("\x0D\x0A")

        release_headers = headers.merge(
          "Content-Type" => "multipart/form-data; boundary=#{boundary}"
        )

        target = first_file ? "/frs/admin/qrs.php" : "/frs/admin/editrelease.php"
        http.post(target, data, release_headers)
      end

      # remember the release id the first upload created so subsequent
      # files can be attached to the same release
      if first_file then
        release_id = release_response.body[/release_id=(\d+)/, 1]
        raise("Couldn't get release id") unless release_id
      end

      first_file = false
    end
  end
end
190
+
191
+
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: acts_as_ferret
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.3.1
7
+ date: 2007-01-20 00:00:00 +01:00
8
+ summary: acts_as_ferret - Ferret based full text search for any ActiveRecord model
9
+ require_paths:
10
+ - lib
11
+ email: jk@jkraemer.net
12
+ homepage: http://projects.jkraemer.net/acts_as_ferret
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: acts_as_ferret
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Jens Kraemer
30
+ files:
31
+ - LICENSE
32
+ - rakefile
33
+ - init.rb
34
+ - lib
35
+ - README
36
+ - .rakefile.swp
37
+ - .init.rb.swp
38
+ - lib/multi_index.rb
39
+ - lib/acts_as_ferret.rb
40
+ - lib/instance_methods.rb
41
+ - lib/class_methods.rb
42
+ - lib/more_like_this.rb
43
+ - lib/.class_methods.rb.swp
44
+ - lib/.acts_as_ferret.rb.swp
45
+ - lib/.class_methods.rb.swo
46
+ test_files: []
47
+
48
+ rdoc_options: []
49
+
50
+ extra_rdoc_files: []
51
+
52
+ executables: []
53
+
54
+ extensions: []
55
+
56
+ requirements: []
57
+
58
+ dependencies: []
59
+