vss 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Mark Dodwell, mkdynamic
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest ADDED
@@ -0,0 +1,7 @@
1
+ LICENSE
2
+ Manifest
3
+ README.md
4
+ Rakefile
5
+ lib/vss.rb
6
+ lib/vss/engine.rb
7
+ lib/vss/tokenizer.rb
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # VSS - Vector Space Search
2
+
3
+ A simple vector space search engine with **tf*idf** ranking.
4
+
5
+ ## Install
6
+
7
+ Just install the gem:
8
+
9
+ gem install vss
10
+
11
+ ## Usage
12
+
13
+ To perform a search on a collection of documents:
14
+
15
+ require "vss"
16
+ docs = ["hello", "goodbye", "hello and goodbye", "hello, hello!"]
17
+ engine = VSS::Engine.new(docs)
18
+ engine.search("hello") #=> ["hello", "hello, hello!", "hello and goodbye", "goodbye"]
19
+
20
+ ## Rails/ActiveRecord
21
+
22
+ If you want to search a collection of `ActiveRecord` objects, you need to pass a **documentizer** `proc` when initializing `VSS::Engine` which will convert the objects into documents (which are simply strings). For example:
23
+
24
+ class Page < ActiveRecord::Base
25
+ #attrs: title, content
26
+ end
27
+
28
+ docs = Page.all
29
+ documentizer = proc { |record| record.title + " " + record.content }
30
+ engine = VSS::Engine.new(docs, documentizer)
31
+
32
+ ## Notes
33
+
34
+ This isn't designed to be used on huge collections of records. The original use case was ranking a smallish set of `ActiveRecord` results obtained via a query (using **SearchLogic**). So, essentially, the search consisted of two stages: getting the *corpus* via an SQL query, then running the vector space search on that corpus.
35
+
36
+ ## Credits
37
+
38
+ Heavily inspired by [Joseph Wilk's article on building a vector space search engine in Python](http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html).
39
+
40
+ Written by Mark Dodwell
41
+ ([Design & Code](http://madeofcode.com))
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ require "echoe"
2
+
3
# Declares the "vss" gem (version 0.1.0) through Echoe. The block parameter is
# named `gem` rather than `p` so it does not shadow Kernel#p.
Echoe.new("vss", "0.1.0") do |gem|
  # Authorship and project location.
  gem.author = "Mark Dodwell"
  gem.email = "labs@mkdynamic.co.uk"
  gem.url = "http://github.com/mkdynamic/vss"

  gem.description = "Simple vector space search engine"

  # The tokenizer calls String#stem, which the stemmer gem provides.
  gem.runtime_dependencies = ["stemmer >= 1.0.1"]
end
data/lib/vss/engine.rb ADDED
@@ -0,0 +1,102 @@
1
+ require "matrix"
2
+ require "vss/tokenizer"
3
+
4
module VSS
  # Vector space search over a fixed collection of records.
  #
  # Every record is converted to a document string, each document and each
  # query is turned into a tf*idf vector over the collection's vocabulary,
  # and records are ordered by the cosine similarity of their document vector
  # to the query vector.
  class Engine
    # records      - collection of objects to search (must respond to `map`
    #                and `each_with_index`).
    # documentizer - callable that takes a record and returns its document
    #                string; defaults to using the record itself.
    def initialize(records, documentizer = proc { |document| document })
      @records = records
      @documents = records.map { |record| documentizer.call(record) }
      @vocab = tokenize(@documents.join(" "))
    end

    # Ranks every record against `query`.
    #
    # Each record gets a singleton `rank` method returning its score for this
    # query (see #cosine_rank), so records must be objects that accept
    # singleton methods (not frozen, not immediates such as Integer).
    #
    # Returns an Array of all records, highest rank first. Records with equal
    # rank keep their original relative order.
    def search(query)
      query_vector = make_query_vector(query)
      ranks = @documents.map do |document|
        cosine_rank(query_vector, make_vector(document))
      end

      annotated = []
      @records.each_with_index do |record, i|
        rank = ranks[i]
        # A closure instead of evaluating a generated string: works for any
        # numeric value and never interpolates data into code.
        record.define_singleton_method(:rank) { rank }
        annotated << [record, rank, i]
      end

      # The index is the tie-breaker, which makes the ordering deterministic.
      annotated.sort_by { |_record, rank, i| [-rank, i] }.map(&:first)
    end

    private

    # Maps a cosine in -1..1 onto 0..100. tf*idf weights are never negative,
    # so in practice the result lies between 50 (nothing in common) and 100.
    def cosine_rank(vector1, vector2)
      (cosine(vector1, vector2) + 1) / 2 * 100
    end

    # Cosine of the angle between two vectors: (v1 . v2) / (||v1|| * ||v2||).
    # Vector#r is the Euclidean norm.
    #
    # A zero-length vector (e.g. a query sharing no term with the vocabulary)
    # has no direction; it is treated as having no similarity (0.0) instead
    # of producing NaN.
    def cosine(vector1, vector2)
      magnitude = vector1.r * vector2.r
      return 0.0 if magnitude.zero?

      vector1.inner_product(vector2) / magnitude
    end

    # Query vector: terms that are not in the vocabulary are dropped first.
    def make_query_vector(query)
      make_vector(query, true)
    end

    # Builds (and caches) the tf*idf vector for `string`
    # (http://en.wikipedia.org/wiki/Tf-idf).
    #
    # Unless `ensure_words_in_vocab` is true, every token of `string` must be
    # in the vocabulary; an unknown token raises KeyError. Queries therefore
    # go through #make_query_vector, which intersects tokens with the vocab.
    def make_vector(string, ensure_words_in_vocab = false)
      @vector_cache ||= {}
      # The flag is part of the key: the same string can yield different
      # token sets with and without the vocabulary intersection.
      @vector_cache[[string, ensure_words_in_vocab]] ||= begin
        arr = Array.new(vector_keyword_index.size, 0)

        words = tokenize(string)
        words &= @vocab if ensure_words_in_vocab
        words.uniq.each do |word|
          index = vector_keyword_index.fetch(word)
          # If the tokenizer de-duplicates its output, tf is always 1.
          tf = words.count(word)
          # Float division keeps fractional weights (4 docs / 3 hits = 1.33).
          idf = @documents.size.to_f / document_frequency[word]

          arr[index] = tf * idf
        end

        Vector.elements(arr, false)
      end
    end

    # Hash of term => number of documents containing that term, computed in
    # a single pass over the documents.
    def document_frequency
      @document_frequency ||= begin
        counts = Hash.new(0)
        @documents.each do |document|
          tokenize(document).uniq.each { |word| counts[word] += 1 }
        end
        counts
      end
    end

    # Hash of vocabulary term => position of that term in every vector.
    def vector_keyword_index
      @vector_keyword_index ||= begin
        index = {}
        @vocab.each_with_index { |keyword, offset| index[keyword] = offset }
        index
      end
    end

    # Memoized Tokenizer.tokenize, keyed by the input string.
    def tokenize(string)
      @tokenize_cache ||= {}
      @tokenize_cache[string] ||= Tokenizer.tokenize(string)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require "stemmer"
2
+
3
module VSS
  # Turns free text into a list of normalized search terms.
  class Tokenizer
    # Single letters and common English function words that carry no search
    # value. Frozen so the shared list cannot be mutated by accident.
    STOP_WORDS = %w[
      a b c d e f g h i j k l m n o p q r s t u v w x y z
      an and are as at be by for from has he in is it its
      of on that the to was were will with upon without among
    ].freeze

    # Tokenizes `string` (anything responding to `to_s`; nil gives []).
    #
    # Steps: strip punctuation (letters, digits, hyphens, apostrophes and
    # whitespace are kept), split on whitespace, downcase, drop stop words,
    # stem each word with String#stem (expected to come from the stemmer
    # gem), drop stems that are themselves stop words, and de-duplicate.
    #
    # Returns an Array of unique stemmed terms in order of first appearance.
    def self.tokenize(string)
      stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") # remove punctuation

      # `empty?` rather than ActiveSupport's `blank?`: after splitting on
      # whitespace the only blank token possible is the empty string produced
      # by leading whitespace, and this keeps the gem usable outside Rails.
      words = stripped.split(/\s+/).reject(&:empty?).map(&:downcase)

      # Stop words are removed before stemming, because stemming can change a
      # stop word into something that is not in the list.
      words = words.reject { |word| STOP_WORDS.include?(word) }

      # ...and again afterwards, so a word whose stem is a stop word is still
      # dropped.
      words.map(&:stem).reject { |word| STOP_WORDS.include?(word) }.uniq
    end
  end
end
+ end
data/lib/vss.rb ADDED
@@ -0,0 +1 @@
1
+ require "vss/engine"
data/vss.gemspec ADDED
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for vss 0.1.0.
Gem::Specification.new do |s|
  s.name = "vss"
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Mark Dodwell"]
  s.date = "2010-03-10"
  s.description = "Simple vector space search engine"
  s.email = "labs@mkdynamic.co.uk"
  s.extra_rdoc_files = ["LICENSE", "README.md", "lib/vss.rb", "lib/vss/engine.rb", "lib/vss/tokenizer.rb"]
  s.files = ["LICENSE", "Manifest", "README.md", "Rakefile", "lib/vss.rb", "lib/vss/engine.rb", "lib/vss/tokenizer.rb", "vss.gemspec"]
  s.homepage = "http://github.com/mkdynamic/vss"
  # The bundled LICENSE file is the MIT licence text.
  s.licenses = ["MIT"] if s.respond_to? :licenses=
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Vss", "--main", "README.md"]
  s.require_paths = ["lib"]
  # Guarded: this setter is deprecated in current RubyGems and may not exist.
  s.rubyforge_project = "vss" if s.respond_to? :rubyforge_project=
  s.rubygems_version = "1.3.5"
  s.summary = "Simple vector space search engine"

  s.specification_version = 3 if s.respond_to? :specification_version=

  # Same constraint as the Rakefile ("stemmer >= 1.0.1"). The previous
  # [">= 0", "= 1.0.1"] pair pinned the dependency to exactly 1.0.1.
  # add_runtime_dependency is always available here, because the spec
  # already requires RubyGems >= 1.2, so no fallback to add_dependency
  # (and no use of the removed Gem::RubyGemsVersion constant) is needed.
  s.add_runtime_dependency("stemmer", ">= 1.0.1")
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vss
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Mark Dodwell
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-03-10 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: stemmer
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ - - "="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.0.1
27
+ version:
28
+ description: Simple vector space search engine
29
+ email: labs@mkdynamic.co.uk
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files:
35
+ - LICENSE
36
+ - README.md
37
+ - lib/vss.rb
38
+ - lib/vss/engine.rb
39
+ - lib/vss/tokenizer.rb
40
+ files:
41
+ - LICENSE
42
+ - Manifest
43
+ - README.md
44
+ - Rakefile
45
+ - lib/vss.rb
46
+ - lib/vss/engine.rb
47
+ - lib/vss/tokenizer.rb
48
+ - vss.gemspec
49
+ has_rdoc: true
50
+ homepage: http://github.com/mkdynamic/vss
51
+ licenses: []
52
+
53
+ post_install_message:
54
+ rdoc_options:
55
+ - --line-numbers
56
+ - --inline-source
57
+ - --title
58
+ - Vss
59
+ - --main
60
+ - README.md
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "1.2"
74
+ version:
75
+ requirements: []
76
+
77
+ rubyforge_project: vss
78
+ rubygems_version: 1.3.5
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Simple vector space search engine
82
+ test_files: []
83
+