vss 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/Manifest +7 -0
- data/README.md +41 -0
- data/Rakefile +9 -0
- data/lib/vss/engine.rb +102 -0
- data/lib/vss/tokenizer.rb +17 -0
- data/lib/vss.rb +1 -0
- data/vss.gemspec +33 -0
- metadata +83 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Mark Dodwell, mkdynamic
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest
ADDED
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# VSS - Vector Space Search
|
2
|
+
|
3
|
+
A simple vector space search engine with **tf*idf** ranking.
|
4
|
+
|
5
|
+
## Install
|
6
|
+
|
7
|
+
Just install the gem:
|
8
|
+
|
9
|
+
gem install vss
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
To perform a search on a collection of documents:
|
14
|
+
|
15
|
+
require "vss"
|
16
|
+
docs = ["hello", "goodbye", "hello and goodbye", "hello, hello!"]
|
17
|
+
engine = VSS::Engine.new(docs)
|
18
|
+
engine.search("hello") #=> ["hello", "hello, hello!", "hello and goodbye", "goodbye"]
|
19
|
+
|
20
|
+
## Rails/ActiveRecord
|
21
|
+
|
22
|
+
If you want to search a collection of `ActiveRecord` objects, you need to pass a **documentizer** `proc` when initializing `VSS::Engine` which will convert the objects into documents (which are simply strings). For example:
|
23
|
+
|
24
|
+
class Page < ActiveRecord::Base
|
25
|
+
#attrs: title, content
|
26
|
+
end
|
27
|
+
|
28
|
+
docs = Page.all
|
29
|
+
documentizer = proc { |record| record.title + " " + record.content }
|
30
|
+
engine = VSS::Engine.new(docs, documentizer)
|
31
|
+
|
32
|
+
## Notes
|
33
|
+
|
34
|
+
This isn't designed to be used on huge collections of records. The original use case was ranking a smallish set of `ActiveRecord` results obtained via a query (using **SearchLogic**). So, essentially, the search consisted of two stages: getting the *corpus* via a SQL query, then running the vector space search on that.
|
35
|
+
|
36
|
+
## Credits
|
37
|
+
|
38
|
+
Heavily inspired by [Joseph Wilk's article on building a vector space search engine in Python](http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html).
|
39
|
+
|
40
|
+
Written by Mark Dodwell
|
41
|
+
([Design & Code](http://madeofcode.com))
|
data/Rakefile
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require "echoe"
|
2
|
+
|
3
|
+
# Packaging configuration for the "vss" gem, version 0.1.0, declared through
# Echoe's block DSL. Each assignment below sets one piece of gem metadata.
Echoe.new("vss", "0.1.0") do |gem_config|
  # Human-readable summary and project home page.
  gem_config.description = "Simple vector space search engine"
  gem_config.url = "http://github.com/mkdynamic/vss"

  # Maintainer contact details.
  gem_config.author = "Mark Dodwell"
  gem_config.email = "labs@mkdynamic.co.uk"

  # Gems needed at runtime (the tokenizer requires "stemmer").
  gem_config.runtime_dependencies = ["stemmer >= 1.0.1"]
end
|
data/lib/vss/engine.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
require "matrix"
|
2
|
+
require "vss/tokenizer"
|
3
|
+
|
4
|
+
module VSS
  # A small in-memory vector space search engine with tf*idf term weighting.
  #
  # Every record is converted to a document (a string). Documents and queries
  # are turned into term vectors over the corpus vocabulary, and records are
  # ordered by the cosine similarity between their vector and the query's.
  class Engine
    # records      - collection of objects to search; must respond to `map`,
    #                `each_with_index` and `sort`.
    # documentizer - callable that takes a record and converts it to a string.
    #                Defaults to returning the record itself.
    def initialize(records, documentizer = proc { |document| document })
      @records = records
      @documents = records.map { |record| documentizer.call(record) }
      # The vocabulary is the token list of the whole corpus joined together.
      @vocab = tokenize(@documents.join(" "))
    end

    # Ranks every record against `query`.
    #
    # Each record gets a singleton `rank` method returning its score for this
    # query (overwritten by the next search). Returns the records sorted from
    # highest to lowest rank.
    def search(query)
      # get ranks
      query_vector = make_query_vector(query)
      ranks = @documents.map do |document|
        cosine_rank(query_vector, make_vector(document))
      end

      # now annotate records
      @records.each_with_index do |record, i|
        annotate_rank(record, ranks[i])
      end

      # sort by rank and return
      @records.sort { |a, b| b.rank <=> a.rank } # highest to lowest
    end

    private

    # Defines a singleton `rank` method on `record` that returns `rank`.
    # A closure is used instead of evaluating generated source code, so any
    # Float (or other object) can be stored without having to round-trip
    # through Ruby syntax. The singleton-class form keeps this working on
    # Rubies without Object#define_singleton_method.
    def annotate_rank(record, rank)
      singleton = class << record; self; end
      singleton.send(:define_method, :rank) { rank }
    end

    # Maps a cosine in the range -1..1 onto a rank from 0 to 100.
    # The weights built by make_vector are never negative, so in practice the
    # cosine is >= 0 and a record with nothing in common with the query
    # ranks 50.
    def cosine_rank(vector1, vector2)
      (cosine(vector1, vector2) + 1) / 2 * 100
    end

    # see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
    # and http://ruby-doc.org/stdlib/libdoc/matrix/rdoc/index.html
    # will be in range -1 to 1
    #
    # Returns 0.0 when either vector has zero length (for example a query
    # made only of words outside the vocabulary). The quotient would
    # otherwise be NaN, which cannot be compared when sorting.
    def cosine(vector1, vector2)
      magnitude = vector1.r * vector2.r # Vector#r is same as ||v||
      return 0.0 if magnitude.zero?

      vector1.inner_product(vector2) / magnitude
    end

    # Builds the vector for a search query, dropping words that do not occur
    # anywhere in the corpus.
    def make_query_vector(query)
      make_vector(query, true)
    end

    # Builds (and caches per string) the tf*idf vector for `string`.
    # See http://en.wikipedia.org/wiki/Tf-idf
    #
    # string                - text to vectorize.
    # ensure_words_in_vocab - when true, tokens are intersected with the
    #                         vocabulary first. Words without a slot in the
    #                         keyword index are skipped in either case, so the
    #                         result depends on `string` alone and is safe to
    #                         cache under that key.
    #
    # Returns a Vector with one component per vocabulary word.
    # Note that the cache grows with every distinct query string.
    def make_vector(string, ensure_words_in_vocab = false)
      @vector_cache ||= {} # keep the cache across calls
      @vector_cache[string] ||= begin
        arr = Array.new(vector_keyword_index.size, 0)

        words = tokenize(string)
        words &= @vocab if ensure_words_in_vocab
        words.uniq.each do |word|
          index = vector_keyword_index[word]
          next if index.nil? # word is not part of the corpus vocabulary

          document_frequency = count_in_array(@documents, proc { |doc| tokenize(doc).include?(word) })
          next if document_frequency.zero?

          # NOTE(review): VSS::Tokenizer.tokenize de-duplicates its output, so
          # this count is currently always 1; confirm whether a raw term
          # frequency was intended.
          tf = count_in_array(words, word)
          # Float division: Integer division would truncate the weight
          # (e.g. 4 documents / 3 matches would give 1 instead of 1.33).
          idf = @documents.size.to_f / document_frequency

          arr[index] = tf * idf
        end

        Vector.elements(arr, false)
      end
    end

    # Memoized Hash mapping each vocabulary word to its position in a vector.
    def vector_keyword_index
      @vector_keyword_index ||= begin
        index = {}
        @vocab.each_with_index { |keyword, offset| index[keyword] = offset }
        index
      end
    end

    # Tokenizes `string` with VSS::Tokenizer, caching the result per string.
    def tokenize(string)
      @tokenize_cache ||= {}
      @tokenize_cache[string] ||= Tokenizer.tokenize(string)
    end

    # Counts the elements of `array` that equal `item`, or, when `item` is a
    # Proc, the elements for which the proc returns a truthy value.
    # (Array#count would do, but Ruby 1.8.6 does not have it; only 1.8.7+.)
    def count_in_array(array, item)
      count = 0
      if item.is_a? Proc
        array.each { |i| count += 1 if item.call(i) }
      else
        array.each { |i| count += 1 if i == item }
      end
      count
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require "stemmer"
|
2
|
+
|
3
|
+
module VSS
  # Turns free text into a list of normalized, stemmed, de-duplicated terms.
  class Tokenizer
    # Words that carry no search value: single letters plus common English
    # function words. They are compared against lower-cased tokens.
    STOP_WORDS = %w[
      a b c d e f g h i j k l m n o p q r s t u v w x y z
      an and are as at be by for from has he in is it its
      of on that the to was were will with upon without among
    ]

    # Tokenizes `string` (anything responding to `to_s`; nil yields []).
    #
    # Steps: strip every character that is not a letter, digit, hyphen,
    # apostrophe or whitespace; split on whitespace; lower-case; drop stop
    # words; stem each word with String#stem; drop stems that are stop
    # words; de-duplicate.
    #
    # Returns an Array of unique stemmed tokens in order of first appearance.
    def self.tokenize(string)
      stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") # remove punctuation

      # String#empty? is enough here: splitting on whitespace can only leave
      # an empty leading element, and unlike `blank?` it needs no ActiveSupport.
      words = stripped.split(/\s+/).reject(&:empty?).map(&:downcase)

      # Filter stop words before stemming, otherwise a stop word whose stem
      # differs from its listed form (a Porter stemmer, for example, reduces
      # "was" to "wa") would slip through.
      words = words.reject { |word| STOP_WORDS.include?(word) }

      # Filter again after stemming so that anything the previous
      # implementation removed is still removed.
      stems = words.map(&:stem)
      stems.reject { |stem| STOP_WORDS.include?(stem) }.uniq
    end
  end
end
|
data/lib/vss.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "vss/engine"
|
data/vss.gemspec
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for vss 0.1.0: metadata, packaged files and the runtime
# dependency on the "stemmer" gem. Setters that only exist in some RubyGems
# versions are guarded with respond_to? so the file loads everywhere.
Gem::Specification.new do |s|
  s.name = %q{vss}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Mark Dodwell"]
  s.date = %q{2010-03-10}
  s.description = %q{Simple vector space search engine}
  s.email = %q{labs@mkdynamic.co.uk}
  s.extra_rdoc_files = ["LICENSE", "README.md", "lib/vss.rb", "lib/vss/engine.rb", "lib/vss/tokenizer.rb"]
  s.files = ["LICENSE", "Manifest", "README.md", "Rakefile", "lib/vss.rb", "lib/vss/engine.rb", "lib/vss/tokenizer.rb", "vss.gemspec"]
  s.homepage = %q{http://github.com/mkdynamic/vss}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Vss", "--main", "README.md"]
  s.require_paths = ["lib"]
  # Guarded like required_rubygems_version= above, so the file still loads
  # if the running RubyGems does not provide this legacy attribute.
  s.rubyforge_project = %q{vss} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Simple vector space search engine}

  # NOTE(review): the Rakefile declares "stemmer >= 1.0.1", while the
  # requirement below pins "= 1.0.1" - confirm which one is intended.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::RubyGemsVersion is the legacy name of the RubyGems version
    # constant; use Gem::VERSION when it exists and only fall back to the
    # old name on RubyGems releases that lack it.
    rubygems_version = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(rubygems_version) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<stemmer>, [">= 0", "= 1.0.1"])
    else
      s.add_dependency(%q<stemmer>, [">= 0", "= 1.0.1"])
    end
  else
    s.add_dependency(%q<stemmer>, [">= 0", "= 1.0.1"])
  end
end
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: vss
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Mark Dodwell
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-03-10 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: stemmer
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
- - "="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.0.1
|
27
|
+
version:
|
28
|
+
description: Simple vector space search engine
|
29
|
+
email: labs@mkdynamic.co.uk
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files:
|
35
|
+
- LICENSE
|
36
|
+
- README.md
|
37
|
+
- lib/vss.rb
|
38
|
+
- lib/vss/engine.rb
|
39
|
+
- lib/vss/tokenizer.rb
|
40
|
+
files:
|
41
|
+
- LICENSE
|
42
|
+
- Manifest
|
43
|
+
- README.md
|
44
|
+
- Rakefile
|
45
|
+
- lib/vss.rb
|
46
|
+
- lib/vss/engine.rb
|
47
|
+
- lib/vss/tokenizer.rb
|
48
|
+
- vss.gemspec
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: http://github.com/mkdynamic/vss
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options:
|
55
|
+
- --line-numbers
|
56
|
+
- --inline-source
|
57
|
+
- --title
|
58
|
+
- Vss
|
59
|
+
- --main
|
60
|
+
- README.md
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
version:
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: "1.2"
|
74
|
+
version:
|
75
|
+
requirements: []
|
76
|
+
|
77
|
+
rubyforge_project: vss
|
78
|
+
rubygems_version: 1.3.5
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Simple vector space search engine
|
82
|
+
test_files: []
|
83
|
+
|