vss 0.1.4 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +2 -0
- data/LICENSE +2 -2
- data/README.md +5 -4
- data/Rakefile +4 -9
- data/lib/vss/engine.rb +12 -18
- data/lib/vss/tokenizer.rb +6 -7
- data/lib/vss/version.rb +3 -0
- data/lib/vss.rb +2 -1
- data/test/{search_test.rb → test.rb} +18 -5
- data/vss.gemspec +19 -33
- metadata +68 -37
- data/Manifest +0 -8
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright (c)
|
1
|
+
Copyright (c) 2011 Mark Dodwell, mkdynamic
|
2
2
|
|
3
3
|
Permission is hereby granted, free of charge, to any person obtaining
|
4
4
|
a copy of this software and associated documentation files (the
|
@@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
17
17
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
18
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
19
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -4,7 +4,9 @@ A simple vector space search engine with **tf*idf** ranking.
|
|
4
4
|
|
5
5
|
[More info, and details of how it works.](http://madeofcode.com/posts/69-vss-a-vector-space-search-engine-in-ruby)
|
6
6
|
|
7
|
-
|
7
|
+
## Requirements
|
8
|
+
|
9
|
+
Ruby >= 1.8.7. Tested in (MRI 1.8.7 + 1.9.2).
|
8
10
|
|
9
11
|
## Install
|
10
12
|
|
@@ -23,7 +25,7 @@ To perform a search on a collection of documents:
|
|
23
25
|
|
24
26
|
## Rails/ActiveRecord
|
25
27
|
|
26
|
-
If you want to search a collection of `ActiveRecord` objects, you need to pass a **documentizer** `
|
28
|
+
If you want to search a collection of `ActiveRecord` objects, you need to pass a **documentizer** `Proc` when initializing `VSS::Engine` which will convert the objects into documents (which are simply strings). For example:
|
27
29
|
|
28
30
|
class Page < ActiveRecord::Base
|
29
31
|
#attrs: title, content
|
@@ -41,5 +43,4 @@ This isn't designed to be used on huge collections of records. The original use
|
|
41
43
|
|
42
44
|
Heavily inspired by [Joesph Wilk's article on building a vector space search engine in Python](http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html).
|
43
45
|
|
44
|
-
Written by Mark Dodwell
|
45
|
-
([Design & Code](http://madeofcode.com))
|
46
|
+
Written by Mark Dodwell ([@madeofcode](http://twitter.com/madeofcode))
|
data/Rakefile
CHANGED
@@ -1,9 +1,4 @@
|
|
1
|
-
require "
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
p.url = "http://github.com/mkdynamic/vss"
|
6
|
-
p.author = "Mark Dodwell"
|
7
|
-
p.email = "labs@mkdynamic.co.uk"
|
8
|
-
p.runtime_dependencies = ["stemmer >=1.0.1", "activesupport >=3.0.0"]
|
9
|
-
end
|
1
|
+
require "rubygems"
|
2
|
+
require "bundler/setup"
|
3
|
+
|
4
|
+
Bundler::GemHelper.install_tasks
|
data/lib/vss/engine.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'matrix'
|
2
|
+
require 'vss/tokenizer'
|
3
3
|
|
4
4
|
module VSS
|
5
5
|
class Engine
|
@@ -32,7 +32,13 @@ module VSS
|
|
32
32
|
|
33
33
|
# ranks from 0 to 100
|
34
34
|
def cosine_rank(vector1, vector2)
|
35
|
-
cosine(vector1, vector2)
|
35
|
+
cos = cosine(vector1, vector2)
|
36
|
+
|
37
|
+
if cos > 0
|
38
|
+
cos / 1 * 100
|
39
|
+
else
|
40
|
+
0
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
# see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
|
@@ -67,12 +73,11 @@ module VSS
|
|
67
73
|
end
|
68
74
|
|
69
75
|
def tf(token, tokens)
|
70
|
-
|
76
|
+
tokens.count { |t| t == token }
|
71
77
|
end
|
72
78
|
|
73
79
|
def idf(token, docs)
|
74
|
-
|
75
|
-
docs.size / docs_with_token_count
|
80
|
+
docs.size / docs.count { |d| tokenize(d).include?(token) }
|
76
81
|
end
|
77
82
|
|
78
83
|
# http://en.wikipedia.org/wiki/Tf-idf
|
@@ -97,16 +102,5 @@ module VSS
|
|
97
102
|
@tokenize_cache ||= {}
|
98
103
|
@tokenize_cache[string] ||= Tokenizer.tokenize(string)
|
99
104
|
end
|
100
|
-
|
101
|
-
# could use Array#count, but only for Ruby 1.8.7 >=
|
102
|
-
def count_in_array(array, item)
|
103
|
-
count = 0
|
104
|
-
if item.is_a? Proc
|
105
|
-
array.each { |i| count += 1 if item.call(i) }
|
106
|
-
else
|
107
|
-
array.each { |i| count += 1 if i == item }
|
108
|
-
end
|
109
|
-
count
|
110
|
-
end
|
111
105
|
end
|
112
|
-
end
|
106
|
+
end
|
data/lib/vss/tokenizer.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
require
|
2
|
-
require "active_support/core_ext"
|
1
|
+
require 'stemmer'
|
3
2
|
|
4
3
|
module VSS
|
5
4
|
class Tokenizer
|
@@ -7,12 +6,12 @@ module VSS
|
|
7
6
|
a b c d e f g h i j k l m n o p q r s t u v w x y z
|
8
7
|
an and are as at be by for from has he in is it its
|
9
8
|
of on that the to was were will with upon without among
|
10
|
-
]
|
9
|
+
].inject({}) { |h,v| h[v] = true; h }
|
11
10
|
|
12
11
|
def self.tokenize(string)
|
13
|
-
stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") #
|
14
|
-
words = stripped.split(/\s+/).reject(
|
15
|
-
words.reject { |word| STOP_WORDS.
|
12
|
+
stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") # removes punctuation
|
13
|
+
words = stripped.split(/\s+/).reject { |word| word.match(/^\s*$/) }.map(&:downcase).map(&:stem)
|
14
|
+
words.reject { |word| STOP_WORDS.key?(word) }.uniq
|
16
15
|
end
|
17
16
|
end
|
18
|
-
end
|
17
|
+
end
|
data/lib/vss/version.rb
ADDED
data/lib/vss.rb
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
require
|
1
|
+
require 'vss/version'
|
2
|
+
require 'vss/engine'
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'vss'
|
3
3
|
|
4
|
-
class
|
4
|
+
class VSSTest < Test::Unit::TestCase
|
5
5
|
def setup
|
6
6
|
@doc1 = "I'm not even going to mention any TV series."
|
7
7
|
@doc2 = "The Wire is the best thing ever. Fact."
|
@@ -27,9 +27,22 @@ class SearchTest < Test::Unit::TestCase
|
|
27
27
|
|
28
28
|
def test_ranking
|
29
29
|
results = @engine.search("How can you compare The Wire with Lost?")
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
30
|
+
assert_similar_float 82.1781, results[0].rank
|
31
|
+
assert_similar_float 3.08166, results[1].rank
|
32
|
+
assert_similar_float 1.37986, results[2].rank
|
33
|
+
assert_similar_float 0.87530, results[3].rank
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_no_match
|
37
|
+
results = @engine.search("Zebra funnels cash")
|
38
|
+
assert_equal 0, results.size
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def assert_similar_float(expected, actual, msg = nil)
|
44
|
+
assert Float === expected, "not a Float"
|
45
|
+
sig_figs = [10, actual.to_s.size, expected.to_s.size].min - 1
|
46
|
+
assert_equal expected.to_s[0, sig_figs], actual.to_s[0, sig_figs], msg
|
34
47
|
end
|
35
48
|
end
|
data/vss.gemspec
CHANGED
@@ -1,36 +1,22 @@
|
|
1
|
-
|
1
|
+
require File.expand_path('../lib/vss/version', __FILE__)
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
|
-
s.name
|
5
|
-
s.version
|
6
|
-
|
7
|
-
s.
|
8
|
-
s.
|
9
|
-
s.
|
10
|
-
s.
|
11
|
-
s.
|
12
|
-
|
13
|
-
s.
|
14
|
-
s.
|
15
|
-
s.
|
16
|
-
s.
|
17
|
-
|
18
|
-
s.
|
19
|
-
s.
|
20
|
-
s.
|
21
|
-
|
22
|
-
if s.respond_to? :specification_version then
|
23
|
-
s.specification_version = 3
|
24
|
-
|
25
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
26
|
-
s.add_runtime_dependency(%q<stemmer>, [">= 1.0.1"])
|
27
|
-
s.add_runtime_dependency(%q<activesupport>, [">= 3.0.0"])
|
28
|
-
else
|
29
|
-
s.add_dependency(%q<stemmer>, [">= 1.0.1"])
|
30
|
-
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
31
|
-
end
|
32
|
-
else
|
33
|
-
s.add_dependency(%q<stemmer>, [">= 1.0.1"])
|
34
|
-
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
35
|
-
end
|
4
|
+
s.name = "vss"
|
5
|
+
s.version = VSS::VERSION
|
6
|
+
s.platform = Gem::Platform::RUBY
|
7
|
+
s.authors = ["Mark Dodwell"]
|
8
|
+
s.email = ["labs@mkdynamic.co.uk"]
|
9
|
+
s.homepage = "https://github.com/mkdynamic/vss"
|
10
|
+
s.summary = "Vector Space Search"
|
11
|
+
s.description = "A simple vector space search engine with tf*idf ranking."
|
12
|
+
|
13
|
+
s.required_ruby_version = ">= 1.8.7"
|
14
|
+
s.add_development_dependency "bundler"
|
15
|
+
s.add_development_dependency "rake", "0.8.7"
|
16
|
+
s.add_runtime_dependency "stemmer"
|
17
|
+
|
18
|
+
s.files = `git ls-files`.split("\n")
|
19
|
+
s.test_files = `git ls-files -- test/*`.split("\n")
|
20
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
21
|
+
s.require_paths = ["lib"]
|
36
22
|
end
|
metadata
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
hash: 11
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 8
|
10
|
+
version: 0.1.8
|
6
11
|
platform: ruby
|
7
12
|
authors:
|
8
13
|
- Mark Dodwell
|
@@ -10,63 +15,81 @@ autorequire:
|
|
10
15
|
bindir: bin
|
11
16
|
cert_chain: []
|
12
17
|
|
13
|
-
date: 2011-
|
18
|
+
date: 2011-07-05 00:00:00 -07:00
|
19
|
+
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
|
-
|
17
|
-
prerelease: false
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
19
23
|
none: false
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
23
|
-
|
24
|
-
|
25
|
-
|
27
|
+
hash: 3
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
requirement: *id001
|
32
|
+
type: :development
|
33
|
+
name: bundler
|
34
|
+
prerelease: false
|
26
35
|
- !ruby/object:Gem::Dependency
|
27
|
-
|
36
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - "="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
hash: 49
|
42
|
+
segments:
|
43
|
+
- 0
|
44
|
+
- 8
|
45
|
+
- 7
|
46
|
+
version: 0.8.7
|
47
|
+
requirement: *id002
|
48
|
+
type: :development
|
49
|
+
name: rake
|
28
50
|
prerelease: false
|
29
|
-
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
30
53
|
none: false
|
31
54
|
requirements:
|
32
55
|
- - ">="
|
33
56
|
- !ruby/object:Gem::Version
|
34
|
-
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
requirement: *id003
|
35
62
|
type: :runtime
|
36
|
-
|
37
|
-
|
38
|
-
|
63
|
+
name: stemmer
|
64
|
+
prerelease: false
|
65
|
+
description: A simple vector space search engine with tf*idf ranking.
|
66
|
+
email:
|
67
|
+
- labs@mkdynamic.co.uk
|
39
68
|
executables: []
|
40
69
|
|
41
70
|
extensions: []
|
42
71
|
|
43
|
-
extra_rdoc_files:
|
44
|
-
|
45
|
-
- README.md
|
46
|
-
- lib/vss.rb
|
47
|
-
- lib/vss/engine.rb
|
48
|
-
- lib/vss/tokenizer.rb
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
49
74
|
files:
|
75
|
+
- .gitignore
|
76
|
+
- Gemfile
|
50
77
|
- LICENSE
|
51
78
|
- README.md
|
52
79
|
- Rakefile
|
53
80
|
- lib/vss.rb
|
54
81
|
- lib/vss/engine.rb
|
55
82
|
- lib/vss/tokenizer.rb
|
56
|
-
-
|
57
|
-
-
|
83
|
+
- lib/vss/version.rb
|
84
|
+
- test/test.rb
|
58
85
|
- vss.gemspec
|
59
|
-
|
86
|
+
has_rdoc: true
|
87
|
+
homepage: https://github.com/mkdynamic/vss
|
60
88
|
licenses: []
|
61
89
|
|
62
90
|
post_install_message:
|
63
|
-
rdoc_options:
|
64
|
-
|
65
|
-
- --inline-source
|
66
|
-
- --title
|
67
|
-
- Vss
|
68
|
-
- --main
|
69
|
-
- README.md
|
91
|
+
rdoc_options: []
|
92
|
+
|
70
93
|
require_paths:
|
71
94
|
- lib
|
72
95
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -74,19 +97,27 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
74
97
|
requirements:
|
75
98
|
- - ">="
|
76
99
|
- !ruby/object:Gem::Version
|
77
|
-
|
100
|
+
hash: 57
|
101
|
+
segments:
|
102
|
+
- 1
|
103
|
+
- 8
|
104
|
+
- 7
|
105
|
+
version: 1.8.7
|
78
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
107
|
none: false
|
80
108
|
requirements:
|
81
109
|
- - ">="
|
82
110
|
- !ruby/object:Gem::Version
|
83
|
-
|
111
|
+
hash: 3
|
112
|
+
segments:
|
113
|
+
- 0
|
114
|
+
version: "0"
|
84
115
|
requirements: []
|
85
116
|
|
86
|
-
rubyforge_project:
|
87
|
-
rubygems_version: 1.
|
117
|
+
rubyforge_project:
|
118
|
+
rubygems_version: 1.3.7
|
88
119
|
signing_key:
|
89
120
|
specification_version: 3
|
90
|
-
summary:
|
121
|
+
summary: Vector Space Search
|
91
122
|
test_files:
|
92
|
-
- test/
|
123
|
+
- test/test.rb
|