vss 0.1.4 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +2 -0
- data/LICENSE +2 -2
- data/README.md +5 -4
- data/Rakefile +4 -9
- data/lib/vss/engine.rb +12 -18
- data/lib/vss/tokenizer.rb +6 -7
- data/lib/vss/version.rb +3 -0
- data/lib/vss.rb +2 -1
- data/test/{search_test.rb → test.rb} +18 -5
- data/vss.gemspec +19 -33
- metadata +68 -37
- data/Manifest +0 -8
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright (c)
|
1
|
+
Copyright (c) 2011 Mark Dodwell, mkdynamic
|
2
2
|
|
3
3
|
Permission is hereby granted, free of charge, to any person obtaining
|
4
4
|
a copy of this software and associated documentation files (the
|
@@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
17
17
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
18
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
19
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -4,7 +4,9 @@ A simple vector space search engine with **tf*idf** ranking.
|
|
4
4
|
|
5
5
|
[More info, and details of how it works.](http://madeofcode.com/posts/69-vss-a-vector-space-search-engine-in-ruby)
|
6
6
|
|
7
|
-
|
7
|
+
## Requirements
|
8
|
+
|
9
|
+
Ruby >= 1.8.7. Tested in (MRI 1.8.7 + 1.9.2).
|
8
10
|
|
9
11
|
## Install
|
10
12
|
|
@@ -23,7 +25,7 @@ To perform a search on a collection of documents:
|
|
23
25
|
|
24
26
|
## Rails/ActiveRecord
|
25
27
|
|
26
|
-
If you want to search a collection of `ActiveRecord` objects, you need to pass a **documentizer** `
|
28
|
+
If you want to search a collection of `ActiveRecord` objects, you need to pass a **documentizer** `Proc` when initializing `VSS::Engine` which will convert the objects into documents (which are simply strings). For example:
|
27
29
|
|
28
30
|
class Page < ActiveRecord::Base
|
29
31
|
#attrs: title, content
|
@@ -41,5 +43,4 @@ This isn't designed to be used on huge collections of records. The original use
|
|
41
43
|
|
42
44
|
Heavily inspired by [Joseph Wilk's article on building a vector space search engine in Python](http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html).
|
43
45
|
|
44
|
-
Written by Mark Dodwell
|
45
|
-
([Design & Code](http://madeofcode.com))
|
46
|
+
Written by Mark Dodwell ([@madeofcode](http://twitter.com/madeofcode))
|
data/Rakefile
CHANGED
@@ -1,9 +1,4 @@
|
|
1
|
-
require "
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
p.url = "http://github.com/mkdynamic/vss"
|
6
|
-
p.author = "Mark Dodwell"
|
7
|
-
p.email = "labs@mkdynamic.co.uk"
|
8
|
-
p.runtime_dependencies = ["stemmer >=1.0.1", "activesupport >=3.0.0"]
|
9
|
-
end
|
1
|
+
require "rubygems"
|
2
|
+
require "bundler/setup"
|
3
|
+
|
4
|
+
Bundler::GemHelper.install_tasks
|
data/lib/vss/engine.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'matrix'
|
2
|
+
require 'vss/tokenizer'
|
3
3
|
|
4
4
|
module VSS
|
5
5
|
class Engine
|
@@ -32,7 +32,13 @@ module VSS
|
|
32
32
|
|
33
33
|
# ranks from 0 to 100
|
34
34
|
def cosine_rank(vector1, vector2)
|
35
|
-
cosine(vector1, vector2)
|
35
|
+
cos = cosine(vector1, vector2)
|
36
|
+
|
37
|
+
if cos > 0
|
38
|
+
cos / 1 * 100
|
39
|
+
else
|
40
|
+
0
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
# see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
|
@@ -67,12 +73,11 @@ module VSS
|
|
67
73
|
end
|
68
74
|
|
69
75
|
def tf(token, tokens)
|
70
|
-
|
76
|
+
tokens.count { |t| t == token }
|
71
77
|
end
|
72
78
|
|
73
79
|
def idf(token, docs)
|
74
|
-
|
75
|
-
docs.size / docs_with_token_count
|
80
|
+
docs.size / docs.count { |d| tokenize(d).include?(token) }
|
76
81
|
end
|
77
82
|
|
78
83
|
# http://en.wikipedia.org/wiki/Tf-idf
|
@@ -97,16 +102,5 @@ module VSS
|
|
97
102
|
@tokenize_cache ||= {}
|
98
103
|
@tokenize_cache[string] ||= Tokenizer.tokenize(string)
|
99
104
|
end
|
100
|
-
|
101
|
-
# could use Array#count, but only for Ruby 1.8.7 >=
|
102
|
-
def count_in_array(array, item)
|
103
|
-
count = 0
|
104
|
-
if item.is_a? Proc
|
105
|
-
array.each { |i| count += 1 if item.call(i) }
|
106
|
-
else
|
107
|
-
array.each { |i| count += 1 if i == item }
|
108
|
-
end
|
109
|
-
count
|
110
|
-
end
|
111
105
|
end
|
112
|
-
end
|
106
|
+
end
|
data/lib/vss/tokenizer.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
require
|
2
|
-
require "active_support/core_ext"
|
1
|
+
require 'stemmer'
|
3
2
|
|
4
3
|
module VSS
|
5
4
|
class Tokenizer
|
@@ -7,12 +6,12 @@ module VSS
|
|
7
6
|
a b c d e f g h i j k l m n o p q r s t u v w x y z
|
8
7
|
an and are as at be by for from has he in is it its
|
9
8
|
of on that the to was were will with upon without among
|
10
|
-
]
|
9
|
+
].inject({}) { |h,v| h[v] = true; h }
|
11
10
|
|
12
11
|
def self.tokenize(string)
|
13
|
-
stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") #
|
14
|
-
words = stripped.split(/\s+/).reject(
|
15
|
-
words.reject { |word| STOP_WORDS.
|
12
|
+
stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") # removes punctuation
|
13
|
+
words = stripped.split(/\s+/).reject { |word| word.match(/^\s*$/) }.map(&:downcase).map(&:stem)
|
14
|
+
words.reject { |word| STOP_WORDS.key?(word) }.uniq
|
16
15
|
end
|
17
16
|
end
|
18
|
-
end
|
17
|
+
end
|
data/lib/vss/version.rb
ADDED
data/lib/vss.rb
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
require
|
1
|
+
require 'vss/version'
|
2
|
+
require 'vss/engine'
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'vss'
|
3
3
|
|
4
|
-
class
|
4
|
+
class VSSTest < Test::Unit::TestCase
|
5
5
|
def setup
|
6
6
|
@doc1 = "I'm not even going to mention any TV series."
|
7
7
|
@doc2 = "The Wire is the best thing ever. Fact."
|
@@ -27,9 +27,22 @@ class SearchTest < Test::Unit::TestCase
|
|
27
27
|
|
28
28
|
def test_ranking
|
29
29
|
results = @engine.search("How can you compare The Wire with Lost?")
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
30
|
+
assert_similar_float 82.1781, results[0].rank
|
31
|
+
assert_similar_float 3.08166, results[1].rank
|
32
|
+
assert_similar_float 1.37986, results[2].rank
|
33
|
+
assert_similar_float 0.87530, results[3].rank
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_no_match
|
37
|
+
results = @engine.search("Zebra funnels cash")
|
38
|
+
assert_equal 0, results.size
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def assert_similar_float(expected, actual, msg = nil)
|
44
|
+
assert Float === expected, "not a Float"
|
45
|
+
sig_figs = [10, actual.to_s.size, expected.to_s.size].min - 1
|
46
|
+
assert_equal expected.to_s[0, sig_figs], actual.to_s[0, sig_figs], msg
|
34
47
|
end
|
35
48
|
end
|
data/vss.gemspec
CHANGED
@@ -1,36 +1,22 @@
|
|
1
|
-
|
1
|
+
require File.expand_path('../lib/vss/version', __FILE__)
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
|
-
s.name
|
5
|
-
s.version
|
6
|
-
|
7
|
-
s.
|
8
|
-
s.
|
9
|
-
s.
|
10
|
-
s.
|
11
|
-
s.
|
12
|
-
|
13
|
-
s.
|
14
|
-
s.
|
15
|
-
s.
|
16
|
-
s.
|
17
|
-
|
18
|
-
s.
|
19
|
-
s.
|
20
|
-
s.
|
21
|
-
|
22
|
-
if s.respond_to? :specification_version then
|
23
|
-
s.specification_version = 3
|
24
|
-
|
25
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
26
|
-
s.add_runtime_dependency(%q<stemmer>, [">= 1.0.1"])
|
27
|
-
s.add_runtime_dependency(%q<activesupport>, [">= 3.0.0"])
|
28
|
-
else
|
29
|
-
s.add_dependency(%q<stemmer>, [">= 1.0.1"])
|
30
|
-
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
31
|
-
end
|
32
|
-
else
|
33
|
-
s.add_dependency(%q<stemmer>, [">= 1.0.1"])
|
34
|
-
s.add_dependency(%q<activesupport>, [">= 3.0.0"])
|
35
|
-
end
|
4
|
+
s.name = "vss"
|
5
|
+
s.version = VSS::VERSION
|
6
|
+
s.platform = Gem::Platform::RUBY
|
7
|
+
s.authors = ["Mark Dodwell"]
|
8
|
+
s.email = ["labs@mkdynamic.co.uk"]
|
9
|
+
s.homepage = "https://github.com/mkdynamic/vss"
|
10
|
+
s.summary = "Vector Space Search"
|
11
|
+
s.description = "A simple vector space search engine with tf*idf ranking."
|
12
|
+
|
13
|
+
s.required_ruby_version = ">= 1.8.7"
|
14
|
+
s.add_development_dependency "bundler"
|
15
|
+
s.add_development_dependency "rake", "0.8.7"
|
16
|
+
s.add_runtime_dependency "stemmer"
|
17
|
+
|
18
|
+
s.files = `git ls-files`.split("\n")
|
19
|
+
s.test_files = `git ls-files -- test/*`.split("\n")
|
20
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
21
|
+
s.require_paths = ["lib"]
|
36
22
|
end
|
metadata
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
hash: 11
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 8
|
10
|
+
version: 0.1.8
|
6
11
|
platform: ruby
|
7
12
|
authors:
|
8
13
|
- Mark Dodwell
|
@@ -10,63 +15,81 @@ autorequire:
|
|
10
15
|
bindir: bin
|
11
16
|
cert_chain: []
|
12
17
|
|
13
|
-
date: 2011-
|
18
|
+
date: 2011-07-05 00:00:00 -07:00
|
19
|
+
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
|
-
|
17
|
-
prerelease: false
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
19
23
|
none: false
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
23
|
-
|
24
|
-
|
25
|
-
|
27
|
+
hash: 3
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
requirement: *id001
|
32
|
+
type: :development
|
33
|
+
name: bundler
|
34
|
+
prerelease: false
|
26
35
|
- !ruby/object:Gem::Dependency
|
27
|
-
|
36
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - "="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
hash: 49
|
42
|
+
segments:
|
43
|
+
- 0
|
44
|
+
- 8
|
45
|
+
- 7
|
46
|
+
version: 0.8.7
|
47
|
+
requirement: *id002
|
48
|
+
type: :development
|
49
|
+
name: rake
|
28
50
|
prerelease: false
|
29
|
-
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
30
53
|
none: false
|
31
54
|
requirements:
|
32
55
|
- - ">="
|
33
56
|
- !ruby/object:Gem::Version
|
34
|
-
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
requirement: *id003
|
35
62
|
type: :runtime
|
36
|
-
|
37
|
-
|
38
|
-
|
63
|
+
name: stemmer
|
64
|
+
prerelease: false
|
65
|
+
description: A simple vector space search engine with tf*idf ranking.
|
66
|
+
email:
|
67
|
+
- labs@mkdynamic.co.uk
|
39
68
|
executables: []
|
40
69
|
|
41
70
|
extensions: []
|
42
71
|
|
43
|
-
extra_rdoc_files:
|
44
|
-
|
45
|
-
- README.md
|
46
|
-
- lib/vss.rb
|
47
|
-
- lib/vss/engine.rb
|
48
|
-
- lib/vss/tokenizer.rb
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
49
74
|
files:
|
75
|
+
- .gitignore
|
76
|
+
- Gemfile
|
50
77
|
- LICENSE
|
51
78
|
- README.md
|
52
79
|
- Rakefile
|
53
80
|
- lib/vss.rb
|
54
81
|
- lib/vss/engine.rb
|
55
82
|
- lib/vss/tokenizer.rb
|
56
|
-
-
|
57
|
-
-
|
83
|
+
- lib/vss/version.rb
|
84
|
+
- test/test.rb
|
58
85
|
- vss.gemspec
|
59
|
-
|
86
|
+
has_rdoc: true
|
87
|
+
homepage: https://github.com/mkdynamic/vss
|
60
88
|
licenses: []
|
61
89
|
|
62
90
|
post_install_message:
|
63
|
-
rdoc_options:
|
64
|
-
|
65
|
-
- --inline-source
|
66
|
-
- --title
|
67
|
-
- Vss
|
68
|
-
- --main
|
69
|
-
- README.md
|
91
|
+
rdoc_options: []
|
92
|
+
|
70
93
|
require_paths:
|
71
94
|
- lib
|
72
95
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -74,19 +97,27 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
74
97
|
requirements:
|
75
98
|
- - ">="
|
76
99
|
- !ruby/object:Gem::Version
|
77
|
-
|
100
|
+
hash: 57
|
101
|
+
segments:
|
102
|
+
- 1
|
103
|
+
- 8
|
104
|
+
- 7
|
105
|
+
version: 1.8.7
|
78
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
107
|
none: false
|
80
108
|
requirements:
|
81
109
|
- - ">="
|
82
110
|
- !ruby/object:Gem::Version
|
83
|
-
|
111
|
+
hash: 3
|
112
|
+
segments:
|
113
|
+
- 0
|
114
|
+
version: "0"
|
84
115
|
requirements: []
|
85
116
|
|
86
|
-
rubyforge_project:
|
87
|
-
rubygems_version: 1.
|
117
|
+
rubyforge_project:
|
118
|
+
rubygems_version: 1.3.7
|
88
119
|
signing_key:
|
89
120
|
specification_version: 3
|
90
|
-
summary:
|
121
|
+
summary: Vector Space Search
|
91
122
|
test_files:
|
92
|
-
- test/
|
123
|
+
- test/test.rb
|