josephwilk-rsemantic 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +38 -0
- data/README.txt +48 -0
- data/Rakefile +9 -0
- data/TODO.txt +9 -0
- data/config/hoe.rb +69 -0
- data/config/requirements.rb +15 -0
- data/gem_tasks/deployment.rake +34 -0
- data/gem_tasks/environment.rake +7 -0
- data/gem_tasks/examples.rake +29 -0
- data/gem_tasks/fix_cr_lf.rake +10 -0
- data/gem_tasks/gemspec.rake +6 -0
- data/gem_tasks/rspec.rake +33 -0
- data/gem_tasks/website.rake +17 -0
- data/lib/semantic/compare.rb +19 -0
- data/lib/semantic/matrix_transformer.rb +25 -0
- data/lib/semantic/parser.rb +40 -0
- data/lib/semantic/search.rb +35 -0
- data/lib/semantic/transform/lsa_transform.rb +34 -0
- data/lib/semantic/transform/tf_idf_transform.rb +42 -0
- data/lib/semantic/transform.rb +1 -0
- data/lib/semantic/vector_space/builder.rb +69 -0
- data/lib/semantic/vector_space/model.rb +45 -0
- data/lib/semantic/vector_space.rb +1 -0
- data/lib/semantic/version.rb +9 -0
- data/lib/semantic.rb +29 -0
- data/resources/english.stop +571 -0
- data/rsemantic.gemspec +41 -0
- data/spec/semantic/compare_spec.rb +16 -0
- data/spec/semantic/matrix_transformer_spec.rb +51 -0
- data/spec/semantic/parser_spec.rb +34 -0
- data/spec/semantic/search_spec.rb +93 -0
- data/spec/semantic/transform/lsa_transform_spec.rb +59 -0
- data/spec/semantic/transform/tf_idf_transform_spec.rb +35 -0
- data/spec/semantic/vector_space/builder_spec.rb +44 -0
- data/spec/semantic/vector_space/model_spec.rb +22 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +7 -0
- metadata +130 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
module Semantic
  # Examples for Semantic::Search. The collaborators (vector space builder,
  # matrix transformer, cosine comparison) are replaced with RSpec mocks and
  # stubs in the examples below.
  describe Search do

    # Sample corpus captured by the example blocks below.
    corpus = [
      "The cat in the hat disabled",
      "A cat is a fine pet ponies.",
      "Dogs and cats make good pets.",
      "I haven't got a hat."
    ]

    # Memoized mock standing in for VectorSpace::Builder.
    def mock_builder
      @builder ||= mock(VectorSpace::Builder)
    end

    # Memoized mock standing in for MatrixTransformer.
    def mock_matrix_transformer
      @matrix_transformer ||= mock(MatrixTransformer)
    end

    # Memoized query vector built from the single column [1, 0].
    def query_vector
      @query_vector ||= Linalg::DMatrix.columns([[1,0]])
    end

    # Memoized 2x2 model built from the rows [0, 1] and [1, 0] with an empty
    # second argument. The +stubs+ argument is accepted but not used.
    def vector_space_model(stubs = {})
      @vector_space_model ||= begin
        rows = Linalg::DMatrix.rows([[0,1],[1,0]])
        VectorSpace::Model.new(rows, [])
      end
    end

    # Builds a Linalg::DMatrix from an array of rows.
    def matrix(rows)
      Linalg::DMatrix.rows(rows)
    end

    # Builds a single-row matrix from a flat array of values.
    def vector(values)
      matrix([values])
    end

    # Makes VectorSpace::Builder.new hand back the mock builder.
    def stub_builder_creation
      VectorSpace::Builder.stub!(:new).and_return(mock_builder)
    end

    # Makes MatrixTransformer.new hand back the mock transformer.
    def stub_transformer_creation
      MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
    end

    describe "setting up" do

      it "should build the vector space" do
        stub_builder_creation
        mock_builder.
          should_receive(:build_document_matrix).
          with(['test']).
          and_return(vector_space_model)

        Search.new(['test'])
      end

      it "should transform matrices" do
        stub_transformer_creation
        stub_builder_creation
        mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)

        # FIXME: `with` will not match vector_space_model (RSpec requests
        # class Data); this is thought to be related to Delegate and RSpec,
        # so `anything` is matched instead.
        mock_matrix_transformer.
          should_receive(:apply_transforms).
          with(anything).
          and_return(vector_space_model)

        Search.new(['test'])
      end

    end

    describe "searching" do

      it "should map search term to vector space" do
        stub_builder_creation
        mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)

        mock_builder.
          should_receive(:build_query_vector).
          with("cat").
          and_return(query_vector)

        Search.new(corpus).search("cat")
      end

      it "should compare the documents using cosine" do
        pending
      end

    end

    describe "relating" do

      it "should find related documents by comparing cosine" do
        stub_builder_creation
        mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)

        stub_transformer_creation
        mock_matrix_transformer.stub!(:apply_transforms).and_return(vector_space_model)

        # The first column of the stubbed model is expected to be compared
        # against itself and against the second column.
        Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[0],[1]]))
        Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[1],[0]]))

        Search.new(corpus).related(0)
      end

    end

  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
module Semantic
  # Examples for Transform::LSA.transform.
  describe Transform::LSA do

    # 3x2 matrix used by the examples that run the real transform.
    tiny_matrix = Linalg::DMatrix.columns([
      [0.0, 1.0, 0.0],
      [1.0, 0.0, 1.0]
    ])

    # Canned decomposition parts returned by the stubbed
    # singular_value_decomposition in the first two examples.
    u_part = Linalg::DMatrix.rows([
      [1,0],
      [0,1]
    ])

    vt_part = Linalg::DMatrix.rows([
      [1,0,0],
      [1,0,0],
      [1,0,0]
    ])

    sigma_part = Linalg::DMatrix.rows([
      [1,0,0],
      [0,1,0]
    ])

    describe "latent semantic analysis transform" do

      it "should use svd on matrix" do
        input = Linalg::DMatrix.columns([
          [0.0, 1.0, 0.0],
          [1.0, 0.0, 1.0]
        ])
        decomposition = [u_part, sigma_part, vt_part]

        input.should_receive(:singular_value_decomposition).and_return(decomposition)

        # Any later DMatrix.columns call hands back the instrumented matrix.
        Linalg::DMatrix.stub!(:columns).and_return(input)

        Transform::LSA.transform(input)
      end

      it "should reduce the noise in the sigma matrix" do
        input = Linalg::DMatrix.columns([
          [0.0, 1.0, 0.0],
          [1.0, 0.0, 1.0]
        ])
        decomposition = [u_part, sigma_part, vt_part]

        input.stub!(:singular_value_decomposition).and_return(decomposition)
        Linalg::DMatrix.stub!(:columns).and_return(input)

        # Both diagonal entries of sigma are expected to be zeroed.
        sigma_part.should_receive(:[]=).with(0,0,0)
        sigma_part.should_receive(:[]=).with(1,1,0)

        Transform::LSA.transform(input, 2)
      end

      it "should prevent reducing dimensions greater than the matrixes own dimensions" do
        # NOTE(review): Exception is a very broad match; the concrete error
        # class raised by the transform is not visible here -- confirm and narrow.
        oversized_reduction = lambda { Transform::LSA.transform(tiny_matrix, 100) }

        oversized_reduction.should raise_error(Exception)
      end

      it "should transform LSA matrix" do
        result = Transform::LSA.transform(tiny_matrix)
        expected = Linalg::DMatrix.columns([[0,0,0],[1,0,1]])

        # TODO: find a better way to compare the result matrix than via strings.
        result.to_s.should == expected.to_s
      end

    end

  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
module Semantic
  # Examples for Transform::TFIDF.transform.
  describe Transform::TFIDF do

    # Builds a Linalg::DMatrix from an array of rows.
    def matrix(rows)
      Linalg::DMatrix.rows(rows)
    end

    describe "term frequency / inverse document frequency transform" do

      it "should find the number of times each term occurs" do
        Transform::TFIDF.
          should_receive(:number_of_documents_with_term).
          with(0, matrix([[1]])).
          and_return(2)

        Transform::TFIDF.transform(matrix([[1]]))
      end

      it "should ignore counting terms with 0 weighting" do
        Transform::TFIDF.should_not_receive(:number_of_documents_with_term)

        Transform::TFIDF.transform(matrix([[0,0],[0,0]]))
      end

      it "should calculate term frequency * inverse document frequency" do
        transformed_matrix = Transform::TFIDF.transform(matrix([[1,1],[0,1]]))
        expected_matrix = Linalg::DMatrix.columns([[0, 0],[0, 0.346574]])

        # Compared through the string rendering rather than matrix equality.
        transformed_matrix.to_s.should == expected_matrix.to_s
      end

    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
module Semantic
  module VectorSpace
    # Examples for VectorSpace::Builder#build_query_vector.
    describe Builder do

      # Memoized mock named "Parser"; not referenced by the examples below.
      def mock_parser
        @parser ||= mock("Parser")
      end

      # Two sample single-word documents; not referenced by the examples below.
      def documents
        ['nipon','ichiban']
      end

      describe "building query vector" do

        it "should build vector from string" do
          subject_builder = Builder.new
          subject_builder.should_receive(:build_vector).with("query string")

          subject_builder.build_query_vector(["query","string"])
        end

        it "should generate a valid vector" do
          subject_builder = Builder.new
          subject_builder.build_document_matrix(["query string"])

          result = subject_builder.build_query_vector(["query","string"])
          expected = Linalg::DMatrix.columns([[1,1]])

          result.should == expected
        end

        it "should generate empty vector when terms are not in document matrix" do
          subject_builder = Builder.new
          subject_builder.build_document_matrix(["string"])

          result = subject_builder.build_query_vector(["not-in-document"])
          expected = Linalg::DMatrix.columns([[0]])

          result.should == expected
        end

      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
module Semantic
  module VectorSpace

    # Examples for the string rendering of VectorSpace::Model.
    describe Model do

      it "should output a DMatrix as a pretty string" do
        values = Linalg::DMatrix.columns([[0.11111,0.66666],[0.33333, 0.001]])
        rendered = Model.new(values, {}).to_s

        rendered.should include("[ +0.11 +0.33 ]\n[ +0.67 +0.00 ]\n")
      end

      it "should output keywords for the matrix rows" do
        values = Linalg::DMatrix.columns([[0]])
        rendered = Model.new(values, {'shiva' => 0}).to_s

        rendered.should include("shiva [ +0.00 ]")
      end

    end
  end
end
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: josephwilk-rsemantic
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Joseph Wilk
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-11-13 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: term-ansicolor
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.0.3
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: rspec
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 1.1.5
|
32
|
+
version:
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: diff-lcs
|
35
|
+
version_requirement:
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.1.2
|
41
|
+
version:
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: hoe
|
44
|
+
version_requirement:
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.8.2
|
50
|
+
version:
|
51
|
+
description: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
|
52
|
+
email:
|
53
|
+
- josephwilk@joesniff.co.uk
|
54
|
+
executables: []
|
55
|
+
|
56
|
+
extensions: []
|
57
|
+
|
58
|
+
extra_rdoc_files:
|
59
|
+
- History.txt
|
60
|
+
- Manifest.txt
|
61
|
+
- README.txt
|
62
|
+
- TODO.txt
|
63
|
+
files:
|
64
|
+
- History.txt
|
65
|
+
- Manifest.txt
|
66
|
+
- README.txt
|
67
|
+
- Rakefile
|
68
|
+
- TODO.txt
|
69
|
+
- config/hoe.rb
|
70
|
+
- config/requirements.rb
|
71
|
+
- gem_tasks/deployment.rake
|
72
|
+
- gem_tasks/environment.rake
|
73
|
+
- gem_tasks/examples.rake
|
74
|
+
- gem_tasks/fix_cr_lf.rake
|
75
|
+
- gem_tasks/gemspec.rake
|
76
|
+
- gem_tasks/rspec.rake
|
77
|
+
- gem_tasks/website.rake
|
78
|
+
- lib/semantic.rb
|
79
|
+
- lib/semantic/compare.rb
|
80
|
+
- lib/semantic/matrix_transformer.rb
|
81
|
+
- lib/semantic/parser.rb
|
82
|
+
- lib/semantic/search.rb
|
83
|
+
- lib/semantic/transform.rb
|
84
|
+
- lib/semantic/transform/lsa_transform.rb
|
85
|
+
- lib/semantic/transform/tf_idf_transform.rb
|
86
|
+
- lib/semantic/vector_space.rb
|
87
|
+
- lib/semantic/vector_space/builder.rb
|
88
|
+
- lib/semantic/vector_space/model.rb
|
89
|
+
- lib/semantic/version.rb
|
90
|
+
- resources/english.stop
|
91
|
+
- rsemantic.gemspec
|
92
|
+
- spec/semantic/compare_spec.rb
|
93
|
+
- spec/semantic/matrix_transformer_spec.rb
|
94
|
+
- spec/semantic/parser_spec.rb
|
95
|
+
- spec/semantic/search_spec.rb
|
96
|
+
- spec/semantic/transform/lsa_transform_spec.rb
|
97
|
+
- spec/semantic/transform/tf_idf_transform_spec.rb
|
98
|
+
- spec/semantic/vector_space/builder_spec.rb
|
99
|
+
- spec/semantic/vector_space/model_spec.rb
|
100
|
+
- spec/spec.opts
|
101
|
+
- spec/spec_helper.rb
|
102
|
+
has_rdoc: true
|
103
|
+
homepage: http://github.com/josephwilk/rsemantic
|
104
|
+
post_install_message:
|
105
|
+
rdoc_options:
|
106
|
+
- --main
|
107
|
+
- README.txt
|
108
|
+
require_paths:
|
109
|
+
- lib
|
110
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - ">="
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: "0"
|
115
|
+
version:
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
+
requirements:
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: "0"
|
121
|
+
version:
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project: rsemantic
|
125
|
+
rubygems_version: 1.2.0
|
126
|
+
signing_key:
|
127
|
+
specification_version: 2
|
128
|
+
summary: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
|
129
|
+
test_files: []
|
130
|
+
|