rsemantic 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/rsemantic.gemspec ADDED
@@ -0,0 +1,41 @@
1
# Gem specification for rsemantic 0.1.3.
#
# Capabilities of the running RubyGems are probed with respond_to? instead of
# comparing version constants, so the file does not depend on the deprecated
# Gem::RubyGemsVersion constant or on setters that a given RubyGems release
# may not provide.
Gem::Specification.new do |s|
  s.name = %q{rsemantic}
  s.version = "0.1.3"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to?(:required_rubygems_version=)
  s.authors = ["Joseph Wilk"]
  s.date = %q{2009-08-01}
  s.description = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
  s.email = ["joe@josephwilk.net"]
  s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt", "TODO.txt"]
  s.files = [
    "History.txt",
    "Manifest.txt",
    "README.txt",
    "Rakefile",
    "TODO.txt",
    "config/hoe.rb",
    "config/requirements.rb",
    "gem_tasks/deployment.rake",
    "gem_tasks/environment.rake",
    "gem_tasks/examples.rake",
    "gem_tasks/fix_cr_lf.rake",
    "gem_tasks/gemspec.rake",
    "gem_tasks/rspec.rake",
    "gem_tasks/website.rake",
    "lib/semantic.rb",
    "lib/semantic/compare.rb",
    "lib/semantic/matrix_transformer.rb",
    "lib/semantic/parser.rb",
    "lib/semantic/search.rb",
    "lib/semantic/transform.rb",
    "lib/semantic/transform/lsa_transform.rb",
    "lib/semantic/transform/tf_idf_transform.rb",
    "lib/semantic/vector_space.rb",
    "lib/semantic/vector_space/builder.rb",
    "lib/semantic/vector_space/model.rb",
    "lib/semantic/version.rb",
    "resources/english.stop",
    "rsemantic.gemspec",
    "spec/semantic/compare_spec.rb",
    "spec/semantic/matrix_transformer_spec.rb",
    "spec/semantic/parser_spec.rb",
    "spec/semantic/search_spec.rb",
    "spec/semantic/transform/lsa_transform_spec.rb",
    "spec/semantic/transform/tf_idf_transform_spec.rb",
    "spec/semantic/vector_space/builder_spec.rb",
    "spec/semantic/vector_space/model_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  # Legacy setter: only call it when the running RubyGems still offers it.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.homepage = %q{http://github.com/josephwilk/rsemantic}
  s.rdoc_options = ["--main", "README.txt"]
  s.require_paths = ["lib"]
  # Legacy setter: only call it when the running RubyGems still offers it.
  s.rubyforge_project = %q{rsemantic} if s.respond_to?(:rubyforge_project=)
  s.rubygems_version = %q{1.3.1}
  s.summary = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}

  s.specification_version = 2 if s.respond_to?(:specification_version)

  # NOTE(review): rspec and diff-lcs are declared as runtime dependencies;
  # they look like test-only tools - confirm before changing the lists.
  runtime_dependencies = [
    [%q<term-ansicolor>, [">= 1.0.3"]],
    [%q<rspec>, [">= 1.1.5"]],
    [%q<diff-lcs>, [">= 1.1.2"]]
  ]
  development_dependencies = [
    [%q<hoe>, [">= 2.3.2"]]
  ]

  # Typed (runtime vs. development) dependencies are used when the
  # specification supports them; otherwise every entry is registered through
  # the plain add_dependency call, as the original fallback branches did.
  if s.respond_to?(:add_runtime_dependency) && s.respond_to?(:add_development_dependency)
    runtime_dependencies.each { |name, requirement| s.add_runtime_dependency(name, requirement) }
    development_dependencies.each { |name, requirement| s.add_development_dependency(name, requirement) }
  else
    (runtime_dependencies + development_dependencies).each do |name, requirement|
      s.add_dependency(name, requirement)
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Spec for Compare.cosine.
  describe Compare do

    # Builds a Linalg::DMatrix from a single column holding +values+.
    def vector(values)
      columns = [values]
      Linalg::DMatrix.columns(columns)
    end

    it "should calculate cosine" do
      first = vector([0.1, 0.5])
      second = vector([0.9, 0.3])

      # Expected value is checked to within a tolerance of 0.0001.
      Compare.cosine(first, second).should be_close(0.4961, 0.0001)
    end

  end
end
@@ -0,0 +1,51 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Specs for MatrixTransformer#apply_transforms. Transform classes are
  # replaced by a shared mock so no real matrix transformation is performed.
  describe MatrixTransformer do

    # Memoised stand-in for a transform class, shared within one example.
    def mock_transform
      @transform ||= mock(Transform)
    end

    # Fresh mock vector space that exposes a 2x2 matrix and accepts
    # assignment to it.
    def mock_vector_space
      mock("vector space", :matrix => Linalg::DMatrix.rows([[1,0],[0,1]]), :matrix= => nil )
    end

    describe "transforming matrix" do

      it "should ignore invalid transform class" do
        matrix_transformer = MatrixTransformer.new(:transforms => [:FAKE])
        lambda {
          matrix_transformer.apply_transforms(mock_vector_space)
        }.should_not raise_error
      end

      it "should use default transforms if none are specified" do
        matrix_transformer = MatrixTransformer.new
        Transform.should_receive(:const_get).with(:LSA).and_return(mock_transform)
        Transform.should_receive(:const_get).with(:TFIDF).and_return(mock_transform)

        matrix_transformer.apply_transforms(mock_vector_space)
      end

      it "should send transform message to class to transform matrix" do
        matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
        Transform.stub!(:const_get).and_return(mock_transform)

        mock_transform.should_receive(:transform)

        matrix_transformer.apply_transforms(mock_vector_space)
      end

      it "should check that transform class is capable of transforming" do
        matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
        Transform.stub!(:const_get).and_return(mock_transform)
        mock_transform.should_receive(:respond_to?).with(:transform)

        matrix_transformer.apply_transforms(mock_vector_space)
      end

    end
  end
end
@@ -0,0 +1,34 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Specs for Parser: stop-word removal, tokenising and stemming.
  describe Parser do

    # Makes File.open yield a mock whose #read returns +contents+.
    # NOTE(review): presumably this replaces the stop-word list Parser loads
    # when constructed - confirm against Parser.
    def stub_file_contents(contents)
      stop_file = mock("file")
      stop_file.stub!(:read).and_return(contents)
      File.stub!(:open).and_yield(stop_file)
    end

    it "should remove stop words" do
      stub_file_contents("a to be")

      filtered = Parser.new.remove_stop_words(['a','house'])

      filtered.should == ['house']
    end

    it "should remove any non characters" do
      stub_file_contents("a to be")

      tokens = Parser.new.tokenise_and_stem("dragon.")

      tokens.should == ["dragon"]
    end

    it "should tokenise the string" do
      parser = Parser.new
      words = ['mouse','trap']

      parser.stub!(:remove_stop_words).and_return(words)
      parser.should_receive(:tokenise_and_stem).and_return(['mouse','trap'])

      parser.tokenise_and_filter(['the mouse trap'])
    end

  end
end
@@ -0,0 +1,129 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Specs for Search: construction, searching, finding related documents and
  # logger configuration. The vector space builder and matrix transformer are
  # replaced by mocks throughout.
  describe Search do

    documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]

    # Memoised mock standing in for VectorSpace::Builder.
    def mock_builder
      @builder ||= mock(VectorSpace::Builder)
    end

    # Memoised mock standing in for MatrixTransformer.
    def mock_matrix_transformer
      @matrix_transformer ||= mock(MatrixTransformer)
    end

    # Memoised query vector returned by the stubbed builder.
    def query_vector
      @query_vector ||= Linalg::DMatrix.columns([[1,0]])
    end

    # Memoised real VectorSpace::Model wrapping a 2x2 matrix and an empty
    # keyword hash.
    def vector_space_model
      @vector_space_model ||= VectorSpace::Model.new(Linalg::DMatrix.rows([[0,1],[1,0]]), {})
    end

    # Builds a Linalg::DMatrix from an array of rows.
    def matrix(array)
      Linalg::DMatrix.rows(array)
    end

    # Makes VectorSpace::Builder.new return the mock builder and makes that
    # builder hand back the canned vector space model.
    def stub_builder_with_model
      VectorSpace::Builder.stub!(:new).and_return(mock_builder)
      mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
    end

    describe "setting up" do

      it "should build the vector space" do
        VectorSpace::Builder.stub!(:new).and_return(mock_builder)
        mock_builder.should_receive(:build_document_matrix).with(['test']).and_return(vector_space_model)

        Search.new(['test'])
      end

      it "should transform matrices" do
        MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
        stub_builder_with_model

        # FIXME: `with` will not match vector_space_model, requests class Data.
        # Think this is related to Delegate and RSpec.
        mock_matrix_transformer.should_receive(:apply_transforms).with(anything).and_return(vector_space_model)

        Search.new(['test'])
      end

    end

    describe "searching" do

      it "should map search term to vector space" do
        stub_builder_with_model

        mock_builder.should_receive(:build_query_vector).with("cat").and_return(query_vector)

        vector_search = Search.new(documents)
        vector_search.search("cat")
      end

      it "should compare the documents using cosine" do
        pending
      end

    end

    describe "relating" do

      it "should find related documents by comparing cosine" do
        stub_builder_with_model

        MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
        mock_matrix_transformer.stub!(:apply_transforms).and_return(vector_space_model)

        Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[0],[1]]))
        Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[1],[0]]))

        vector_search = Search.new(documents)

        vector_search.related(0)
      end

    end

    describe "logging" do

      before(:each) do
        # Send log output to an in-memory buffer for the example's duration.
        @out = StringIO.new
        Semantic.logger = Logger.new(@out)
        stub_builder_with_model
      end

      it "should set info level if in verbose mode" do
        Search.new(['test'], :verbose => true)

        Semantic.logger.level.should == Logger::INFO
      end

      it "should set error level if not in verbose mode" do
        Search.new(['test'], :verbose => false)

        Semantic.logger.level.should == Logger::ERROR
      end

      it "should default to error level if verbose is not specified" do
        Search.new(['test'])

        Semantic.logger.level.should == Logger::ERROR
      end

    end

  end
end
@@ -0,0 +1,59 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  # Specs for Transform::LSA.transform.
  describe Transform::LSA do

    # Fixtures shared by the examples below through closure capture.
    tiny_matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
                                           [1.0, 0.0, 1.0]])

    # Canned decomposition returned by the stubbed
    # singular_value_decomposition calls.
    u = Linalg::DMatrix.rows([[1,0],
                              [0,1]])

    vt = Linalg::DMatrix.rows([[1,0,0],
                               [1,0,0],
                               [1,0,0]])

    sigma = Linalg::DMatrix.rows([[1,0,0],
                                  [0,1,0]])

    describe "latent semantic analysis transform" do

      it "should use svd on matrix" do
        matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
                                          [1.0, 0.0, 1.0]])

        matrix.should_receive(:singular_value_decomposition).and_return([u, sigma, vt])

        # Stubbed only after the fixture above has been built, so any later
        # DMatrix.columns call returns this same instrumented matrix.
        Linalg::DMatrix.stub!(:columns).and_return(matrix)

        Transform::LSA.transform(matrix)
      end

      it "should reduce the noise in the sigma matrix" do
        matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
                                          [1.0, 0.0, 1.0]])

        matrix.stub!(:singular_value_decomposition).and_return([u, sigma, vt])
        Linalg::DMatrix.stub!(:columns).and_return(matrix)

        sigma.should_receive(:[]=).with(0,0,0)
        sigma.should_receive(:[]=).with(1,1,0)

        Transform::LSA.transform(matrix, 2)
      end

      it "should prevent reducing dimensions greater than the matrix's own dimensions" do
        # NOTE(review): Exception matches every error class; narrow this to
        # the class Transform::LSA actually raises once that is confirmed.
        lambda { Transform::LSA.transform tiny_matrix, 100 }.should raise_error(Exception)
      end

      it "should transform LSA matrix" do
        transformed_matrix = Transform::LSA.transform tiny_matrix

        #TODO: better way to compare result matrix
        transformed_matrix.to_s.should == Linalg::DMatrix.columns([[0,0,0],[1,0,1]]).to_s
      end

    end

  end
end
@@ -0,0 +1,35 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  # Specs for Transform::TFIDF.transform.
  describe Transform::TFIDF do

    # Builds a Linalg::DMatrix from an array of rows.
    def matrix(rows)
      Linalg::DMatrix.rows(rows)
    end

    describe "term frequency / inverse document frequency transform" do

      it "should find the number of times each term occurs" do
        Transform::TFIDF.should_receive(:number_of_documents_with_term).with(0, matrix([[1]])).and_return(2)

        Transform::TFIDF.transform(matrix([[1]]))
      end

      it "should ignore counting terms with 0 weighting" do
        Transform::TFIDF.should_not_receive(:number_of_documents_with_term)

        Transform::TFIDF.transform(matrix([[0,0],[0,0]]))
      end

      it "should calculate term frequency * inverse document frequency" do
        transformed_matrix = Transform::TFIDF.transform matrix([[1,1],[0,1]])

        # Matrices are compared through their string rendering.
        transformed_matrix.to_s.should == Linalg::DMatrix.columns([[0, 0],[0, 0.346574]]).to_s
      end

    end
  end
end
@@ -0,0 +1,44 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  module VectorSpace
    # Specs for Builder#build_query_vector.
    describe Builder do

      describe "building query vector" do

        it "should build vector from string" do
          builder = Builder.new
          # The term list is expected to reach build_vector as one
          # space-joined string.
          builder.should_receive(:build_vector).with("query string")

          builder.build_query_vector(["query","string"])
        end

        it "should generate a valid vector" do
          builder = Builder.new
          builder.build_document_matrix(["query string"])
          query = builder.build_query_vector(["query","string"])

          query.should == Linalg::DMatrix.columns([[1,1]])
        end

        it "should generate empty vector when terms are not in document matrix" do
          builder = Builder.new
          builder.build_document_matrix(["string"])
          query = builder.build_query_vector(["not-in-document"])

          query.should == Linalg::DMatrix.columns([[0]])
        end

      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  module VectorSpace

    # Specs for the string rendering of Model (#to_s).
    describe Model do

      it "should output a DMatrix as a pretty string" do
        data = Linalg::DMatrix.columns([[0.11111,0.66666],[0.33333, 0.001]])
        rendered = Model.new(data, {}).to_s

        # Values are expected signed and rounded to two decimals, one
        # bracketed row per line.
        rendered.should include("[ +0.11 +0.33 ]\n[ +0.67 +0.00 ]\n")
      end

      it "should output keywords for the matrix rows" do
        keywords = {'shiva' => 0}
        rendered = Model.new(Linalg::DMatrix.columns([[0]]), keywords).to_s

        rendered.should include("shiva [ +0.00 ]")
      end

    end
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1,2 @@
1
+ --colour
2
+ --diff