rsemantic 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/rsemantic.gemspec ADDED
@@ -0,0 +1,41 @@
1
# Gem specification for rsemantic 0.1.3.
#
# Attribute setters and dependency APIs that differ between RubyGems releases
# are guarded with respond_to? / defined? so the file loads on old and new
# RubyGems alike.
Gem::Specification.new do |s|
  s.name = %q{rsemantic}
  s.version = "0.1.3"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Joseph Wilk"]
  s.date = %q{2009-08-01}
  s.description = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
  s.email = ["joe@josephwilk.net"]
  s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt", "TODO.txt"]
  s.files = ["History.txt", "Manifest.txt", "README.txt", "Rakefile", "TODO.txt", "config/hoe.rb", "config/requirements.rb", "gem_tasks/deployment.rake", "gem_tasks/environment.rake", "gem_tasks/examples.rake", "gem_tasks/fix_cr_lf.rake", "gem_tasks/gemspec.rake", "gem_tasks/rspec.rake", "gem_tasks/website.rake", "lib/semantic.rb", "lib/semantic/compare.rb", "lib/semantic/matrix_transformer.rb", "lib/semantic/parser.rb", "lib/semantic/search.rb", "lib/semantic/transform.rb", "lib/semantic/transform/lsa_transform.rb", "lib/semantic/transform/tf_idf_transform.rb", "lib/semantic/vector_space.rb", "lib/semantic/vector_space/builder.rb", "lib/semantic/vector_space/model.rb", "lib/semantic/version.rb", "resources/english.stop", "rsemantic.gemspec", "spec/semantic/compare_spec.rb", "spec/semantic/matrix_transformer_spec.rb", "spec/semantic/parser_spec.rb", "spec/semantic/search_spec.rb", "spec/semantic/transform/lsa_transform_spec.rb", "spec/semantic/transform/tf_idf_transform_spec.rb", "spec/semantic/vector_space/builder_spec.rb", "spec/semantic/vector_space/model_spec.rb", "spec/spec.opts", "spec/spec_helper.rb"]
  # has_rdoc= is deprecated by later RubyGems releases; only call it while it
  # still exists so the spec keeps loading if the setter is removed.
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/josephwilk/rsemantic}
  s.rdoc_options = ["--main", "README.txt"]
  s.require_paths = ["lib"]
  # Same reasoning as has_rdoc=: rubyforge_project= is a deprecated setter.
  s.rubyforge_project = %q{rsemantic} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.1}
  s.summary = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}

  # NOTE(review): rspec and diff-lcs are declared as runtime dependencies;
  # they look like test-only tools -- confirm before changing their type.
  runtime_deps = [
    [%q<term-ansicolor>, [">= 1.0.3"]],
    [%q<rspec>, [">= 1.1.5"]],
    [%q<diff-lcs>, [">= 1.1.2"]]
  ]
  development_deps = [
    [%q<hoe>, [">= 2.3.2"]]
  ]

  # Gem::RubyGemsVersion no longer exists in current RubyGems; prefer
  # Gem::VERSION and fall back to the old constant only when it is all we have.
  running_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

  # Typed (runtime/development) dependencies are used only when the
  # specification supports versioning and RubyGems is at least 1.2.0.
  typed_dependencies = false
  if s.respond_to? :specification_version
    s.specification_version = 2
    typed_dependencies = Gem::Version.new(running_rubygems) >= Gem::Version.new('1.2.0')
  end

  if typed_dependencies
    runtime_deps.each { |name, requirement| s.add_runtime_dependency(name, requirement) }
    development_deps.each { |name, requirement| s.add_development_dependency(name, requirement) }
  else
    # Older RubyGems: every dependency goes through the untyped API.
    (runtime_deps + development_deps).each { |name, requirement| s.add_dependency(name, requirement) }
  end
end
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Spec for Compare.cosine.
  describe Compare do

    # Wraps a flat list of numbers in a single-column Linalg::DMatrix.
    def vector(components)
      column = [components]
      Linalg::DMatrix.columns(column)
    end

    it "should calculate cosine" do
      left = vector([0.1, 0.5])
      right = vector([0.9, 0.3])

      # For the usual cosine formula: dot product 0.24 divided by
      # sqrt(0.26) * sqrt(0.90) is roughly 0.4961.
      Compare.cosine(left, right).should be_close(0.4961, 0.0001)
    end

  end
end
@@ -0,0 +1,51 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Specs for MatrixTransformer#apply_transforms: which names are looked up
  # through Transform.const_get and which messages the resolved transform
  # receives.
  describe MatrixTransformer do

    # Memoised mock standing in for a transform class, so that stubs and
    # expectations set in an example land on the same object.
    def mock_transform
      @transform ||= mock(Transform)
    end

    # Fresh mock vector space that returns a 2x2 identity matrix from #matrix
    # and accepts #matrix=.
    def mock_vector_space
      identity = Linalg::DMatrix.rows([[1, 0], [0, 1]])
      mock("vector space", :matrix => identity, :matrix= => nil)
    end

    describe "transforming matrix" do

      it "should ignore invalid transform class" do
        matrix_transformer = MatrixTransformer.new(:transforms => [:FAKE])

        lambda {
          matrix_transformer.apply_transforms(mock_vector_space)
        }.should_not raise_error
      end

      it "should use default transforms if none are specified" do
        matrix_transformer = MatrixTransformer.new

        # With no :transforms option both :LSA and :TFIDF must be looked up.
        Transform.should_receive(:const_get).with(:LSA).and_return(mock_transform)
        Transform.should_receive(:const_get).with(:TFIDF).and_return(mock_transform)

        matrix_transformer.apply_transforms(mock_vector_space)
      end

      it "should send transform message to class to transform matrix" do
        matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
        Transform.stub!(:const_get).and_return(mock_transform)

        mock_transform.should_receive(:transform)

        matrix_transformer.apply_transforms(mock_vector_space)
      end

      it "should check that transform class is capable of transforming" do
        matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
        Transform.stub!(:const_get).and_return(mock_transform)

        mock_transform.should_receive(:respond_to?).with(:transform)

        matrix_transformer.apply_transforms(mock_vector_space)
      end

    end
  end
end
@@ -0,0 +1,34 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Specs for Parser: stop-word removal, stemming/tokenising and the combined
  # tokenise_and_filter entry point.
  describe Parser do

    # Makes File.open yield a fake file whose #read returns +contents+
    # (presumably the stop-word list Parser loads -- confirm against Parser).
    def stub_file_contents(contents)
      fake_file = mock("file")
      fake_file.stub!(:read).and_return(contents)
      File.stub!(:open).and_yield(fake_file)
    end

    it "should remove stop words" do
      stub_file_contents("a to be")

      filtered = Parser.new.remove_stop_words(['a','house'])

      filtered.should == ['house']
    end

    it "should remove any non characters" do
      stub_file_contents("a to be")

      tokens = Parser.new.tokenise_and_stem("dragon.")

      tokens.should == ["dragon"]
    end

    it "should tokenise the string" do
      parser = Parser.new

      # Stop-word removal is stubbed so only the tokenising call is asserted.
      parser.stub!(:remove_stop_words).and_return(['mouse','trap'])
      parser.should_receive(:tokenise_and_stem).and_return(['mouse','trap'])

      parser.tokenise_and_filter(['the mouse trap'])
    end

  end
end
@@ -0,0 +1,129 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
module Semantic
  # Specs for Search: building the vector space, applying transforms,
  # querying, finding related documents and configuring the logger level.
  describe Search do

    # Sample corpus shared (via closure) by the examples below.
    documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]

    # Memoised mock standing in for VectorSpace::Builder instances.
    def mock_builder
      @builder ||= mock(VectorSpace::Builder)
    end

    # Memoised mock standing in for MatrixTransformer instances.
    def mock_matrix_transformer
      @matrix_transformer ||= mock(MatrixTransformer)
    end

    # Memoised single-column query vector.
    def query_vector
      @query_vector ||= Linalg::DMatrix.columns([[1,0]])
    end

    # Memoised real VectorSpace::Model built from a 2x2 matrix and no keywords.
    def vector_space_model
      @vector_space_model ||= VectorSpace::Model.new(Linalg::DMatrix.rows([[0,1],[1,0]]), {})
    end

    # Builds a Linalg::DMatrix from an array of rows.
    def matrix(array)
      Linalg::DMatrix.rows(array)
    end

    # Builds a single-row matrix from a flat list of values.
    def vector(values)
      matrix([values])
    end

    # Routes VectorSpace::Builder.new to the shared mock builder.
    def stub_builder
      VectorSpace::Builder.stub!(:new).and_return(mock_builder)
    end

    # Routes the builder to the mock and makes it hand back the shared model
    # when asked to build the document matrix.
    def stub_document_matrix
      stub_builder
      mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
    end

    describe "setting up" do

      it "should build the vector space" do
        stub_builder
        mock_builder.should_receive(:build_document_matrix).with(['test']).and_return(vector_space_model)

        Search.new(['test'])
      end

      it "should transform matrices" do
        MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
        stub_document_matrix

        #FIXME: with will not match vector_space_model, requests class Data. Think this is related to Delegate and Rspec
        mock_matrix_transformer.should_receive(:apply_transforms).with(anything).and_return(vector_space_model)

        Search.new(['test'])
      end

    end

    describe "searching" do

      it "should map search term to vector space" do
        stub_document_matrix

        mock_builder.should_receive(:build_query_vector).with("cat").and_return(query_vector)

        vector_search = Search.new(documents)
        vector_search.search("cat")
      end

      it "should compare the documents using cosine" do
        pending
      end

    end

    describe "relating" do

      it "should find related documents by comparing cosine" do
        stub_document_matrix

        MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
        mock_matrix_transformer.stub!(:apply_transforms).and_return(vector_space_model)

        # Document 0 is expected to be compared with itself and with document 1.
        Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[0],[1]]))
        Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[1],[0]]))

        vector_search = Search.new(documents)

        vector_search.related(0)
      end

    end

    describe "logging" do

      # Each example gets a fresh logger writing to an in-memory buffer.
      before(:each) do
        @out = StringIO.new
        Semantic.logger = Logger.new(@out)
      end

      it "should set info level if in verbose mode" do
        stub_document_matrix

        Search.new(['test'], :verbose => true)

        Semantic.logger.level.should == Logger::INFO
      end

      it "should set error level if not in verbose mode" do
        stub_document_matrix

        Search.new(['test'], :verbose => false)

        Semantic.logger.level.should == Logger::ERROR
      end

      it "should default to error level if verbose is not specified" do
        stub_document_matrix

        Search.new(['test'])

        Semantic.logger.level.should == Logger::ERROR
      end

    end

  end
end
@@ -0,0 +1,59 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  # Specs for Transform::LSA.transform.
  describe Transform::LSA do

    # Fixtures captured by the example blocks below (closures over locals).
    tiny_matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
                                           [1.0, 0.0, 1.0]])

    # Canned decomposition parts returned by the stubbed
    # singular_value_decomposition calls.
    u = Linalg::DMatrix.rows([[1,0],
                              [0,1]])

    vt = Linalg::DMatrix.rows([[1,0,0],
                               [1,0,0],
                               [1,0,0]])

    sigma = Linalg::DMatrix.rows([[1,0,0],
                                  [0,1,0]])

    # Builds a new matrix with the same values as tiny_matrix, so that stubs
    # and expectations are attached to a per-example object.
    def fresh_matrix
      Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
                               [1.0, 0.0, 1.0]])
    end

    describe "latent semantic analysis transform" do

      it "should use svd on matrix" do
        input = fresh_matrix

        input.should_receive(:singular_value_decomposition).and_return([u, sigma, vt])

        # Any matrix built through DMatrix.columns from here on is the input itself.
        Linalg::DMatrix.stub!(:columns).and_return(input)

        Transform::LSA.transform(input)
      end

      it "should reduce the noise in the sigma matrix" do
        input = fresh_matrix

        input.stub!(:singular_value_decomposition).and_return([u, sigma, vt])
        Linalg::DMatrix.stub!(:columns).and_return(input)

        # Both diagonal entries of sigma are expected to be zeroed.
        sigma.should_receive(:[]=).with(0,0,0)
        sigma.should_receive(:[]=).with(1,1,0)

        Transform::LSA.transform(input, 2)
      end

      it "should prevent reducing dimensions greater than the matrixes own dimensions" do
        # NOTE(review): Exception matches any error at all; narrow this if the
        # implementation raises a more specific class -- confirm first.
        oversized_reduction = lambda { Transform::LSA.transform tiny_matrix, 100 }

        oversized_reduction.should raise_error(Exception)
      end

      it "should transform LSA matrix" do
        transformed_matrix = Transform::LSA.transform tiny_matrix
        expected_matrix = Linalg::DMatrix.columns([[0,0,0],[1,0,1]])

        #TODO: better way to compare result matrix
        transformed_matrix.to_s.should == expected_matrix.to_s
      end

    end

  end
end
@@ -0,0 +1,35 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  # Specs for Transform::TFIDF.transform.
  describe Transform::TFIDF do

    # Builds a Linalg::DMatrix from an array of rows.
    def matrix(rows)
      Linalg::DMatrix.rows(rows)
    end

    describe "term frequency / inverse document frequency transform" do

      it "should find the number of times each term occurs" do
        Transform::TFIDF.should_receive(:number_of_documents_with_term).with(0, matrix([[1]])).and_return(2)

        Transform::TFIDF.transform(matrix([[1]]))
      end

      it "should ignore counting terms with 0 weighting" do
        Transform::TFIDF.should_not_receive(:number_of_documents_with_term)

        Transform::TFIDF.transform(matrix([[0,0],[0,0]]))
      end

      it "should calculate term frequency * inverse document frequency" do
        transformed_matrix = Transform::TFIDF.transform matrix([[1,1],[0,1]])

        # 0.346574 is approximately ln(2) / 2 -- presumably tf = 1/2 times
        # idf = ln(2/1); confirm against the implementation.
        expected_matrix = Linalg::DMatrix.columns([[0, 0],[0, 0.346574]])

        transformed_matrix.to_s.should == expected_matrix.to_s
      end

    end
  end
end
@@ -0,0 +1,44 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  module VectorSpace
    # Specs for Builder#build_query_vector.
    describe Builder do

      # Memoised mock parser (not referenced by the examples in this file).
      def mock_parser
        @parser ||= mock("Parser")
      end

      # Sample documents (not referenced by the examples in this file).
      def documents
        %w[nipon ichiban]
      end


      describe "building query vector" do

        it "should build vector from string" do
          query_builder = Builder.new

          # The term list is expected to reach build_vector as one
          # space-separated string.
          query_builder.should_receive(:build_vector).with("query string")

          query_builder.build_query_vector(["query","string"])
        end

        it "should generate a valid vector" do
          query_builder = Builder.new
          query_builder.build_document_matrix(["query string"])

          result = query_builder.build_query_vector(["query","string"])
          expected = Linalg::DMatrix.columns([[1,1]])

          result.should == expected
        end

        it "should generate empty vector when terms are not in document matrix" do
          query_builder = Builder.new
          query_builder.build_document_matrix(["string"])

          result = query_builder.build_query_vector(["not-in-document"])
          expected = Linalg::DMatrix.columns([[0]])

          result.should == expected
        end

      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
module Semantic
  module VectorSpace

    # Specs for the string rendering of Model (#to_s).
    describe Model do

      it "should output a DMatrix as a pretty string" do
        values = Linalg::DMatrix.columns([[0.11111,0.66666],[0.33333, 0.001]])
        rendered = Model.new(values, {}).to_s

        # Entries are expected signed and rounded to two decimals, one row per line.
        rendered.should include("[ +0.11 +0.33 ]\n[ +0.67 +0.00 ]\n")
      end

      it "should output keywords for the matrix rows" do
        keywords = {'shiva' => 0}
        rendered = Model.new(Linalg::DMatrix.columns([[0]]), keywords).to_s

        # The keyword mapped to row 0 should prefix that row.
        rendered.should include("shiva [ +0.00 ]")
      end

    end
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1,2 @@
1
+ --colour
2
+ --diff