josephwilk-semantic 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/rsemantic.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = %q{semantic}
3
+ s.version = "0.1.0"
4
+
5
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
6
+ s.authors = ["Joseph Wilk"]
7
+ s.date = %q{2008-11-13}
8
+ s.description = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
9
+ s.email = ["josephwilk@joesniff.co.uk"]
10
+ s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt", "TODO.txt"]
11
+ s.files = ["History.txt", "Manifest.txt", "README.txt", "Rakefile", "TODO.txt", "config/hoe.rb", "config/requirements.rb", "gem_tasks/deployment.rake", "gem_tasks/environment.rake", "gem_tasks/examples.rake", "gem_tasks/fix_cr_lf.rake", "gem_tasks/gemspec.rake", "gem_tasks/rspec.rake", "gem_tasks/website.rake", "lib/semantic.rb", "lib/semantic/compare.rb", "lib/semantic/matrix_transformer.rb", "lib/semantic/parser.rb", "lib/semantic/search.rb", "lib/semantic/transform.rb", "lib/semantic/transform/lsa_transform.rb", "lib/semantic/transform/tf_idf_transform.rb", "lib/semantic/vector_space.rb", "lib/semantic/vector_space/builder.rb", "lib/semantic/vector_space/model.rb", "lib/semantic/version.rb", "resources/english.stop", "rsemantic.gemspec", "spec/semantic/compare_spec.rb", "spec/semantic/matrix_transformer_spec.rb", "spec/semantic/parser_spec.rb", "spec/semantic/search_spec.rb", "spec/semantic/transform/lsa_transform_spec.rb", "spec/semantic/transform/tf_idf_transform_spec.rb", "spec/semantic/vector_space/builder_spec.rb", "spec/semantic/vector_space/model_spec.rb", "spec/spec.opts", "spec/spec_helper.rb"]
12
+ s.has_rdoc = true
13
+ s.homepage = %q{http://github.com/josephwilk/rsemantic}
14
+ s.rdoc_options = ["--main", "README.txt"]
15
+ s.require_paths = ["lib"]
16
+ s.rubyforge_project = %q{rsemantic}
17
+ s.rubygems_version = %q{1.2.0}
18
+ s.summary = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
19
+
20
+ if s.respond_to? :specification_version then
21
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
22
+ s.specification_version = 2
23
+
24
+ if current_version >= 3 then
25
+ s.add_runtime_dependency(%q<term-ansicolor>, [">= 1.0.3"])
26
+ s.add_runtime_dependency(%q<rspec>, [">= 1.1.5"])
27
+ s.add_runtime_dependency(%q<diff-lcs>, [">= 1.1.2"])
28
+ s.add_development_dependency(%q<hoe>, [">= 1.8.2"])
29
+ else
30
+ s.add_dependency(%q<term-ansicolor>, [">= 1.0.3"])
31
+ s.add_dependency(%q<rspec>, [">= 1.1.5"])
32
+ s.add_dependency(%q<diff-lcs>, [">= 1.1.2"])
33
+ s.add_dependency(%q<hoe>, [">= 1.8.2"])
34
+ end
35
+ else
36
+ s.add_dependency(%q<term-ansicolor>, [">= 1.0.3"])
37
+ s.add_dependency(%q<rspec>, [">= 1.1.5"])
38
+ s.add_dependency(%q<diff-lcs>, [">= 1.1.2"])
39
+ s.add_dependency(%q<hoe>, [">= 1.8.2"])
40
+ end
41
+ end
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic
4
+ describe Compare do
5
+
6
+ def vector(values)
7
+ Linalg::DMatrix.columns([values])
8
+ end
9
+
10
+ it "should calculate cosine" do
11
+ cosine = Compare.cosine( vector([0.1,0.5]), vector([0.9, 0.3]) )
12
+ cosine.should be_close(0.4961, 0.0001)
13
+ end
14
+
15
+ end
16
+ end
@@ -0,0 +1,51 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic
4
+ describe MatrixTransformer do
5
+
6
+ def mock_transform
7
+ @transform ||= mock(Transform)
8
+ end
9
+
10
+ def mock_vector_space
11
+ mock("vector space", :matrix => Linalg::DMatrix.rows([[1,0],[0,1]]), :matrix= => nil )
12
+ end
13
+
14
+
15
+ describe "transforming matrix" do
16
+
17
+ it "should ignore invalid transform class" do
18
+ matrix_transformer = MatrixTransformer.new(:transforms => [:FAKE])
19
+ lambda {
20
+ matrix_transformer.apply_transforms(mock_vector_space)
21
+ }.should_not raise_error
22
+ end
23
+
24
+ it "should use defaults transforms in none are specified" do
25
+ matrix_transformer = MatrixTransformer.new
26
+ Transform.should_receive(:const_get).with(:LSA).and_return(mock_transform)
27
+ Transform.should_receive(:const_get).with(:TFIDF).and_return(mock_transform)
28
+
29
+ matrix_transformer.apply_transforms(mock_vector_space)
30
+ end
31
+
32
+ it "should send transform message to class to transform matrix" do
33
+ matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
34
+ Transform.stub!(:const_get).and_return(mock_transform)
35
+
36
+ mock_transform.should_receive(:transform)
37
+
38
+ matrix_transformer.apply_transforms(mock_vector_space)
39
+ end
40
+
41
+ it "should check that transform class is capable of transforming" do
42
+ matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
43
+ Transform.stub!(:const_get).and_return(mock_transform)
44
+ mock_transform.should_receive(:respond_to?).with(:transform)
45
+
46
+ matrix_transformer.apply_transforms(mock_vector_space)
47
+ end
48
+
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,34 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic
4
+ describe Parser do
5
+
6
+ it "should remove stop words" do
7
+ file = mock("file")
8
+ file.stub!(:read).and_return("a to be")
9
+ File.stub!(:open).and_yield(file)
10
+ parser = Parser.new
11
+
12
+ parser.remove_stop_words(['a','house']).should == ['house']
13
+ end
14
+
15
+ it "should remove any non characters" do
16
+ file = mock("file")
17
+ file.stub!(:read).and_return("a to be")
18
+ File.stub!(:open).and_yield(file)
19
+
20
+ parser = Parser.new
21
+ parser.tokenise_and_stem("dragon.").should == ["dragon"]
22
+ end
23
+
24
+ it "should tokenise the string" do
25
+ parser = Parser.new
26
+
27
+ parser.stub!(:remove_stop_words).and_return(['mouse','trap'])
28
+ parser.should_receive(:tokenise_and_stem).and_return(['mouse','trap'])
29
+
30
+ parser.tokenise_and_filter(['the mouse trap'])
31
+ end
32
+
33
+ end
34
+ end
@@ -0,0 +1,93 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic
4
+ describe Search do
5
+
6
+ documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]
7
+
8
+ def mock_builder
9
+ @builder ||= mock(VectorSpace::Builder)
10
+ end
11
+
12
+ def mock_matrix_transformer
13
+ @matrix_transformer ||= mock(MatrixTransformer)
14
+ end
15
+
16
+ def query_vector
17
+ @query_vector ||= Linalg::DMatrix.columns([[1,0]])
18
+ end
19
+
20
+ def vector_space_model(stubs = {})
21
+ @vector_space_model ||= VectorSpace::Model.new(Linalg::DMatrix.rows([[0,1],[1,0]]), [])
22
+ end
23
+
24
+ def matrix(array)
25
+ Linalg::DMatrix.rows(array)
26
+ end
27
+
28
+ def vector(vector)
29
+ matrix([vector])
30
+ end
31
+
32
+ describe "setting up" do
33
+
34
+ it "should build the vector space" do
35
+ VectorSpace::Builder.stub!(:new).and_return(mock_builder)
36
+ mock_builder.should_receive(:build_document_matrix).with(['test']).and_return(vector_space_model)
37
+
38
+ Search.new(['test'])
39
+ end
40
+
41
+ it "should transform matrices" do
42
+ MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
43
+ VectorSpace::Builder.stub!(:new).and_return(mock_builder)
44
+ mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
45
+
46
+ #FIXME: matching on vector_space_model in with() fails (requests class Data), so anything is used as a workaround. Possibly related to Delegate and RSpec.
47
+ mock_matrix_transformer.should_receive(:apply_transforms).with(anything).and_return(vector_space_model)
48
+
49
+ Search.new(['test'])
50
+ end
51
+
52
+ end
53
+
54
+ describe "searching" do
55
+
56
+ it "should map search term to vector space" do
57
+ VectorSpace::Builder.stub!(:new).and_return(mock_builder)
58
+ mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
59
+
60
+ mock_builder.should_receive(:build_query_vector).with("cat").and_return(query_vector)
61
+
62
+ vector_search = Search.new(documents)
63
+ vector_search.search("cat")
64
+ end
65
+
66
+ it "should compare the documents using cosine" do
67
+ pending
68
+ end
69
+
70
+ end
71
+
72
+ describe "relating" do
73
+
74
+ it "should find related documents by comparing cosine" do
75
+ VectorSpace::Builder.stub!(:new).and_return(mock_builder)
76
+
77
+ mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
78
+
79
+ MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
80
+ mock_matrix_transformer.stub!(:apply_transforms).and_return(vector_space_model)
81
+
82
+ Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[0],[1]]))
83
+ Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[1],[0]]))
84
+
85
+ vector_search = Search.new(documents)
86
+
87
+ vector_search.related(0)
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+ end
@@ -0,0 +1,59 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
+ module Semantic
4
+ describe Transform::LSA do
5
+
6
+ tiny_matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
7
+ [1.0, 0.0, 1.0]])
8
+
9
+ u = Linalg::DMatrix.rows([[1,0],
10
+ [0,1]])
11
+
12
+ vt = Linalg::DMatrix.rows([[1,0,0],
13
+ [1,0,0],
14
+ [1,0,0]])
15
+
16
+ sigma = Linalg::DMatrix.rows([[1,0,0],
17
+ [0,1,0]])
18
+
19
+ describe "latent semantic analysis transform" do
20
+
21
+ it "should use svd on matrix" do
22
+ matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
23
+ [1.0, 0.0, 1.0]])
24
+
25
+ matrix.should_receive(:singular_value_decomposition).and_return([u, sigma, vt])
26
+
27
+ Linalg::DMatrix.stub!(:columns).and_return(matrix)
28
+
29
+ Transform::LSA.transform(matrix)
30
+ end
31
+
32
+ it "should reduce the noise in the sigma matrix" do
33
+ matrix = Linalg::DMatrix.columns([[0.0, 1.0, 0.0],
34
+ [1.0, 0.0, 1.0]])
35
+
36
+ matrix.stub!(:singular_value_decomposition).and_return([u, sigma, vt])
37
+ Linalg::DMatrix.stub!(:columns).and_return(matrix)
38
+
39
+ sigma.should_receive(:[]=).with(0,0,0)
40
+ sigma.should_receive(:[]=).with(1,1,0)
41
+
42
+ Transform::LSA.transform(matrix, 2)
43
+ end
44
+
45
+ it "should prevent reducing dimensions greater than the matrixes own dimensions" do
46
+ lambda { Transform::LSA.transform tiny_matrix, 100 }.should raise_error(Exception)
47
+ end
48
+
49
+ it "should transform LSA matrix" do
50
+ transformed_matrix = Transform::LSA.transform tiny_matrix
51
+
52
+ #TODO: find a better way to compare the result matrix than comparing to_s output
53
+ transformed_matrix.to_s.should == Linalg::DMatrix.columns([[0,0,0],[1,0,1]]).to_s
54
+ end
55
+
56
+ end
57
+
58
+ end
59
+ end
@@ -0,0 +1,35 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
+ module Semantic
4
+ describe Transform::TFIDF do
5
+
6
+ def matrix(matrix)
7
+ Linalg::DMatrix.rows(matrix)
8
+ end
9
+
10
+ tiny_matrix = Linalg::DMatrix.rows([[0.0, 1.0, 0.0],
11
+ [1.0, 0.0, 1.0]])
12
+
13
+ describe "term frequency / inverse document frequency transform" do
14
+
15
+ it "should find the number of times each term occurs" do
16
+ Transform::TFIDF.should_receive(:number_of_documents_with_term).with(0, matrix([[1]])).and_return(2)
17
+
18
+ Transform::TFIDF.transform(matrix([[1]]))
19
+ end
20
+
21
+ it "should ignore counting terms with 0 weighting" do
22
+ Transform::TFIDF.should_not_receive(:number_of_documents_with_term)
23
+
24
+ Transform::TFIDF.transform(matrix([[0,0],[0,0]]))
25
+ end
26
+
27
+ it "should calculate term frequency * inverse document freuency" do
28
+ transformed_matrix = Transform::TFIDF.transform matrix([[1,1],[0,1]])
29
+
30
+ transformed_matrix.to_s.should == Linalg::DMatrix.columns([[0, 0],[0, 0.346574]]).to_s
31
+ end
32
+
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,44 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
+ module Semantic
4
+ module VectorSpace
5
+ describe Builder do
6
+
7
+ def mock_parser
8
+ @parser ||= mock("Parser")
9
+ end
10
+
11
+ def documents
12
+ ['nipon','ichiban']
13
+ end
14
+
15
+
16
+ describe "building query vector" do
17
+
18
+ it "should build vector from string" do
19
+ builder = Builder.new
20
+ builder.should_receive(:build_vector).with("query string")
21
+
22
+ builder.build_query_vector(["query","string"])
23
+ end
24
+
25
+ it "should generate a valid vector" do
26
+ builder = Builder.new
27
+ builder.build_document_matrix(["query string"])
28
+ query = builder.build_query_vector(["query","string"])
29
+
30
+ query.should == Linalg::DMatrix.columns([[1,1]])
31
+ end
32
+
33
+ it "should generate empty vector when terms are not in document matrix" do
34
+ builder = Builder.new
35
+ builder.build_document_matrix(["string"])
36
+ query = builder.build_query_vector(["not-in-document"])
37
+
38
+ query.should == Linalg::DMatrix.columns([[0]])
39
+ end
40
+
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,22 @@
1
+ require File.dirname(__FILE__) + '/../../spec_helper'
2
+
3
+ module Semantic
4
+ module VectorSpace
5
+
6
+ describe Model do
7
+
8
+ it "should output a DMatrix as a pretty string" do
9
+ model = Model.new(Linalg::DMatrix.columns([[0.11111,0.66666],[0.33333, 0.001]]), {})
10
+
11
+ model.to_s.should include("[ +0.11 +0.33 ]\n[ +0.67 +0.00 ]\n")
12
+ end
13
+
14
+ it "should output keywords for the matrix rows" do
15
+ model = Model.new(Linalg::DMatrix.columns([[0]]), {'shiva' => 0})
16
+
17
+ model.to_s.should include("shiva [ +0.00 ]")
18
+ end
19
+
20
+ end
21
+ end
22
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,2 @@
1
+ --colour
2
+ --diff
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ gem 'rspec'
3
+ require 'spec'
4
+
5
+ $:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
6
+
7
+ require 'semantic'
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: josephwilk-semantic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Joseph Wilk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-13 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: term-ansicolor
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.3
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: rspec
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: 1.1.5
32
+ version:
33
+ - !ruby/object:Gem::Dependency
34
+ name: diff-lcs
35
+ version_requirement:
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.2
41
+ version:
42
+ - !ruby/object:Gem::Dependency
43
+ name: hoe
44
+ version_requirement:
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.8.2
50
+ version:
51
+ description: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
52
+ email:
53
+ - josephwilk@joesniff.co.uk
54
+ executables: []
55
+
56
+ extensions: []
57
+
58
+ extra_rdoc_files:
59
+ - History.txt
60
+ - Manifest.txt
61
+ - README.txt
62
+ - TODO.txt
63
+ files:
64
+ - History.txt
65
+ - Manifest.txt
66
+ - README.txt
67
+ - Rakefile
68
+ - TODO.txt
69
+ - config/hoe.rb
70
+ - config/requirements.rb
71
+ - gem_tasks/deployment.rake
72
+ - gem_tasks/environment.rake
73
+ - gem_tasks/examples.rake
74
+ - gem_tasks/fix_cr_lf.rake
75
+ - gem_tasks/gemspec.rake
76
+ - gem_tasks/rspec.rake
77
+ - gem_tasks/website.rake
78
+ - lib/semantic.rb
79
+ - lib/semantic/compare.rb
80
+ - lib/semantic/matrix_transformer.rb
81
+ - lib/semantic/parser.rb
82
+ - lib/semantic/search.rb
83
+ - lib/semantic/transform.rb
84
+ - lib/semantic/transform/lsa_transform.rb
85
+ - lib/semantic/transform/tf_idf_transform.rb
86
+ - lib/semantic/vector_space.rb
87
+ - lib/semantic/vector_space/builder.rb
88
+ - lib/semantic/vector_space/model.rb
89
+ - lib/semantic/version.rb
90
+ - resources/english.stop
91
+ - rsemantic.gemspec
92
+ - spec/semantic/compare_spec.rb
93
+ - spec/semantic/matrix_transformer_spec.rb
94
+ - spec/semantic/parser_spec.rb
95
+ - spec/semantic/search_spec.rb
96
+ - spec/semantic/transform/lsa_transform_spec.rb
97
+ - spec/semantic/transform/tf_idf_transform_spec.rb
98
+ - spec/semantic/vector_space/builder_spec.rb
99
+ - spec/semantic/vector_space/model_spec.rb
100
+ - spec/spec.opts
101
+ - spec/spec_helper.rb
102
+ has_rdoc: true
103
+ homepage: http://github.com/josephwilk/rsemantic
104
+ post_install_message:
105
+ rdoc_options:
106
+ - --main
107
+ - README.txt
108
+ require_paths:
109
+ - lib
110
+ required_ruby_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: "0"
115
+ version:
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: "0"
121
+ version:
122
+ requirements: []
123
+
124
+ rubyforge_project: rsemantic
125
+ rubygems_version: 1.2.0
126
+ signing_key:
127
+ specification_version: 2
128
+ summary: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
129
+ test_files: []
130
+