rsemantic 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38)
  1. data/{README.txt → README.md} +19 -10
  2. data/lib/semantic.rb +8 -5
  3. data/lib/semantic/compare.rb +4 -1
  4. data/lib/semantic/corpus.rb +61 -0
  5. data/lib/semantic/document.rb +39 -0
  6. data/lib/semantic/matrix_transformer.rb +4 -5
  7. data/lib/semantic/parser.rb +22 -10
  8. data/lib/semantic/search.rb +22 -16
  9. data/lib/semantic/search_result.rb +16 -0
  10. data/lib/semantic/transform/lsa_transform.rb +47 -22
  11. data/lib/semantic/transform/tf_idf_transform.rb +12 -23
  12. data/lib/semantic/vector_space/builder.rb +29 -22
  13. data/lib/semantic/vector_space/model.rb +14 -13
  14. data/lib/semantic/version.rb +1 -1
  15. data/lib/tasks/rspec.rake +13 -0
  16. metadata +75 -107
  17. data/Manifest.txt +0 -38
  18. data/Rakefile +0 -9
  19. data/config/hoe.rb +0 -69
  20. data/config/requirements.rb +0 -15
  21. data/gem_tasks/deployment.rake +0 -34
  22. data/gem_tasks/environment.rake +0 -7
  23. data/gem_tasks/examples.rake +0 -29
  24. data/gem_tasks/fix_cr_lf.rake +0 -10
  25. data/gem_tasks/gemspec.rake +0 -6
  26. data/gem_tasks/rspec.rake +0 -33
  27. data/gem_tasks/website.rake +0 -17
  28. data/rsemantic.gemspec +0 -41
  29. data/spec/semantic/compare_spec.rb +0 -16
  30. data/spec/semantic/matrix_transformer_spec.rb +0 -51
  31. data/spec/semantic/parser_spec.rb +0 -34
  32. data/spec/semantic/search_spec.rb +0 -129
  33. data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
  34. data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
  35. data/spec/semantic/vector_space/builder_spec.rb +0 -44
  36. data/spec/semantic/vector_space/model_spec.rb +0 -22
  37. data/spec/spec.opts +0 -2
  38. data/spec/spec_helper.rb +0 -7
data/Rakefile DELETED
@@ -1,9 +0,0 @@
1
- ENV['NODOT'] = 'true' # We don't want class diagrams in RDoc
2
- require 'config/requirements'
3
- require 'config/hoe' # setup Hoe + all gem configuration
4
-
5
- Dir['gem_tasks/**/*.rake'].each { |rake| load rake }
6
-
7
- # Hoe gives us :default => :test, but we don't have Test::Unit tests.
8
- Rake::Task[:default].clear_prerequisites
9
- task :default => [:spec]
data/config/hoe.rb DELETED
@@ -1,69 +0,0 @@
1
- require 'semantic/version'
2
-
3
- AUTHOR = 'Joseph Wilk' # can also be an array of Authors
4
- EMAIL = "josephwilk@joesniff.co.uk"
5
- DESCRIPTION = "A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency"
6
- GEM_NAME = 'rsemantic' # what ppl will type to install your gem
7
- HOMEPATH = "http://github.com/josephwilk/rsemantic"
8
- RUBYFORGE_PROJECT = 'rsemantic'
9
-
10
- @config_file = "~/.rubyforge/user-config.yml"
11
- @config = nil
12
- RUBYFORGE_USERNAME = "joseph_wilk"
13
- def rubyforge_username
14
- unless @config
15
- begin
16
- @config = YAML.load(File.read(File.expand_path(@config_file)))
17
- rescue
18
- puts <<-EOS
19
- ERROR: No rubyforge config file found: #{@config_file}
20
- Run 'rubyforge setup' to prepare your env for access to Rubyforge
21
- - See http://newgem.rubyforge.org/rubyforge.html for more details
22
- EOS
23
- exit
24
- end
25
- end
26
- RUBYFORGE_USERNAME.replace @config["username"]
27
- end
28
-
29
-
30
- REV = nil
31
- # UNCOMMENT IF REQUIRED:
32
- # REV = YAML.load(`svn info`)['Revision']
33
- VERS = Semantic::VERSION::STRING + (REV ? ".#{REV}" : "")
34
- RDOC_OPTS = ['--quiet', '--title', 'Rsemantic documentation',
35
- "--opname", "index.html",
36
- "--line-numbers",
37
- "--main", "README.textile",
38
- "--inline-source"]
39
-
40
- class Hoe
41
- def extra_deps
42
- @extra_deps.reject! { |x| Array(x).first == 'hoe' }
43
- @extra_deps
44
- end
45
- end
46
-
47
- # Generate all the Rake tasks
48
- # Run 'rake -T' to see list of generated tasks (from gem root directory)
49
- $hoe = Hoe.new(GEM_NAME, VERS) do |p|
50
- p.developer(AUTHOR, EMAIL)
51
- p.description = DESCRIPTION
52
- p.summary = DESCRIPTION
53
- p.url = HOMEPATH
54
- p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
55
- p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store', '**/*.class', '**/*.jar'] #An array of file patterns to delete on clean.
56
-
57
- # == Optional
58
- p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
59
- #p.extra_deps = [] # An array of rubygem dependencies [name, version], e.g. [ ['active_support', '>= 1.3.1'] ]
60
- p.extra_deps = [ ['term-ansicolor', '>= 1.0.3'], ['rspec', '>= 1.1.5'], ['diff-lcs', '>= 1.1.2'] ]
61
-
62
- #p.spec_extras = {} # A hash of extra values to set in the gemspec.
63
-
64
- end
65
-
66
- CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
67
- PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
68
- $hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
69
- $hoe.rsync_args = '-av --delete --ignore-errors'
data/config/requirements.rb DELETED
@@ -1,15 +0,0 @@
1
- require 'fileutils'
2
- include FileUtils
3
-
4
- require 'rubygems'
5
- %w[rake hoe].each do |req_gem|
6
- begin
7
- require req_gem
8
- rescue LoadError
9
- puts "This Rakefile requires the '#{req_gem}' RubyGem."
10
- puts "Installation: gem install #{req_gem} -y"
11
- exit
12
- end
13
- end
14
-
15
- $:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
data/gem_tasks/deployment.rake DELETED
@@ -1,34 +0,0 @@
1
- desc 'Release the website and new gem version'
2
- task :deploy => [:check_version, :website, :release] do
3
- puts "Remember to create SVN tag:"
4
- puts "svn copy svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/trunk " +
5
- "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/tags/REL-#{VERS} "
6
- puts "Suggested comment:"
7
- puts "Tagging release #{CHANGES}"
8
- end
9
-
10
- desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
11
- task :local_deploy => [:website_generate, :install_gem]
12
-
13
- task :check_version do
14
- unless ENV['VERSION']
15
- puts 'Must pass a VERSION=x.y.z release version'
16
- exit
17
- end
18
- unless ENV['VERSION'] == VERS
19
- puts "Please update your version.rb to match the release version, currently #{VERS}"
20
- exit
21
- end
22
- end
23
-
24
- desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
25
- task :install_gem_no_doc => [:clean, :package] do
26
- sh "#{'sudo ' unless Hoe::WINDOZE }gem install pkg/*.gem --no-rdoc --no-ri"
27
- end
28
-
29
- namespace :manifest do
30
- desc 'Recreate Manifest.txt to include ALL files'
31
- task :refresh do
32
- `rake check_manifest | patch -p0 > Manifest.txt`
33
- end
34
- end
data/gem_tasks/environment.rake DELETED
@@ -1,7 +0,0 @@
1
- task :ruby_env do
2
- RUBY_APP = if RUBY_PLATFORM =~ /java/
3
- "jruby"
4
- else
5
- "ruby"
6
- end unless defined? RUBY_APP
7
- end
data/gem_tasks/examples.rake DELETED
@@ -1,29 +0,0 @@
1
- require 'lib/semantic'
2
-
3
- namespace :example do
4
-
5
- documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]
6
-
7
- desc "run main LSA example"
8
- task :lsa do
9
- search = Semantic::Search.new(documents, :verbose => true)
10
- end
11
-
12
- desc "run main Vector space example"
13
- task :vector_space do
14
- search = Semantic::Search.new(documents)
15
-
16
- puts "Documents:"
17
- documents.each_with_index { |document, index| puts "#{index}: #{document}" }
18
- puts
19
-
20
- puts "Documents related to first document: #{documents[0]}"
21
- puts search.related(0)
22
- puts
23
-
24
- puts "Searching for the word cat:"
25
- puts search.search(["cat"])
26
- puts
27
- end
28
-
29
- end
data/gem_tasks/fix_cr_lf.rake DELETED
@@ -1,10 +0,0 @@
1
- desc 'Make all files use UNIX (\n) line endings'
2
- task :fix_cr_lf do
3
- files = FileList['**/*']
4
- files.each do |f|
5
- next if File.directory?(f)
6
- s = IO.read(f)
7
- s.gsub!(/\r?\n/, "\n")
8
- File.open(f, "w") { |io| io.write(s) }
9
- end
10
- end
data/gem_tasks/gemspec.rake DELETED
@@ -1,6 +0,0 @@
1
- namespace :gemspec do
2
- desc 'Refresh rsemantic.gemspec to include ALL files'
3
- task :refresh => 'manifest:refresh' do
4
- File.open('rsemantic.gemspec', 'w') {|io| io.write($hoe.spec.to_ruby)}
5
- end
6
- end
data/gem_tasks/rspec.rake DELETED
@@ -1,33 +0,0 @@
1
- begin
2
- require 'spec'
3
- rescue LoadError
4
- require 'rubygems'
5
- require 'spec'
6
- end
7
- begin
8
- require 'spec/rake/spectask'
9
- require 'spec/rake/verify_rcov'
10
- rescue LoadError
11
- puts <<-EOS
12
- To use rspec for testing you must install rspec gem:
13
- gem install rspec
14
- EOS
15
- exit(0)
16
- end
17
-
18
- desc "Run the specs under spec/models"
19
- Spec::Rake::SpecTask.new do |t|
20
- t.spec_opts = ['--options', "spec/spec.opts"]
21
- t.spec_files = FileList['spec/**/*_spec.rb']
22
-
23
- unless ENV['NO_RCOV']
24
- t.rcov = true
25
- t.rcov_dir = 'coverage'
26
- t.rcov_opts = ['--exclude', '_helper\.rb,_spec\.rb,spec\/boss,\/var\/lib\/gems,\/Library\/Ruby,\.autotest']
27
- end
28
- end
29
-
30
- RCov::VerifyTask.new(:verify_rcov => :spec) do |t|
31
- t.threshold = 99.7 # Make sure you have rcov 0.9 or higher!
32
- t.index_html = 'coverage/index.html'
33
- end
data/gem_tasks/website.rake DELETED
@@ -1,17 +0,0 @@
1
- desc 'Generate website files'
2
- task :website_generate => :ruby_env do
3
- (Dir['website/**/*.txt'] - Dir['website/version*.txt']).each do |txt|
4
- sh %{ #{RUBY_APP} script/txt2html #{txt} > #{txt.gsub(/txt$/,'html')} }
5
- end
6
- end
7
-
8
- desc 'Upload website files to rubyforge'
9
- task :website_upload do
10
- host = "#{rubyforge_username}@rubyforge.org"
11
- remote_dir = "/var/www/gforge-projects/#{PATH}/"
12
- local_dir = 'website'
13
- sh %{rsync -aCv #{local_dir}/ #{host}:#{remote_dir}}
14
- end
15
-
16
- desc 'Generate and upload website files'
17
- task :website => [:website_generate, :website_upload, :publish_docs]
data/rsemantic.gemspec DELETED
@@ -1,41 +0,0 @@
1
- Gem::Specification.new do |s|
2
- s.name = %q{rsemantic}
3
- s.version = "0.1.3"
4
-
5
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
6
- s.authors = ["Joseph Wilk"]
7
- s.date = %q{2009-08-01}
8
- s.description = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
9
- s.email = ["joe@josephwilk.net"]
10
- s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt", "TODO.txt"]
11
- s.files = ["History.txt", "Manifest.txt", "README.txt", "Rakefile", "TODO.txt", "config/hoe.rb", "config/requirements.rb", "gem_tasks/deployment.rake", "gem_tasks/environment.rake", "gem_tasks/examples.rake", "gem_tasks/fix_cr_lf.rake", "gem_tasks/gemspec.rake", "gem_tasks/rspec.rake", "gem_tasks/website.rake", "lib/semantic.rb", "lib/semantic/compare.rb", "lib/semantic/matrix_transformer.rb", "lib/semantic/parser.rb", "lib/semantic/search.rb", "lib/semantic/transform.rb", "lib/semantic/transform/lsa_transform.rb", "lib/semantic/transform/tf_idf_transform.rb", "lib/semantic/vector_space.rb", "lib/semantic/vector_space/builder.rb", "lib/semantic/vector_space/model.rb", "lib/semantic/version.rb", "resources/english.stop", "rsemantic.gemspec", "spec/semantic/compare_spec.rb", "spec/semantic/matrix_transformer_spec.rb", "spec/semantic/parser_spec.rb", "spec/semantic/search_spec.rb", "spec/semantic/transform/lsa_transform_spec.rb", "spec/semantic/transform/tf_idf_transform_spec.rb", "spec/semantic/vector_space/builder_spec.rb", "spec/semantic/vector_space/model_spec.rb", "spec/spec.opts", "spec/spec_helper.rb"]
12
- s.has_rdoc = true
13
- s.homepage = %q{http://github.com/josephwilk/rsemantic}
14
- s.rdoc_options = ["--main", "README.txt"]
15
- s.require_paths = ["lib"]
16
- s.rubyforge_project = %q{rsemantic}
17
- s.rubygems_version = %q{1.3.1}
18
- s.summary = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
19
-
20
- if s.respond_to? :specification_version then
21
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
22
- s.specification_version = 2
23
-
24
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
25
- s.add_runtime_dependency(%q<term-ansicolor>, [">= 1.0.3"])
26
- s.add_runtime_dependency(%q<rspec>, [">= 1.1.5"])
27
- s.add_runtime_dependency(%q<diff-lcs>, [">= 1.1.2"])
28
- s.add_development_dependency(%q<hoe>, [">= 2.3.2"])
29
- else
30
- s.add_dependency(%q<term-ansicolor>, [">= 1.0.3"])
31
- s.add_dependency(%q<rspec>, [">= 1.1.5"])
32
- s.add_dependency(%q<diff-lcs>, [">= 1.1.2"])
33
- s.add_dependency(%q<hoe>, [">= 2.3.2"])
34
- end
35
- else
36
- s.add_dependency(%q<term-ansicolor>, [">= 1.0.3"])
37
- s.add_dependency(%q<rspec>, [">= 1.1.5"])
38
- s.add_dependency(%q<diff-lcs>, [">= 1.1.2"])
39
- s.add_dependency(%q<hoe>, [">= 2.3.2"])
40
- end
41
- end
data/spec/semantic/compare_spec.rb DELETED
@@ -1,16 +0,0 @@
1
- require File.dirname(__FILE__) + '/../spec_helper'
2
-
3
- module Semantic
4
- describe Compare do
5
-
6
- def vector(values)
7
- Linalg::DMatrix.columns([values])
8
- end
9
-
10
- it "should calculate cosine" do
11
- cosine = Compare.cosine( vector([0.1,0.5]), vector([0.9, 0.3]) )
12
- cosine.should be_close(0.4961, 0.0001)
13
- end
14
-
15
- end
16
- end
data/spec/semantic/matrix_transformer_spec.rb DELETED
@@ -1,51 +0,0 @@
1
- require File.dirname(__FILE__) + '/../spec_helper'
2
-
3
- module Semantic
4
- describe MatrixTransformer do
5
-
6
- def mock_transform
7
- @transform ||= mock(Transform)
8
- end
9
-
10
- def mock_vector_space
11
- mock("vector space", :matrix => Linalg::DMatrix.rows([[1,0],[0,1]]), :matrix= => nil )
12
- end
13
-
14
-
15
- describe "transforming matrix" do
16
-
17
- it "should ignore invalid transform class" do
18
- matrix_transformer = MatrixTransformer.new(:transforms => [:FAKE])
19
- lambda {
20
- matrix_transformer.apply_transforms(mock_vector_space)
21
- }.should_not raise_error
22
- end
23
-
24
- it "should use defaults transforms in none are specified" do
25
- matrix_transformer = MatrixTransformer.new
26
- Transform.should_receive(:const_get).with(:LSA).and_return(mock_transform)
27
- Transform.should_receive(:const_get).with(:TFIDF).and_return(mock_transform)
28
-
29
- matrix_transformer.apply_transforms(mock_vector_space)
30
- end
31
-
32
- it "should send transform message to class to transform matrix" do
33
- matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
34
- Transform.stub!(:const_get).and_return(mock_transform)
35
-
36
- mock_transform.should_receive(:transform)
37
-
38
- matrix_transformer.apply_transforms(mock_vector_space)
39
- end
40
-
41
- it "should check that transform class is capable of transforming" do
42
- matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
43
- Transform.stub!(:const_get).and_return(mock_transform)
44
- mock_transform.should_receive(:respond_to?).with(:transform)
45
-
46
- matrix_transformer.apply_transforms(mock_vector_space)
47
- end
48
-
49
- end
50
- end
51
- end
data/spec/semantic/parser_spec.rb DELETED
@@ -1,34 +0,0 @@
1
- require File.dirname(__FILE__) + '/../spec_helper'
2
-
3
- module Semantic
4
- describe Parser do
5
-
6
- it "should remove stop words" do
7
- file = mock("file")
8
- file.stub!(:read).and_return("a to be")
9
- File.stub!(:open).and_yield(file)
10
- parser = Parser.new
11
-
12
- parser.remove_stop_words(['a','house']).should == ['house']
13
- end
14
-
15
- it "should remove any non characters" do
16
- file = mock("file")
17
- file.stub!(:read).and_return("a to be")
18
- File.stub!(:open).and_yield(file)
19
-
20
- parser = Parser.new
21
- parser.tokenise_and_stem("dragon.").should == ["dragon"]
22
- end
23
-
24
- it "should tokenise the string" do
25
- parser = Parser.new
26
-
27
- parser.stub!(:remove_stop_words).and_return(['mouse','trap'])
28
- parser.should_receive(:tokenise_and_stem).and_return(['mouse','trap'])
29
-
30
- parser.tokenise_and_filter(['the mouse trap'])
31
- end
32
-
33
- end
34
- end
data/spec/semantic/search_spec.rb DELETED
@@ -1,129 +0,0 @@
1
- require File.dirname(__FILE__) + '/../spec_helper'
2
-
3
- module Semantic
4
- describe Search do
5
-
6
- documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]
7
-
8
- def mock_builder
9
- @builder ||= mock(VectorSpace::Builder)
10
- end
11
-
12
- def mock_matrix_transformer
13
- @matrix_transformer ||= mock(MatrixTransformer)
14
- end
15
-
16
- def query_vector
17
- @query_vector ||= Linalg::DMatrix.columns([[1,0]])
18
- end
19
-
20
- def vector_space_model(stubs = {})
21
- @vector_space_model ||= VectorSpace::Model.new(Linalg::DMatrix.rows([[0,1],[1,0]]), {})
22
- end
23
-
24
- def matrix(array)
25
- Linalg::DMatrix.rows(array)
26
- end
27
-
28
- def vector(vector)
29
- matrix([vector])
30
- end
31
-
32
- describe "setting up" do
33
-
34
- it "should build the vector space" do
35
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
36
- mock_builder.should_receive(:build_document_matrix).with(['test']).and_return(vector_space_model)
37
-
38
- Search.new(['test'])
39
- end
40
-
41
- it "should transform matrices" do
42
- MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
43
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
44
- mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
45
-
46
- #FIXME: with will not match vector_space_model, requests class Data. Think this is related to Delegate and Rspec
47
- mock_matrix_transformer.should_receive(:apply_transforms).with(anything).and_return(vector_space_model)
48
-
49
- Search.new(['test'])
50
- end
51
-
52
- end
53
-
54
- describe "searching" do
55
-
56
- it "should map search term to vector space" do
57
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
58
- mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
59
-
60
- mock_builder.should_receive(:build_query_vector).with("cat").and_return(query_vector)
61
-
62
- vector_search = Search.new(documents)
63
- vector_search.search("cat")
64
- end
65
-
66
- it "should compare the documents using cosine" do
67
- pending
68
- end
69
-
70
- end
71
-
72
- describe "relating" do
73
-
74
- it "should find related documents by comparing cosine" do
75
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
76
-
77
- mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
78
-
79
- MatrixTransformer.stub!(:new).and_return(mock_matrix_transformer)
80
- mock_matrix_transformer.stub!(:apply_transforms).and_return(vector_space_model)
81
-
82
- Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[0],[1]]))
83
- Compare.should_receive(:cosine).with(matrix([[0],[1]]), matrix([[1],[0]]))
84
-
85
- vector_search = Search.new(documents)
86
-
87
- vector_search.related(0)
88
- end
89
-
90
- end
91
-
92
- describe "logging" do
93
-
94
- before(:each) do
95
- @out = StringIO.new
96
- Semantic.logger = Logger.new(@out)
97
- end
98
-
99
- it "should set info level if in verbose mode" do
100
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
101
- mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
102
-
103
- Search.new(['test'], :verbose => true)
104
-
105
- Semantic.logger.level.should == Logger::INFO
106
- end
107
-
108
- it "should set error level if not in verbose mode" do
109
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
110
- mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
111
-
112
- Search.new(['test'], :verbose => false)
113
-
114
- Semantic.logger.level.should == Logger::ERROR
115
- end
116
-
117
- it "should default to error level if verbose is not specified" do
118
- VectorSpace::Builder.stub!(:new).and_return(mock_builder)
119
- mock_builder.stub!(:build_document_matrix).and_return(vector_space_model)
120
-
121
- Search.new(['test'])
122
-
123
- Semantic.logger.level.should == Logger::ERROR
124
- end
125
-
126
- end
127
-
128
- end
129
- end