rsemantic 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +19 -0
- data/Manifest.txt +38 -0
- data/README.txt +48 -0
- data/Rakefile +9 -0
- data/TODO.txt +9 -0
- data/config/hoe.rb +69 -0
- data/config/requirements.rb +15 -0
- data/gem_tasks/deployment.rake +34 -0
- data/gem_tasks/environment.rake +7 -0
- data/gem_tasks/examples.rake +29 -0
- data/gem_tasks/fix_cr_lf.rake +10 -0
- data/gem_tasks/gemspec.rake +6 -0
- data/gem_tasks/rspec.rake +33 -0
- data/gem_tasks/website.rake +17 -0
- data/lib/semantic.rb +33 -0
- data/lib/semantic/compare.rb +19 -0
- data/lib/semantic/matrix_transformer.rb +25 -0
- data/lib/semantic/parser.rb +40 -0
- data/lib/semantic/search.rb +35 -0
- data/lib/semantic/transform.rb +1 -0
- data/lib/semantic/transform/lsa_transform.rb +42 -0
- data/lib/semantic/transform/tf_idf_transform.rb +42 -0
- data/lib/semantic/vector_space.rb +1 -0
- data/lib/semantic/vector_space/builder.rb +69 -0
- data/lib/semantic/vector_space/model.rb +47 -0
- data/lib/semantic/version.rb +9 -0
- data/resources/english.stop +571 -0
- data/rsemantic.gemspec +41 -0
- data/spec/semantic/compare_spec.rb +16 -0
- data/spec/semantic/matrix_transformer_spec.rb +51 -0
- data/spec/semantic/parser_spec.rb +34 -0
- data/spec/semantic/search_spec.rb +129 -0
- data/spec/semantic/transform/lsa_transform_spec.rb +59 -0
- data/spec/semantic/transform/tf_idf_transform_spec.rb +35 -0
- data/spec/semantic/vector_space/builder_spec.rb +44 -0
- data/spec/semantic/vector_space/model_spec.rb +22 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +7 -0
- metadata +136 -0
data/History.txt
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
== 0.1.3
|
2
|
+
|
3
|
+
= Bugs
|
4
|
+
* Fixed bug with keyword formatting
|
5
|
+
|
6
|
+
== 0.1.2
|
7
|
+
|
8
|
+
= Bugs
|
9
|
+
* Fixed problem where LSA was raising errors when trying to reduce dimensions of non-square matrices. (Joseph Wilk)
|
10
|
+
|
11
|
+
== 0.1.1
|
12
|
+
|
13
|
+
= Bugs
|
14
|
+
* fixed a bug where verbose mode was getting stuck at INFO level and would never change (Joseph Wilk)
|
15
|
+
|
16
|
+
== 0.1.0
|
17
|
+
|
18
|
+
* Changed internal representation of vector space. Using columns as documents and rows as terms. This is more consistent with LSA research papers. (Joseph Wilk)
|
19
|
+
* Wrap DMatrix in VectorSpace::Model, allowing us to store keywords with the matrix and get pretty output (Joseph Wilk)
|
data/Manifest.txt
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
History.txt
|
2
|
+
Manifest.txt
|
3
|
+
README.txt
|
4
|
+
Rakefile
|
5
|
+
TODO.txt
|
6
|
+
config/hoe.rb
|
7
|
+
config/requirements.rb
|
8
|
+
gem_tasks/deployment.rake
|
9
|
+
gem_tasks/environment.rake
|
10
|
+
gem_tasks/examples.rake
|
11
|
+
gem_tasks/fix_cr_lf.rake
|
12
|
+
gem_tasks/gemspec.rake
|
13
|
+
gem_tasks/rspec.rake
|
14
|
+
gem_tasks/website.rake
|
15
|
+
lib/semantic.rb
|
16
|
+
lib/semantic/compare.rb
|
17
|
+
lib/semantic/matrix_transformer.rb
|
18
|
+
lib/semantic/parser.rb
|
19
|
+
lib/semantic/search.rb
|
20
|
+
lib/semantic/transform.rb
|
21
|
+
lib/semantic/transform/lsa_transform.rb
|
22
|
+
lib/semantic/transform/tf_idf_transform.rb
|
23
|
+
lib/semantic/vector_space.rb
|
24
|
+
lib/semantic/vector_space/builder.rb
|
25
|
+
lib/semantic/vector_space/model.rb
|
26
|
+
lib/semantic/version.rb
|
27
|
+
resources/english.stop
|
28
|
+
rsemantic.gemspec
|
29
|
+
spec/semantic/compare_spec.rb
|
30
|
+
spec/semantic/matrix_transformer_spec.rb
|
31
|
+
spec/semantic/parser_spec.rb
|
32
|
+
spec/semantic/search_spec.rb
|
33
|
+
spec/semantic/transform/lsa_transform_spec.rb
|
34
|
+
spec/semantic/transform/tf_idf_transform_spec.rb
|
35
|
+
spec/semantic/vector_space/builder_spec.rb
|
36
|
+
spec/semantic/vector_space/model_spec.rb
|
37
|
+
spec/spec.opts
|
38
|
+
spec/spec_helper.rb
|
data/README.txt
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= Rsemantic
|
2
|
+
|
3
|
+
* http://github.com/josephwilk/rsemantic
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
A Ruby document vector search with flexible matrix transforms. Current supported transforms:
|
8
|
+
|
9
|
+
* Latent semantic analysis
|
10
|
+
* Term frequency - inverse document frequency
|
11
|
+
|
12
|
+
Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
13
|
+
|
14
|
+
== REQUIREMENTS:
|
15
|
+
|
16
|
+
* Linalg - http://rubyforge.org/projects/linalg/
|
17
|
+
* stemmer - http://rubyforge.org/projects/stemmer/
|
18
|
+
|
19
|
+
== INSTALL:
|
20
|
+
|
21
|
+
* git clone git://github.com/josephwilk/rsemantic.git
|
22
|
+
|
23
|
+
== LICENSE
|
24
|
+
|
25
|
+
(The MIT License)
|
26
|
+
|
27
|
+
Copyright (c) 2008 Joseph Wilk
|
28
|
+
|
29
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
30
|
+
a copy of this software and associated documentation files (the
|
31
|
+
'Software'), to deal in the Software without restriction, including
|
32
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
33
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
34
|
+
permit persons to whom the Software is furnished to do so, subject to
|
35
|
+
the following conditions:
|
36
|
+
|
37
|
+
The above copyright notice and this permission notice shall be
|
38
|
+
included in all copies or substantial portions of the Software.
|
39
|
+
|
40
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
41
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
42
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
43
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
44
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
45
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
46
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
47
|
+
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
ENV['NODOT'] = 'true' # We don't want class diagrams in RDoc
|
2
|
+
require 'config/requirements'
|
3
|
+
require 'config/hoe' # setup Hoe + all gem configuration
|
4
|
+
|
5
|
+
Dir['gem_tasks/**/*.rake'].each { |rake| load rake }
|
6
|
+
|
7
|
+
# Hoe gives us :default => :test, but we don't have Test::Unit tests.
|
8
|
+
Rake::Task[:default].clear_prerequisites
|
9
|
+
task :default => [:spec]
|
data/TODO.txt
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
== FEATURES/PROBLEMS:
|
2
|
+
|
3
|
+
* Applying transforms to query vectors
|
4
|
+
* Detect the optimal dimension reduction in LSA.
|
5
|
+
* Allow objects to be passed in as transforms.
|
6
|
+
* Implement Probabilistic latent semantic analysis
|
7
|
+
* Implement Latent Dirichlet Allocation
|
8
|
+
|
9
|
+
* Matrix transformer has to pop out the matrix of VectorSpace::Model and reassign it; get rid of this.
|
data/config/hoe.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'semantic/version'
|
2
|
+
|
3
|
+
AUTHOR = 'Joseph Wilk' # can also be an array of Authors
|
4
|
+
EMAIL = "josephwilk@joesniff.co.uk"
|
5
|
+
DESCRIPTION = "A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency"
|
6
|
+
GEM_NAME = 'rsemantic' # what ppl will type to install your gem
|
7
|
+
HOMEPATH = "http://github.com/josephwilk/rsemantic"
|
8
|
+
RUBYFORGE_PROJECT = 'rsemantic'
|
9
|
+
|
10
|
+
@config_file = "~/.rubyforge/user-config.yml"
|
11
|
+
@config = nil
|
12
|
+
RUBYFORGE_USERNAME = "joseph_wilk"
|
13
|
+
# Loads the RubyForge user configuration on first use and copies its
# "username" entry into RUBYFORGE_USERNAME.
#
# The YAML file named by @config_file (with "~" expanded) is read only when
# @config has not been set yet; later calls reuse the cached hash.
#
# If the file cannot be read or parsed, setup instructions are printed and
# the process exits with a non-zero status so that a calling rake task is
# reported as failed.
#
# Returns RUBYFORGE_USERNAME after it has been updated in place.
def rubyforge_username
  @config ||= begin
    YAML.load_file(File.expand_path(@config_file))
  rescue StandardError
    puts <<-EOS
ERROR: No rubyforge config file found: #{@config_file}
Run 'rubyforge setup' to prepare your env for access to Rubyforge
- See http://newgem.rubyforge.org/rubyforge.html for more details
    EOS
    exit 1
  end
  RUBYFORGE_USERNAME.replace @config["username"]
end
|
28
|
+
|
29
|
+
|
30
|
+
REV = nil
|
31
|
+
# UNCOMMENT IF REQUIRED:
|
32
|
+
# REV = YAML.load(`svn info`)['Revision']
|
33
|
+
VERS = Semantic::VERSION::STRING + (REV ? ".#{REV}" : "")
|
34
|
+
RDOC_OPTS = ['--quiet', '--title', 'Rsemantic documentation',
|
35
|
+
"--opname", "index.html",
|
36
|
+
"--line-numbers",
|
37
|
+
"--main", "README.textile",
|
38
|
+
"--inline-source"]
|
39
|
+
|
40
|
+
# Reopens Hoe to filter hoe itself out of the dependency list.
class Hoe
  # Removes every entry whose name is 'hoe' from @extra_deps in place.
  # Entries may be bare names or [name, version] pairs; Array() normalises
  # both so the name is always the first element.
  #
  # Returns the (mutated) @extra_deps array.
  def extra_deps
    # delete_if always returns the receiver, unlike reject! which returns
    # nil when nothing was removed.
    @extra_deps.delete_if { |dep| Array(dep).first == 'hoe' }
  end
end
|
46
|
+
|
47
|
+
# Generate all the Rake tasks
|
48
|
+
# Run 'rake -T' to see list of generated tasks (from gem root directory)
|
49
|
+
$hoe = Hoe.new(GEM_NAME, VERS) do |p|
|
50
|
+
p.developer(AUTHOR, EMAIL)
|
51
|
+
p.description = DESCRIPTION
|
52
|
+
p.summary = DESCRIPTION
|
53
|
+
p.url = HOMEPATH
|
54
|
+
p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
|
55
|
+
p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store', '**/*.class', '**/*.jar'] #An array of file patterns to delete on clean.
|
56
|
+
|
57
|
+
# == Optional
|
58
|
+
p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
|
59
|
+
#p.extra_deps = [] # An array of rubygem dependencies [name, version], e.g. [ ['active_support', '>= 1.3.1'] ]
|
60
|
+
p.extra_deps = [ ['term-ansicolor', '>= 1.0.3'], ['rspec', '>= 1.1.5'], ['diff-lcs', '>= 1.1.2'] ]
|
61
|
+
|
62
|
+
#p.spec_extras = {} # A hash of extra values to set in the gemspec.
|
63
|
+
|
64
|
+
end
|
65
|
+
|
66
|
+
CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
|
67
|
+
PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
|
68
|
+
$hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
|
69
|
+
$hoe.rsync_args = '-av --delete --ignore-errors'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
include FileUtils
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
%w[rake hoe].each do |req_gem|
|
6
|
+
begin
|
7
|
+
require req_gem
|
8
|
+
rescue LoadError
|
9
|
+
puts "This Rakefile requires the '#{req_gem}' RubyGem."
|
10
|
+
puts "Installation: gem install #{req_gem} -y"
|
11
|
+
exit
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
$:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
|
@@ -0,0 +1,34 @@
|
|
1
|
+
desc 'Release the website and new gem version'
|
2
|
+
task :deploy => [:check_version, :website, :release] do
|
3
|
+
puts "Remember to create SVN tag:"
|
4
|
+
puts "svn copy svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/trunk " +
|
5
|
+
"svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/tags/REL-#{VERS} "
|
6
|
+
puts "Suggested comment:"
|
7
|
+
puts "Tagging release #{CHANGES}"
|
8
|
+
end
|
9
|
+
|
10
|
+
desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
|
11
|
+
task :local_deploy => [:website_generate, :install_gem]
|
12
|
+
|
13
|
+
task :check_version do
|
14
|
+
unless ENV['VERSION']
|
15
|
+
puts 'Must pass a VERSION=x.y.z release version'
|
16
|
+
exit
|
17
|
+
end
|
18
|
+
unless ENV['VERSION'] == VERS
|
19
|
+
puts "Please update your version.rb to match the release version, currently #{VERS}"
|
20
|
+
exit
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
|
25
|
+
task :install_gem_no_doc => [:clean, :package] do
|
26
|
+
sh "#{'sudo ' unless Hoe::WINDOZE }gem install pkg/*.gem --no-rdoc --no-ri"
|
27
|
+
end
|
28
|
+
|
29
|
+
namespace :manifest do
|
30
|
+
desc 'Recreate Manifest.txt to include ALL files'
|
31
|
+
task :refresh do
|
32
|
+
`rake check_manifest | patch -p0 > Manifest.txt`
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'lib/semantic'
|
2
|
+
|
3
|
+
namespace :example do
|
4
|
+
|
5
|
+
documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]
|
6
|
+
|
7
|
+
desc "run main LSA example"
|
8
|
+
task :lsa do
|
9
|
+
search = Semantic::Search.new(documents, :verbose => true)
|
10
|
+
end
|
11
|
+
|
12
|
+
desc "run main Vector space example"
|
13
|
+
task :vector_space do
|
14
|
+
search = Semantic::Search.new(documents)
|
15
|
+
|
16
|
+
puts "Documents:"
|
17
|
+
documents.each_with_index { |document, index| puts "#{index}: #{document}" }
|
18
|
+
puts
|
19
|
+
|
20
|
+
puts "Documents related to first document: #{documents[0]}"
|
21
|
+
puts search.related(0)
|
22
|
+
puts
|
23
|
+
|
24
|
+
puts "Searching for the word cat:"
|
25
|
+
puts search.search(["cat"])
|
26
|
+
puts
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems'
|
5
|
+
require 'spec'
|
6
|
+
end
|
7
|
+
begin
|
8
|
+
require 'spec/rake/spectask'
|
9
|
+
require 'spec/rake/verify_rcov'
|
10
|
+
rescue LoadError
|
11
|
+
puts <<-EOS
|
12
|
+
To use rspec for testing you must install rspec gem:
|
13
|
+
gem install rspec
|
14
|
+
EOS
|
15
|
+
exit(0)
|
16
|
+
end
|
17
|
+
|
18
|
+
desc "Run the specs under spec/models"
|
19
|
+
Spec::Rake::SpecTask.new do |t|
|
20
|
+
t.spec_opts = ['--options', "spec/spec.opts"]
|
21
|
+
t.spec_files = FileList['spec/**/*_spec.rb']
|
22
|
+
|
23
|
+
unless ENV['NO_RCOV']
|
24
|
+
t.rcov = true
|
25
|
+
t.rcov_dir = 'coverage'
|
26
|
+
t.rcov_opts = ['--exclude', '_helper\.rb,_spec\.rb,spec\/boss,\/var\/lib\/gems,\/Library\/Ruby,\.autotest']
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
RCov::VerifyTask.new(:verify_rcov => :spec) do |t|
|
31
|
+
t.threshold = 99.7 # Make sure you have rcov 0.9 or higher!
|
32
|
+
t.index_html = 'coverage/index.html'
|
33
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
desc 'Generate website files'
|
2
|
+
task :website_generate => :ruby_env do
|
3
|
+
(Dir['website/**/*.txt'] - Dir['website/version*.txt']).each do |txt|
|
4
|
+
sh %{ #{RUBY_APP} script/txt2html #{txt} > #{txt.gsub(/txt$/,'html')} }
|
5
|
+
end
|
6
|
+
end
|
7
|
+
|
8
|
+
desc 'Upload website files to rubyforge'
|
9
|
+
task :website_upload do
|
10
|
+
host = "#{rubyforge_username}@rubyforge.org"
|
11
|
+
remote_dir = "/var/www/gforge-projects/#{PATH}/"
|
12
|
+
local_dir = 'website'
|
13
|
+
sh %{rsync -aCv #{local_dir}/ #{host}:#{remote_dir}}
|
14
|
+
end
|
15
|
+
|
16
|
+
desc 'Generate and upload website files'
|
17
|
+
task :website => [:website_generate, :website_upload, :publish_docs]
|
data/lib/semantic.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
|
4
|
+
require "semantic/vector_space"
|
5
|
+
require "semantic/compare"
|
6
|
+
require "semantic/parser"
|
7
|
+
require "semantic/matrix_transformer"
|
8
|
+
require "semantic/search"
|
9
|
+
require "semantic/transform"
|
10
|
+
require "semantic/version"
|
11
|
+
|
12
|
+
require 'rubygems'
|
13
|
+
require 'linalg'
|
14
|
+
#http://rubyforge.org/projects/stemmer/
|
15
|
+
#A processor for removing the commoner morphological and inflexional endings from words in English
|
16
|
+
require 'stemmer'
|
17
|
+
require 'logger'
|
18
|
+
|
19
|
+
module Semantic
  class << self
    # Lets callers install their own logger (anything responding to the
    # Logger interface used by the library, e.g. #info and #error).
    attr_writer :logger

    # Returns the logger used by the library.
    #
    # When none has been assigned, a Logger writing to STDOUT is created on
    # first access. It prints only the bare message followed by a newline
    # and is set to Logger::ERROR.
    def logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        log.formatter = proc { |_severity, _time, _progname, msg| "#{msg}\n" }
        log.level = Logger::ERROR
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Semantic
  # Similarity measures between two vectors.
  #
  # Vectors only need to respond to #dot(other) and #norm.
  class Compare
    class << self
      # Returns the similarity of the two vectors; currently the cosine
      # measure.
      def similarity(vector1, vector2)
        cosine(vector1, vector2)
      end

      # Cosine of the angle between the two vectors: their dot product
      # divided by the product of their norms.
      #
      # Returns nil when either vector is nil.
      def cosine(vector1, vector2)
        return if vector1.nil? || vector2.nil?

        vector2.dot(vector1) / (vector1.norm * vector2.norm)
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Semantic
  # Applies a configurable sequence of transforms to a vector space model.
  class MatrixTransformer
    # options - Hash of settings. :transforms is a list of constant names
    #           looked up under Semantic::Transform; it defaults to
    #           [:TFIDF, :LSA]. The whole hash is kept in @options.
    def initialize(options={})
      @transforms = options[:transforms] || [:TFIDF, :LSA]
      @options = options
    end

    # Runs each configured transform in order, replacing the model's matrix
    # with the transform's result.
    #
    # A transform whose class does not respond to .transform leaves the
    # matrix untouched. A transform that cannot be resolved or that raises
    # is logged at error level and skipped; the remaining transforms still
    # run.
    #
    # vector_space_model - object exposing #matrix and #matrix=.
    #
    # Returns the same vector_space_model instance.
    def apply_transforms(vector_space_model)
      @transforms.each do |transform|
        begin
          transform_class = Semantic::Transform.const_get(transform)
          Semantic.logger.info("Applying #{transform} transform")
          if transform_class.respond_to?(:transform)
            vector_space_model.matrix = transform_class.transform(vector_space_model.matrix)
          end
          Semantic.logger.info(vector_space_model)
        # StandardError only: Interrupt, SystemExit and similar must reach
        # the caller instead of being logged and ignored.
        rescue StandardError => e
          Semantic.logger.error("Error: Cannot perform transform: #{transform}")
          Semantic.logger.error(e)
        end
      end
      vector_space_model
    end
  end
end
|