rsemantic 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,19 @@
1
+ == 0.1.3
2
+
3
+ = Bugs
4
+ * Fixed bug with keyword formatting
5
+
6
+ == 0.1.2
7
+
8
+ = Bugs
9
+ * Fixed problem where LSA was raising errors when trying to reduce dimensions of non-square matrices. (Joseph Wilk)
10
+
11
+ == 0.1.1
12
+
13
+ = Bugs
14
+ * fixed a bug where verbose mode was getting stuck at INFO level and would never change (Joseph Wilk)
15
+
16
+ == 0.1.0
17
+
18
+ * Changed internal representation of vector space. Using columns as documents and rows as terms. This is more consistent with LSA research papers. (Joseph Wilk)
19
+ * Wrap DMatrix in VectorSpace::Model, allowing us to store keywords with the matrix and get pretty output (Joseph Wilk)
data/Manifest.txt ADDED
@@ -0,0 +1,38 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ TODO.txt
6
+ config/hoe.rb
7
+ config/requirements.rb
8
+ gem_tasks/deployment.rake
9
+ gem_tasks/environment.rake
10
+ gem_tasks/examples.rake
11
+ gem_tasks/fix_cr_lf.rake
12
+ gem_tasks/gemspec.rake
13
+ gem_tasks/rspec.rake
14
+ gem_tasks/website.rake
15
+ lib/semantic.rb
16
+ lib/semantic/compare.rb
17
+ lib/semantic/matrix_transformer.rb
18
+ lib/semantic/parser.rb
19
+ lib/semantic/search.rb
20
+ lib/semantic/transform.rb
21
+ lib/semantic/transform/lsa_transform.rb
22
+ lib/semantic/transform/tf_idf_transform.rb
23
+ lib/semantic/vector_space.rb
24
+ lib/semantic/vector_space/builder.rb
25
+ lib/semantic/vector_space/model.rb
26
+ lib/semantic/version.rb
27
+ resources/english.stop
28
+ rsemantic.gemspec
29
+ spec/semantic/compare_spec.rb
30
+ spec/semantic/matrix_transformer_spec.rb
31
+ spec/semantic/parser_spec.rb
32
+ spec/semantic/search_spec.rb
33
+ spec/semantic/transform/lsa_transform_spec.rb
34
+ spec/semantic/transform/tf_idf_transform_spec.rb
35
+ spec/semantic/vector_space/builder_spec.rb
36
+ spec/semantic/vector_space/model_spec.rb
37
+ spec/spec.opts
38
+ spec/spec_helper.rb
data/README.txt ADDED
@@ -0,0 +1,48 @@
1
+ = Rsemantic
2
+
3
+ * http://github.com/josephwilk/rsemantic
4
+
5
+ == DESCRIPTION:
6
+
7
+ A Ruby document vector search with flexible matrix transforms. Currently supported transforms:
8
+
9
+ * Latent semantic analysis
10
+ * Term frequency - inverse document frequency
11
+
12
+ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
13
+
14
+ == REQUIREMENTS:
15
+
16
+ * Linalg - http://rubyforge.org/projects/linalg/
17
+ * stemmer - http://rubyforge.org/projects/stemmer/
18
+
19
+ == INSTALL:
20
+
21
+ * git clone git://github.com/josephwilk/rsemantic.git
22
+
23
+ == LICENSE
24
+
25
+ (The MIT License)
26
+
27
+ Copyright (c) 2008 Joseph Wilk
28
+
29
+ Permission is hereby granted, free of charge, to any person obtaining
30
+ a copy of this software and associated documentation files (the
31
+ 'Software'), to deal in the Software without restriction, including
32
+ without limitation the rights to use, copy, modify, merge, publish,
33
+ distribute, sublicense, and/or sell copies of the Software, and to
34
+ permit persons to whom the Software is furnished to do so, subject to
35
+ the following conditions:
36
+
37
+ The above copyright notice and this permission notice shall be
38
+ included in all copies or substantial portions of the Software.
39
+
40
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
41
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
43
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
44
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
45
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
46
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
47
+
48
+
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
# Rakefile for the rsemantic gem: loads the shared requirements and the Hoe
# configuration, pulls in every task file under gem_tasks/, and makes the
# spec suite the default task.
ENV['NODOT'] = 'true' # We don't want class diagrams in RDoc

# Resolve the config files relative to this Rakefile. Ruby 1.9.2 and later no
# longer keep the current directory on $LOAD_PATH, so a bare
# require 'config/...' raises LoadError there.
rakefile_dir = File.expand_path(File.dirname(__FILE__))
require File.join(rakefile_dir, 'config', 'requirements')
require File.join(rakefile_dir, 'config', 'hoe') # setup Hoe + all gem configuration

Dir['gem_tasks/**/*.rake'].each { |rake| load rake }

# Hoe gives us :default => :test, but we don't have Test::Unit tests.
Rake::Task[:default].clear_prerequisites
task :default => [:spec]
data/TODO.txt ADDED
@@ -0,0 +1,9 @@
1
+ == FEATURES/PROBLEMS:
2
+
3
+ * Applying transforms to query vectors
4
+ * Detect the optimal dimension reduction in LSA.
5
+ * Allow objects to be passed in as transforms.
6
+ * Implement Probabilistic latent semantic analysis
7
+ * Implement Latent Dirichlet Allocation
8
+
9
+ * Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
data/config/hoe.rb ADDED
@@ -0,0 +1,69 @@
1
+ require 'semantic/version'
2
+
3
# Gem metadata shared by the Hoe specification and the deployment tasks.
AUTHOR = 'Joseph Wilk' # can also be an array of Authors
EMAIL = "josephwilk@joesniff.co.uk"
DESCRIPTION = "A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency"
GEM_NAME = 'rsemantic' # what ppl will type to install your gem
HOMEPATH = "http://github.com/josephwilk/rsemantic"
RUBYFORGE_PROJECT = 'rsemantic'

# rubyforge_username parses YAML; require it here instead of relying on some
# other library having loaded it first.
require 'yaml'

@config_file = "~/.rubyforge/user-config.yml"
@config = nil
# Default account name. It is overwritten in place by rubyforge_username, so
# it must be a mutable (non-frozen) string.
RUBYFORGE_USERNAME = "joseph_wilk".dup

# Returns the RubyForge account name to use for deployment.
#
# On first use the YAML file named by @config_file is read and cached in
# @config. When it holds a "username" entry, RUBYFORGE_USERNAME is updated in
# place with that value; otherwise the default is kept.
#
# If the file cannot be read, a setup hint is printed and the process exits
# with status 1. Only file-system errors are treated this way: a malformed
# YAML file raises its own parse error instead of being misreported as a
# missing file.
def rubyforge_username
  unless @config
    begin
      @config = YAML.load(File.read(File.expand_path(@config_file)))
    rescue SystemCallError
      puts <<-EOS
ERROR: No rubyforge config file found: #{@config_file}
Run 'rubyforge setup' to prepare your env for access to Rubyforge
- See http://newgem.rubyforge.org/rubyforge.html for more details
      EOS
      exit 1
    end
  end
  # An empty or non-mapping config file has no username to offer.
  username = @config["username"] if @config.is_a?(Hash)
  RUBYFORGE_USERNAME.replace(username.to_s) if username
  RUBYFORGE_USERNAME
end
28
+
29
+
30
REV = nil
# UNCOMMENT IF REQUIRED:
# REV = YAML.load(`svn info`)['Revision']
# Gem version string; the revision suffix is only appended when REV is set.
VERS = Semantic::VERSION::STRING + (REV ? ".#{REV}" : "")
# RDoc command-line options. --main points at README.txt, the README that
# Manifest.txt actually ships (there is no README.textile in the manifest).
RDOC_OPTS = ['--quiet', '--title', 'Rsemantic documentation',
  "--opname", "index.html",
  "--line-numbers",
  "--main", "README.txt",
  "--inline-source"]

# Reopens Hoe so that the 'hoe' gem itself is dropped from the dependency
# list it reports.
class Hoe
  def extra_deps
    @extra_deps.reject! { |x| Array(x).first == 'hoe' }
    @extra_deps
  end
end

# Generate all the Rake tasks
# Run 'rake -T' to see list of generated tasks (from gem root directory)
$hoe = Hoe.new(GEM_NAME, VERS) do |p|
  p.developer(AUTHOR, EMAIL)
  p.description = DESCRIPTION
  p.summary = DESCRIPTION
  p.url = HOMEPATH
  p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
  p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store', '**/*.class', '**/*.jar'] #An array of file patterns to delete on clean.

  # == Optional
  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
  #p.extra_deps = [] # An array of rubygem dependencies [name, version], e.g. [ ['active_support', '>= 1.3.1'] ]
  p.extra_deps = [ ['term-ansicolor', '>= 1.0.3'], ['rspec', '>= 1.1.5'], ['diff-lcs', '>= 1.1.2'] ]

  #p.spec_extras = {} # A hash of extra values to set in the gemspec.

end

# Note the double backslashes: the paragraphs are joined with a literal
# backslash-n sequence, not real newlines.
CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
$hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
$hoe.rsync_args = '-av --delete --ignore-errors'
@@ -0,0 +1,15 @@
1
# Shared bootstrap for the Rakefile: loads the gems the build depends on and
# puts the project's lib/ directory on the load path.
require 'fileutils'
include FileUtils

require 'rubygems'
%w[rake hoe].each do |req_gem|
  begin
    require req_gem
  rescue LoadError
    puts "This Rakefile requires the '#{req_gem}' RubyGem."
    # The old -y flag is omitted: current RubyGems installs dependencies by
    # default and no longer accepts it.
    puts "Installation: gem install #{req_gem}"
    # A missing prerequisite is a failure, so report it through the exit status.
    exit 1
  end
end

$:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
@@ -0,0 +1,34 @@
1
desc 'Release the website and new gem version'
task :deploy => [:check_version, :website, :release] do
  puts "Remember to create SVN tag:"
  puts "svn copy svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/trunk " +
    "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/tags/REL-#{VERS} "
  puts "Suggested comment:"
  puts "Tagging release #{CHANGES}"
end

desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
task :local_deploy => [:website_generate, :install_gem]

# Guards a release: VERSION must be given in the environment and must equal
# VERS. Both failures exit with a non-zero status so that a calling script can
# tell the release was refused.
task :check_version do
  unless ENV['VERSION']
    puts 'Must pass a VERSION=x.y.z release version'
    exit 1
  end
  unless ENV['VERSION'] == VERS
    puts "Please update your version.rb to match the release version, currently #{VERS}"
    exit 1
  end
end

desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
task :install_gem_no_doc => [:clean, :package] do
  # sudo is prepended unless Hoe::WINDOZE is set.
  sh "#{'sudo ' unless Hoe::WINDOZE }gem install pkg/*.gem --no-rdoc --no-ri"
end

namespace :manifest do
  desc 'Recreate Manifest.txt to include ALL files'
  task :refresh do
    # NOTE(review): backticks discard both the output and the exit status of
    # this pipeline, so a failed refresh goes unnoticed - confirm that is intended.
    `rake check_manifest | patch -p0 > Manifest.txt`
  end
end
@@ -0,0 +1,7 @@
1
# Defines RUBY_APP, the interpreter command name, the first time the task
# runs: "jruby" when RUBY_PLATFORM mentions java, "ruby" otherwise.
task :ruby_env do
  unless defined? RUBY_APP
    RUBY_APP = (RUBY_PLATFORM =~ /java/) ? "jruby" : "ruby"
  end
end
@@ -0,0 +1,29 @@
1
# Load the library relative to this task file instead of the current
# directory, which is not on $LOAD_PATH from Ruby 1.9.2 onwards.
require File.expand_path('../lib/semantic', File.dirname(__FILE__))

namespace :example do

  # Sample corpus shared by both example tasks.
  documents = ["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."]

  desc "run main LSA example"
  task :lsa do
    # Only construction is exercised here; the result is not used.
    # NOTE(review): presumably :verbose makes the search log its work - confirm
    # in Semantic::Search.
    Semantic::Search.new(documents, :verbose => true)
  end

  desc "run main Vector space example"
  task :vector_space do
    search = Semantic::Search.new(documents)

    puts "Documents:"
    documents.each_with_index { |document, index| puts "#{index}: #{document}" }
    puts

    puts "Documents related to first document: #{documents[0]}"
    puts search.related(0)
    puts

    puts "Searching for the word cat:"
    puts search.search(["cat"])
    puts
  end

end
@@ -0,0 +1,10 @@
1
desc 'Make all files use UNIX (\n) line endings'
task :fix_cr_lf do
  FileList['**/*'].each do |path|
    # Only regular files can be read and rewritten.
    next unless File.file?(path)

    # Binary mode on both sides: in text mode Windows converts "\n" back to
    # "\r\n" on write, which would undo the conversion.
    original = File.open(path, 'rb') { |io| io.read }

    # Heuristic: a NUL byte marks binary content (packaged gems, images, ...),
    # which must not have its bytes rewritten.
    next if original.include?("\0")

    converted = original.gsub(/\r\n/, "\n")
    # Leave files that already use UNIX line endings untouched.
    next if converted == original

    File.open(path, 'wb') { |io| io.write(converted) }
  end
end
@@ -0,0 +1,6 @@
1
namespace :gemspec do
  desc 'Refresh rsemantic.gemspec to include ALL files'
  # Runs manifest:refresh first, then writes the Ruby source of the Hoe
  # generated specification to rsemantic.gemspec.
  task :refresh => 'manifest:refresh' do
    File.open('rsemantic.gemspec', 'w') do |gemspec_file|
      gemspec_file.write($hoe.spec.to_ruby)
    end
  end
end
@@ -0,0 +1,33 @@
1
# Load rspec and its rake integration. Everything sits inside one rescue so
# that a missing rspec gem always produces the install hint; previously the
# retried require 'spec' was unguarded and raised a raw LoadError.
begin
  begin
    require 'spec'
  rescue LoadError
    require 'rubygems'
    require 'spec'
  end
  require 'spec/rake/spectask'
  require 'spec/rake/verify_rcov'
rescue LoadError
  puts <<-EOS
To use rspec for testing you must install rspec gem:
gem install rspec
  EOS
  exit(0)
end

desc "Run all specs under spec/"
Spec::Rake::SpecTask.new do |t|
  t.spec_opts = ['--options', "spec/spec.opts"]
  t.spec_files = FileList['spec/**/*_spec.rb']

  # Coverage is collected unless NO_RCOV is set in the environment.
  unless ENV['NO_RCOV']
    t.rcov = true
    t.rcov_dir = 'coverage'
    t.rcov_opts = ['--exclude', '_helper\.rb,_spec\.rb,spec\/boss,\/var\/lib\/gems,\/Library\/Ruby,\.autotest']
  end
end

# Runs the specs, then checks the coverage report against the threshold.
RCov::VerifyTask.new(:verify_rcov => :spec) do |t|
  t.threshold = 99.7 # Make sure you have rcov 0.9 or higher!
  t.index_html = 'coverage/index.html'
end
@@ -0,0 +1,17 @@
1
desc 'Generate website files'
task :website_generate => :ruby_env do
  # Every .txt page under website/ except the version*.txt files.
  pages = Dir['website/**/*.txt'] - Dir['website/version*.txt']
  pages.each do |txt|
    html = txt.gsub(/txt$/,'html')
    sh %{ #{RUBY_APP} script/txt2html #{txt} > #{html} }
  end
end

desc 'Upload website files to rubyforge'
task :website_upload do
  source = 'website'
  account = "#{rubyforge_username}@rubyforge.org"
  destination = "/var/www/gforge-projects/#{PATH}/"
  sh %{rsync -aCv #{source}/ #{account}:#{destination}}
end

desc 'Generate and upload website files'
task :website => [:website_generate, :website_upload, :publish_docs]
data/lib/semantic.rb ADDED
@@ -0,0 +1,33 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require "semantic/vector_space"
5
+ require "semantic/compare"
6
+ require "semantic/parser"
7
+ require "semantic/matrix_transformer"
8
+ require "semantic/search"
9
+ require "semantic/transform"
10
+ require "semantic/version"
11
+
12
+ require 'rubygems'
13
+ require 'linalg'
14
+ #http://rubyforge.org/projects/stemmer/
15
+ #A processor for removing the commoner morphological and inflexional endings from words in English
16
+ require 'stemmer'
17
+ require 'logger'
18
+
19
# Top-level namespace of the library; it also owns the shared logger.
module Semantic

  class << self
    # Lets callers substitute their own logger object.
    attr_writer :logger

    # Returns the logger shared by the library, building the default one on
    # first use.
    def logger
      @logger ||= build_default_logger
    end

    private

    # Default logger: writes to STDOUT, prints only the message text followed
    # by a newline, and starts at Logger::ERROR.
    def build_default_logger
      default = Logger.new(STDOUT)
      default.formatter = proc { |_severity, _time, _progname, msg| "#{msg}\n" }
      default.level = Logger::ERROR
      default
    end
  end

end
@@ -0,0 +1,19 @@
1
module Semantic
  # Similarity measures between two vectors. The vectors only need to respond
  # to +dot+ and +norm+.
  class Compare

    class << self

      # Similarity of two vectors; currently their cosine.
      def similarity(vector1, vector2)
        cosine(vector1, vector2)
      end

      # Cosine of the angle between two vectors: their dot product divided by
      # the product of their norms.
      #
      # Returns nil when either vector is nil, and 0.0 when either vector has
      # zero magnitude (the cosine is undefined there; the plain division
      # would give NaN or raise ZeroDivisionError).
      def cosine(vector1, vector2)
        return nil if vector1.nil? || vector2.nil?

        magnitude = vector1.norm * vector2.norm
        return 0.0 if magnitude.zero?

        vector2.dot(vector1) / magnitude
      end

    end

  end
end
@@ -0,0 +1,25 @@
1
module Semantic
  # Applies a configurable sequence of matrix transforms to a vector space
  # model.
  class MatrixTransformer

    # options[:transforms] is the list of constant names, looked up under
    # Semantic::Transform, to apply in order. It defaults to [:TFIDF, :LSA].
    # The whole options hash is also kept in @options.
    def initialize(options={})
      @transforms = options[:transforms] || [:TFIDF, :LSA]
      @options = options
    end

    # Runs each configured transform over vector_space_model.matrix, writing
    # the result back to the model, and returns the model.
    #
    # A transform that cannot be resolved or that raises is logged and
    # skipped, and the remaining transforms still run. Only StandardError is
    # handled this way, so SystemExit, Interrupt and similar propagate to the
    # caller.
    def apply_transforms(vector_space_model)
      @transforms.each do |transform|
        begin
          transform_class = Semantic::Transform.const_get(transform)
          Semantic.logger.info("Applying #{transform} transform")
          # Classes without a public transform method are skipped silently.
          if transform_class.respond_to?(:transform)
            vector_space_model.matrix = transform_class.transform(vector_space_model.matrix)
          end
          Semantic.logger.info(vector_space_model)
        rescue StandardError => e
          Semantic.logger.error("Error: Cannot perform transform: #{transform}")
          Semantic.logger.error(e)
        end
      end
      vector_space_model
    end

  end
end