rsemantic 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/{README.txt → README.md} +19 -10
  2. data/lib/semantic.rb +8 -5
  3. data/lib/semantic/compare.rb +4 -1
  4. data/lib/semantic/corpus.rb +61 -0
  5. data/lib/semantic/document.rb +39 -0
  6. data/lib/semantic/matrix_transformer.rb +4 -5
  7. data/lib/semantic/parser.rb +22 -10
  8. data/lib/semantic/search.rb +22 -16
  9. data/lib/semantic/search_result.rb +16 -0
  10. data/lib/semantic/transform/lsa_transform.rb +47 -22
  11. data/lib/semantic/transform/tf_idf_transform.rb +12 -23
  12. data/lib/semantic/vector_space/builder.rb +29 -22
  13. data/lib/semantic/vector_space/model.rb +14 -13
  14. data/lib/semantic/version.rb +1 -1
  15. data/lib/tasks/rspec.rake +13 -0
  16. metadata +75 -107
  17. data/Manifest.txt +0 -38
  18. data/Rakefile +0 -9
  19. data/config/hoe.rb +0 -69
  20. data/config/requirements.rb +0 -15
  21. data/gem_tasks/deployment.rake +0 -34
  22. data/gem_tasks/environment.rake +0 -7
  23. data/gem_tasks/examples.rake +0 -29
  24. data/gem_tasks/fix_cr_lf.rake +0 -10
  25. data/gem_tasks/gemspec.rake +0 -6
  26. data/gem_tasks/rspec.rake +0 -33
  27. data/gem_tasks/website.rake +0 -17
  28. data/rsemantic.gemspec +0 -41
  29. data/spec/semantic/compare_spec.rb +0 -16
  30. data/spec/semantic/matrix_transformer_spec.rb +0 -51
  31. data/spec/semantic/parser_spec.rb +0 -34
  32. data/spec/semantic/search_spec.rb +0 -129
  33. data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
  34. data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
  35. data/spec/semantic/vector_space/builder_spec.rb +0 -44
  36. data/spec/semantic/vector_space/model_spec.rb +0 -22
  37. data/spec/spec.opts +0 -2
  38. data/spec/spec_helper.rb +0 -7
@@ -1,8 +1,6 @@
1
- = Rsemantic
1
+ # Rsemantic
2
2
 
3
- * http://github.com/josephwilk/rsemantic
4
-
5
- == DESCRIPTION:
3
+ [![Build Status](https://secure.travis-ci.org/josephwilk/rsemantic.png?branch=master)](http://travis-ci.org/josephwilk/rsemantic)
6
4
 
7
5
  A Ruby document vector search with flexible matrix transforms. Current supported transforms:
8
6
 
@@ -11,20 +9,31 @@ A Ruby document vector search with flexible matrix transforms. Current supported
11
9
 
12
10
  Documentation: http://github.com/josephwilk/rsemantic/wikis/home
13
11
 
14
- == REQUIREMENTS:
12
+ ## Requirements:
15
13
 
16
- * Linalg - http://rubyforge.org/projects/linalg/
14
+ * GSL - http://www.gnu.org/software/gsl
17
15
  * stemmer - http://rubyforge.org/projects/stemmer/
18
16
 
19
- == INSTALL:
17
+ ## INSTALL:
18
+
19
+ Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
20
+ See here for details on how to do that with homebrew: http://bretthard.in/2012/03/getting-related_posts-lsi-and-gsl-to-work-in-jekyll/
21
+
22
+ <pre><code>git clone git://github.com/josephwilk/rsemantic.git
23
+ cd rsemantic
24
+ brew install GSL
25
+ bundle install
26
+ </code></pre>
20
27
 
21
- * git clone git://github.com/josephwilk/rsemantic.git
28
+ ## Contributors
29
+ * @josephwilk
30
+ * @dominikhonnef
22
31
 
23
- == LICENSE
32
+ ## LICENSE
24
33
 
25
34
  (The MIT License)
26
35
 
27
- Copyright (c) 2008 Joseph Wilk
36
+ Copyright (c) 2008-2012 Joseph Wilk
28
37
 
29
38
  Permission is hereby granted, free of charge, to any person obtaining
30
39
  a copy of this software and associated documentation files (the
data/lib/semantic.rb CHANGED
@@ -9,10 +9,13 @@ require "semantic/search"
9
9
  require "semantic/transform"
10
10
  require "semantic/version"
11
11
 
12
+ require "semantic/corpus"
13
+ require "semantic/document"
14
+ require "semantic/search_result"
15
+
12
16
  require 'rubygems'
13
- require 'linalg'
14
- #http://rubyforge.org/projects/stemmer/
15
- #A processor for removing the commoner morphological and inflexional endings from words in English
17
+ require 'gsl'
18
+
16
19
  require 'stemmer'
17
20
  require 'logger'
18
21
 
@@ -21,7 +24,7 @@ module Semantic
21
24
  class << self
22
25
  attr_writer :logger
23
26
  end
24
-
27
+
25
28
  def self.logger
26
29
  return @logger if @logger
27
30
  @logger = Logger.new(STDOUT)
@@ -29,5 +32,5 @@ module Semantic
29
32
  @logger.level = Logger::ERROR
30
33
  @logger
31
34
  end
32
-
35
+
33
36
  end
@@ -9,7 +9,10 @@ module Semantic
9
9
 
10
10
  def cosine(vector1, vector2)
11
11
  unless vector2.nil? or vector1.nil?
12
- (vector2.dot(vector1)) / (vector1.norm * vector2.norm)
12
+ v1 = vector1.row
13
+ v2 = vector2
14
+ score = (v1 * v2) / (vector1.norm * vector2.norm)
15
+ score.nan? ? 0.0 : score
13
16
  end
14
17
  end
15
18
 
@@ -0,0 +1,61 @@
1
+ module Semantic
2
+ class Corpus
3
+ # @return [Array<Document>]
4
+ attr_reader :documents
5
+
6
+ # @param [Array<Document>] documents The {Document documents} to
7
+ # index
8
+ # @param [Hash] options
9
+ # TODO document options
10
+ def initialize(documents = [], options = {})
11
+ @documents = documents
12
+ @options = options
13
+ @search = nil
14
+ end
15
+
16
+ # Adds a new {Document document} to the index.
17
+ #
18
+ # @param [Document] document
19
+ # @return [void]
20
+ def add_document(document)
21
+ @documents << document
22
+ document.corpora << self
23
+ end
24
+ alias_method :<<, :add_document
25
+
26
+ # Build the index. This is required to be able to search for words
27
+ # or compute related documents.
28
+ #
29
+ # If you add new documents, you have to rebuild the index.
30
+ #
31
+ # @return [void]
32
+ def build_index
33
+ @search = Semantic::Search.new(@documents.map(&:text), @options)
34
+ end
35
+
36
+ def search(*words)
37
+ # TODO raise if no index built yet
38
+ results = @search.search(words)
39
+ results.map.with_index { |result, index|
40
+ document = @documents[index]
41
+ Semantic::SearchResult.new(document, result)
42
+ }.sort
43
+ end
44
+
45
+ def find_related_document(document)
46
+ @search.related(@documents.index(document)).map.with_index { |result, index|
47
+ document = @documents[index]
48
+ Semantic::SearchResult.new(document, result)
49
+ }.sort
50
+ end
51
+
52
+ def find_keywords(document, num = 5)
53
+ # TODO allow limiting keywords to words that occur in this document
54
+
55
+ end
56
+
57
+ def to_s
58
+ "#<%s %d documents, @options=%s>" % [self.class.name, @documents.size, @options.inspect]
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,39 @@
1
+ module Semantic
2
+ class Document
3
+ attr_reader :text
4
+ attr_reader :attributes
5
+ attr_reader :corpora
6
+ def initialize(text, attributes = {})
7
+ if text.respond_to?(:read)
8
+ @text = text.read
9
+ else
10
+ @text = text
11
+ end
12
+
13
+ @attributes = attributes
14
+ @corpora = []
15
+ end
16
+
17
+ def to_s
18
+ "#<%s @attributes=%s>" % [self.class.name, @attributes.inspect]
19
+ end
20
+
21
+ def [](key)
22
+ @attributes[key]
23
+ end
24
+
25
+ # @todo document that it has to be part of at least one corpus
26
+ def related
27
+ results = {}
28
+ @corpora.each do |corpus|
29
+ results[corpus] = corpus.find_related_document(self)
30
+ end
31
+
32
+ results
33
+ end
34
+
35
+ def keywords(num = 5)
36
+
37
+ end
38
+ end
39
+ end
@@ -1,9 +1,8 @@
1
1
  module Semantic
2
2
  class MatrixTransformer
3
3
 
4
- def initialize(options={})
5
- @transforms = options[:transforms] || [:TFIDF, :LSA]
6
- @options = options
4
+ def initialize(transforms)
5
+ @transforms = transforms
7
6
  end
8
7
 
9
8
  def apply_transforms(vector_space_model)
@@ -11,9 +10,9 @@ module Semantic
11
10
  begin
12
11
  transform_class = Semantic::Transform.const_get(transform)
13
12
  Semantic.logger.info("Applying #{transform} transform")
14
- vector_space_model.matrix = transform_class.send(:transform, vector_space_model.matrix) if transform_class.respond_to?(:transform)
13
+ transform_class.transform!(vector_space_model.matrix)
15
14
  Semantic.logger.info(vector_space_model)
16
- rescue Exception => e
15
+ rescue => e
17
16
  Semantic.logger.error("Error: Cannot perform transform: #{transform}")
18
17
  Semantic.logger.error(e)
19
18
  end
@@ -1,13 +1,17 @@
1
1
  require 'stemmer'
2
-
2
+ require "set"
3
3
  module Semantic
4
4
  class Parser
5
5
 
6
- def initialize
7
- #English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
8
- #TODO: nicer way to reference stop file location?
9
- File.open(File.dirname(__FILE__)+'/../../resources/english.stop', 'r') do |file|
10
- @stopwords = file.read().split()
6
+ def initialize(options = {})
7
+ # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
8
+ # TODO: nicer way to reference stop file location?
9
+ @filter_stop_words = options[:filter_stop_words]
10
+ @stem_words = options[:stem_words]
11
+ if @filter_stop_words
12
+ File.open(File.dirname(__FILE__) + '/../../resources/english.stop', 'r') do |file|
13
+ @stopwords = Set.new(file.read().split())
14
+ end
11
15
  end
12
16
  end
13
17
 
@@ -16,7 +20,7 @@ module Semantic
16
20
  remove_stop_words(word_list)
17
21
  end
18
22
 
19
- #remove any nasty grammar tokens from string
23
+ # remove any nasty grammar tokens from string
20
24
  def clean(string)
21
25
  string = string.gsub(".","")
22
26
  string = string.gsub(/\s+/," ")
@@ -24,16 +28,24 @@ module Semantic
24
28
  return string
25
29
  end
26
30
 
27
- #stop words are common words which have no search value
31
+ # stop words are common words which have no search value
28
32
  def remove_stop_words(list)
29
- list.select {|word| word unless @stopwords.include? word }
33
+ if @filter_stop_words
34
+ list.select {|word| !@stopwords.include?(word) }
35
+ else
36
+ list
37
+ end
30
38
  end
31
39
 
32
40
  def tokenise_and_stem(string)
33
41
  string = clean(string)
34
42
  words = string.split(" ")
35
43
 
36
- words.map {|word| word.stem }
44
+ if @stem_words
45
+ words.map {|word| Stemmer.stem_word(word) }
46
+ else
47
+ words
48
+ end
37
49
  end
38
50
 
39
51
  end
@@ -1,35 +1,41 @@
1
1
  module Semantic
2
2
  class Search
3
3
 
4
- def initialize(documents, options={})
5
- Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
6
-
7
- @builder = VectorSpace::Builder.new(options)
8
- @matrix_transformer = MatrixTransformer.new(options)
4
+ def initialize(documents, options = {})
5
+ options = {
6
+ :transforms => [:TFIDF, :LSA],
7
+ :verbose => false,
8
+ :filter_stop_words => true,
9
+ :stem_words => true,
10
+ }.merge(options)
11
+ Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
12
+
13
+
14
+ @builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
15
+ @matrix_transformer = MatrixTransformer.new(options[:transforms])
9
16
 
10
17
  @vector_space_model = @builder.build_document_matrix(documents)
11
-
18
+
12
19
  Semantic.logger.info(@vector_space_model)
13
-
20
+
14
21
  @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
15
22
  end
16
-
17
- def related(documentId)
23
+
24
+ def related(document_id)
18
25
  ratings = []
19
- for index in (0...@vector_space_model.ncol)
20
- ratings << Compare.similarity(@vector_space_model.column(documentId), @vector_space_model.column(index))
26
+ @vector_space_model.each_column do |column|
27
+ ratings << Compare.similarity(@vector_space_model.column(document_id), column)
21
28
  end
22
29
  ratings
23
30
  end
24
31
 
25
- def search(searchList)
32
+ def search(search_list)
26
33
  ratings = []
27
- query_vector = @builder.build_query_vector(searchList)
28
- for index in (0...@vector_space_model.ncol)
29
- ratings << Compare.similarity(query_vector, @vector_space_model.column(index))
34
+ query_vector = @builder.build_query_vector(search_list)
35
+ @vector_space_model.each_column do |column|
36
+ ratings << Compare.similarity(query_vector.col, column)
30
37
  end
31
38
  ratings
32
39
  end
33
-
34
40
  end
35
41
  end
@@ -0,0 +1,16 @@
1
+ module Semantic
2
+ class SearchResult
3
+ include Comparable
4
+
5
+ attr_reader :document
6
+ attr_reader :score
7
+ def initialize(document, score)
8
+ @document = document
9
+ @score = score
10
+ end
11
+
12
+ def <=>(other)
13
+ @score <=> other.score
14
+ end
15
+ end
16
+ end
@@ -4,39 +4,64 @@ module Semantic
4
4
 
5
5
  class << self
6
6
 
7
- def transform(matrix, number_of_dimensions_to_reduce = 1)
8
- columns = matrix.num_columns
7
+ def transform!(matrix, rank = nil)
8
+ # TODO configurable rank
9
+ columns = matrix.size2
9
10
 
10
- if number_of_dimensions_to_reduce <= columns #Its a valid reduction
11
- u, sigma, vt = matrix.singular_value_decomposition
11
+ u, v, sigma = matrix.SV_decomp_mod
12
+ reduce_dimensions!(sigma, rank)
13
+ sigma = GSL::Matrix.diagonal(sigma)
12
14
 
13
- sigma_prime = reduce_dimensions(number_of_dimensions_to_reduce, sigma)
15
+ GSL::Matrix.swap(matrix, u * sigma * v.transpose)
16
+ end
14
17
 
15
- matrix_prime = u * sigma_prime * vt
18
+ private
19
+ def reduce_dimensions!(vector, rank)
20
+ # the vector is already sorted (biggest to smallest), so we
21
+ # only have to zero the elements we do not want
22
+
23
+ if rank.nil?
24
+ rank = determine_rank(vector)
16
25
  else
17
- raise Exception, "dimension reduction cannot be greater than %s" % columns
26
+ rank = valid_rank(vector, rank)
18
27
  end
19
-
20
- matrix_prime
28
+
29
+ num_to_zero_out = vector.size - rank
30
+ vector[rank, num_to_zero_out] = 0
21
31
  end
22
-
23
- private
24
- def reduce_dimensions(number_of_dimensions_to_reduce, matrix)
25
- for diagonal_index in dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
26
- matrix[diagonal_index, diagonal_index] = 0
32
+
33
+ def determine_rank(vector)
34
+ if vector.size <= 15
35
+ # for 15 or fewer documents, n-1 is usually the best we
36
+ # can do. LSA generally works better with bigger data
37
+ # sets.
38
+ rank = vector.size - 1
39
+ elsif vector.size <= 1000
40
+ # ~500 is a value to work well for really big data sets,
41
+ # but for less than that, it probably is too big, so we
42
+ # go for n/3 in this case.
43
+ rank = vector.size / 3
44
+ else
45
+ # if we have more than 1000 documents, using the magical
46
+ # number 500 (which can be found in various documents)
47
+ # seems to be the best guess for now.
48
+ rank = 500
27
49
  end
28
- matrix
29
- end
30
-
31
- def dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
32
- (diagonal_matrix_length(matrix) - number_of_dimensions_to_reduce)...diagonal_matrix_length(matrix)
33
50
  end
34
51
 
35
- def diagonal_matrix_length(matrix)
36
- matrix.num_columns < matrix.num_rows ? matrix.num_columns : matrix.num_rows
52
+ def valid_rank(vector, rank)
53
+ if rank <= 0
54
+ # for zero or negative ranks, drop that many dimensions (keep vector.size + rank)
55
+ rank = vector.size + rank
56
+ elsif rank > vector.size
57
+ # if the rank is > the vector size, limit it to that
58
+ rank = vector.size
59
+ else
60
+ rank
61
+ end
37
62
  end
38
63
 
39
64
  end
40
65
  end
41
66
  end
42
- end
67
+ end