rsemantic 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/{README.txt → README.md} +19 -10
  2. data/lib/semantic.rb +8 -5
  3. data/lib/semantic/compare.rb +4 -1
  4. data/lib/semantic/corpus.rb +61 -0
  5. data/lib/semantic/document.rb +39 -0
  6. data/lib/semantic/matrix_transformer.rb +4 -5
  7. data/lib/semantic/parser.rb +22 -10
  8. data/lib/semantic/search.rb +22 -16
  9. data/lib/semantic/search_result.rb +16 -0
  10. data/lib/semantic/transform/lsa_transform.rb +47 -22
  11. data/lib/semantic/transform/tf_idf_transform.rb +12 -23
  12. data/lib/semantic/vector_space/builder.rb +29 -22
  13. data/lib/semantic/vector_space/model.rb +14 -13
  14. data/lib/semantic/version.rb +1 -1
  15. data/lib/tasks/rspec.rake +13 -0
  16. metadata +75 -107
  17. data/Manifest.txt +0 -38
  18. data/Rakefile +0 -9
  19. data/config/hoe.rb +0 -69
  20. data/config/requirements.rb +0 -15
  21. data/gem_tasks/deployment.rake +0 -34
  22. data/gem_tasks/environment.rake +0 -7
  23. data/gem_tasks/examples.rake +0 -29
  24. data/gem_tasks/fix_cr_lf.rake +0 -10
  25. data/gem_tasks/gemspec.rake +0 -6
  26. data/gem_tasks/rspec.rake +0 -33
  27. data/gem_tasks/website.rake +0 -17
  28. data/rsemantic.gemspec +0 -41
  29. data/spec/semantic/compare_spec.rb +0 -16
  30. data/spec/semantic/matrix_transformer_spec.rb +0 -51
  31. data/spec/semantic/parser_spec.rb +0 -34
  32. data/spec/semantic/search_spec.rb +0 -129
  33. data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
  34. data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
  35. data/spec/semantic/vector_space/builder_spec.rb +0 -44
  36. data/spec/semantic/vector_space/model_spec.rb +0 -22
  37. data/spec/spec.opts +0 -2
  38. data/spec/spec_helper.rb +0 -7
@@ -1,8 +1,6 @@
1
- = Rsemantic
1
+ # Rsemantic
2
2
 
3
- * http://github.com/josephwilk/rsemantic
4
-
5
- == DESCRIPTION:
3
+ [![Build Status](https://secure.travis-ci.org/josephwilk/rsemantic.png?branch=master)](http://travis-ci.org/josephwilk/rsemantic)
6
4
 
7
5
  A Ruby document vector search with flexible matrix transforms. Current supported transforms:
8
6
 
@@ -11,20 +9,31 @@ A Ruby document vector search with flexible matrix transforms. Current supported
11
9
 
12
10
  Documentation: http://github.com/josephwilk/rsemantic/wikis/home
13
11
 
14
- == REQUIREMENTS:
12
+ ## Requirements:
15
13
 
16
- * Linalg - http://rubyforge.org/projects/linalg/
14
+ * GSL - http://www.gnu.org/software/gsl
17
15
  * stemmer - http://rubyforge.org/projects/stemmer/
18
16
 
19
- == INSTALL:
17
+ ## INSTALL:
18
+
19
+ Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
20
+ For details on how to do that with Homebrew, see: http://bretthard.in/2012/03/getting-related_posts-lsi-and-gsl-to-work-in-jekyll/
21
+
22
+ <pre><code>git clone git://github.com/josephwilk/rsemantic.git
23
+ cd rsemantic
24
+ brew install GSL
25
+ bundle install
26
+ </code></pre>
20
27
 
21
- * git clone git://github.com/josephwilk/rsemantic.git
28
+ ## Contributors
29
+ * @josephwilk
30
+ * @dominikhonnef
22
31
 
23
- == LICENSE
32
+ ## LICENSE
24
33
 
25
34
  (The MIT License)
26
35
 
27
- Copyright (c) 2008 Joseph Wilk
36
+ Copyright (c) 2008-2012 Joseph Wilk
28
37
 
29
38
  Permission is hereby granted, free of charge, to any person obtaining
30
39
  a copy of this software and associated documentation files (the
data/lib/semantic.rb CHANGED
@@ -9,10 +9,13 @@ require "semantic/search"
9
9
  require "semantic/transform"
10
10
  require "semantic/version"
11
11
 
12
+ require "semantic/corpus"
13
+ require "semantic/document"
14
+ require "semantic/search_result"
15
+
12
16
  require 'rubygems'
13
- require 'linalg'
14
- #http://rubyforge.org/projects/stemmer/
15
- #A processor for removing the commoner morphological and inflexional endings from words in English
17
+ require 'gsl'
18
+
16
19
  require 'stemmer'
17
20
  require 'logger'
18
21
 
@@ -21,7 +24,7 @@ module Semantic
21
24
  class << self
22
25
  attr_writer :logger
23
26
  end
24
-
27
+
25
28
  def self.logger
26
29
  return @logger if @logger
27
30
  @logger = Logger.new(STDOUT)
@@ -29,5 +32,5 @@ module Semantic
29
32
  @logger.level = Logger::ERROR
30
33
  @logger
31
34
  end
32
-
35
+
33
36
  end
@@ -9,7 +9,10 @@ module Semantic
9
9
 
10
10
  def cosine(vector1, vector2)
11
11
  unless vector2.nil? or vector1.nil?
12
- (vector2.dot(vector1)) / (vector1.norm * vector2.norm)
12
+ v1 = vector1.row
13
+ v2 = vector2
14
+ score = (v1 * v2) / (vector1.norm * vector2.norm)
15
+ score.nan? ? 0.0 : score
13
16
  end
14
17
  end
15
18
 
@@ -0,0 +1,61 @@
1
+ module Semantic
2
+ class Corpus
3
+ # @return [Array<Document>]
4
+ attr_reader :documents
5
+
6
+ # @param [Array<Document>] documents The {Document documents} to
7
+ # index
8
+ # @param [Hash] options
9
+ # TODO document options
10
+ def initialize(documents = [], options = {})
11
+ @documents = documents
12
+ @options = options
13
+ @search = nil
14
+ end
15
+
16
+ # Adds a new {Document document} to the index.
17
+ #
18
+ # @param [Document] document
19
+ # @return [void]
20
+ def add_document(document)
21
+ @documents << document
22
+ document.corpora << self
23
+ end
24
+ alias_method :<<, :add_document
25
+
26
+ # Build the index. This is required to be able to search for words
27
+ # or compute related documents.
28
+ #
29
+ # If you add new documents, you have to rebuild the index.
30
+ #
31
+ # @return [void]
32
+ def build_index
33
+ @search = Semantic::Search.new(@documents.map(&:text), @options)
34
+ end
35
+
36
+ def search(*words)
37
+ # TODO raise if no index built yet
38
+ results = @search.search(words)
39
+ results.map.with_index { |result, index|
40
+ document = @documents[index]
41
+ Semantic::SearchResult.new(document, result)
42
+ }.sort
43
+ end
44
+
45
+ def find_related_document(document)
46
+ @search.related(@documents.index(document)).map.with_index { |result, index|
47
+ document = @documents[index]
48
+ Semantic::SearchResult.new(document, result)
49
+ }.sort
50
+ end
51
+
52
+ def find_keywords(document, num = 5)
53
+ # TODO allow limiting keywords to words that occur in this document
54
+
55
+ end
56
+
57
+ def to_s
58
+ "#<%s %d documents, @options=%s>" % [self.class.name, @documents.size, @options.inspect]
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,39 @@
1
+ module Semantic
2
+ class Document
3
+ attr_reader :text
4
+ attr_reader :attributes
5
+ attr_reader :corpora
6
+ def initialize(text, attributes = {})
7
+ if text.respond_to?(:read)
8
+ @text = text.read
9
+ else
10
+ @text = text
11
+ end
12
+
13
+ @attributes = attributes
14
+ @corpora = []
15
+ end
16
+
17
+ def to_s
18
+ "#<%s @attributes=%s>" % [self.class.name, @attributes.inspect]
19
+ end
20
+
21
+ def [](key)
22
+ @attributes[key]
23
+ end
24
+
25
+ # @todo document that it has to be part of at least one corpus
26
+ def related
27
+ results = {}
28
+ @corpora.each do |corpus|
29
+ results[corpus] = corpus.find_related_document(self)
30
+ end
31
+
32
+ results
33
+ end
34
+
35
+ def keywords(num = 5)
36
+
37
+ end
38
+ end
39
+ end
@@ -1,9 +1,8 @@
1
1
  module Semantic
2
2
  class MatrixTransformer
3
3
 
4
- def initialize(options={})
5
- @transforms = options[:transforms] || [:TFIDF, :LSA]
6
- @options = options
4
+ def initialize(transforms)
5
+ @transforms = transforms
7
6
  end
8
7
 
9
8
  def apply_transforms(vector_space_model)
@@ -11,9 +10,9 @@ module Semantic
11
10
  begin
12
11
  transform_class = Semantic::Transform.const_get(transform)
13
12
  Semantic.logger.info("Applying #{transform} transform")
14
- vector_space_model.matrix = transform_class.send(:transform, vector_space_model.matrix) if transform_class.respond_to?(:transform)
13
+ transform_class.transform!(vector_space_model.matrix)
15
14
  Semantic.logger.info(vector_space_model)
16
- rescue Exception => e
15
+ rescue => e
17
16
  Semantic.logger.error("Error: Cannot perform transform: #{transform}")
18
17
  Semantic.logger.error(e)
19
18
  end
@@ -1,13 +1,17 @@
1
1
  require 'stemmer'
2
-
2
+ require "set"
3
3
  module Semantic
4
4
  class Parser
5
5
 
6
- def initialize
7
- #English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
8
- #TODO: nicer way to reference stop file location?
9
- File.open(File.dirname(__FILE__)+'/../../resources/english.stop', 'r') do |file|
10
- @stopwords = file.read().split()
6
+ def initialize(options = {})
7
+ # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
8
+ # TODO: nicer way to reference stop file location?
9
+ @filter_stop_words = options[:filter_stop_words]
10
+ @stem_words = options[:stem_words]
11
+ if @filter_stop_words
12
+ File.open(File.dirname(__FILE__) + '/../../resources/english.stop', 'r') do |file|
13
+ @stopwords = Set.new(file.read().split())
14
+ end
11
15
  end
12
16
  end
13
17
 
@@ -16,7 +20,7 @@ module Semantic
16
20
  remove_stop_words(word_list)
17
21
  end
18
22
 
19
- #remove any nasty grammar tokens from string
23
+ # remove any nasty grammar tokens from string
20
24
  def clean(string)
21
25
  string = string.gsub(".","")
22
26
  string = string.gsub(/\s+/," ")
@@ -24,16 +28,24 @@ module Semantic
24
28
  return string
25
29
  end
26
30
 
27
- #stop words are common words which have no search value
31
+ # stop words are common words which have no search value
28
32
  def remove_stop_words(list)
29
- list.select {|word| word unless @stopwords.include? word }
33
+ if @filter_stop_words
34
+ list.select {|word| !@stopwords.include?(word) }
35
+ else
36
+ list
37
+ end
30
38
  end
31
39
 
32
40
  def tokenise_and_stem(string)
33
41
  string = clean(string)
34
42
  words = string.split(" ")
35
43
 
36
- words.map {|word| word.stem }
44
+ if @stem_words
45
+ words.map {|word| Stemmer.stem_word(word) }
46
+ else
47
+ words
48
+ end
37
49
  end
38
50
 
39
51
  end
@@ -1,35 +1,41 @@
1
1
  module Semantic
2
2
  class Search
3
3
 
4
- def initialize(documents, options={})
5
- Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
6
-
7
- @builder = VectorSpace::Builder.new(options)
8
- @matrix_transformer = MatrixTransformer.new(options)
4
+ def initialize(documents, options = {})
5
+ options = {
6
+ :transforms => [:TFIDF, :LSA],
7
+ :verbose => false,
8
+ :filter_stop_words => true,
9
+ :stem_words => true,
10
+ }.merge(options)
11
+ Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
12
+
13
+
14
+ @builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
15
+ @matrix_transformer = MatrixTransformer.new(options[:transforms])
9
16
 
10
17
  @vector_space_model = @builder.build_document_matrix(documents)
11
-
18
+
12
19
  Semantic.logger.info(@vector_space_model)
13
-
20
+
14
21
  @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
15
22
  end
16
-
17
- def related(documentId)
23
+
24
+ def related(document_id)
18
25
  ratings = []
19
- for index in (0...@vector_space_model.ncol)
20
- ratings << Compare.similarity(@vector_space_model.column(documentId), @vector_space_model.column(index))
26
+ @vector_space_model.each_column do |column|
27
+ ratings << Compare.similarity(@vector_space_model.column(document_id), column)
21
28
  end
22
29
  ratings
23
30
  end
24
31
 
25
- def search(searchList)
32
+ def search(search_list)
26
33
  ratings = []
27
- query_vector = @builder.build_query_vector(searchList)
28
- for index in (0...@vector_space_model.ncol)
29
- ratings << Compare.similarity(query_vector, @vector_space_model.column(index))
34
+ query_vector = @builder.build_query_vector(search_list)
35
+ @vector_space_model.each_column do |column|
36
+ ratings << Compare.similarity(query_vector.col, column)
30
37
  end
31
38
  ratings
32
39
  end
33
-
34
40
  end
35
41
  end
@@ -0,0 +1,16 @@
1
+ module Semantic
2
+ class SearchResult
3
+ include Comparable
4
+
5
+ attr_reader :document
6
+ attr_reader :score
7
+ def initialize(document, score)
8
+ @document = document
9
+ @score = score
10
+ end
11
+
12
+ def <=>(other)
13
+ @score <=> other.score
14
+ end
15
+ end
16
+ end
@@ -4,39 +4,64 @@ module Semantic
4
4
 
5
5
  class << self
6
6
 
7
- def transform(matrix, number_of_dimensions_to_reduce = 1)
8
- columns = matrix.num_columns
7
+ def transform!(matrix, rank = nil)
8
+ # TODO configurable rank
9
+ columns = matrix.size2
9
10
 
10
- if number_of_dimensions_to_reduce <= columns #Its a valid reduction
11
- u, sigma, vt = matrix.singular_value_decomposition
11
+ u, v, sigma = matrix.SV_decomp_mod
12
+ reduce_dimensions!(sigma, rank)
13
+ sigma = GSL::Matrix.diagonal(sigma)
12
14
 
13
- sigma_prime = reduce_dimensions(number_of_dimensions_to_reduce, sigma)
15
+ GSL::Matrix.swap(matrix, u * sigma * v.transpose)
16
+ end
14
17
 
15
- matrix_prime = u * sigma_prime * vt
18
+ private
19
+ def reduce_dimensions!(vector, rank)
20
+ # the vector is already sorted (biggest to smallest), so we
21
+ # only have to zero the elements we do not want
22
+
23
+ if rank.nil?
24
+ rank = determine_rank(vector)
16
25
  else
17
- raise Exception, "dimension reduction cannot be greater than %s" % columns
26
+ rank = valid_rank(vector, rank)
18
27
  end
19
-
20
- matrix_prime
28
+
29
+ num_to_zero_out = vector.size - rank
30
+ vector[rank, num_to_zero_out] = 0
21
31
  end
22
-
23
- private
24
- def reduce_dimensions(number_of_dimensions_to_reduce, matrix)
25
- for diagonal_index in dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
26
- matrix[diagonal_index, diagonal_index] = 0
32
+
33
+ def determine_rank(vector)
34
+ if vector.size <= 15
35
+ # for 15 or fewer documents, n-1 is usually the best we
36
+ # can do. LSA generally works better with bigger data
37
+ # sets.
38
+ rank = vector.size - 1
39
+ elsif vector.size <= 1000
40
+ # ~500 is a value known to work well for really big data sets,
41
+ # but for less than that, it probably is too big, so we
42
+ # go for n/3 in this case.
43
+ rank = vector.size / 3
44
+ else
45
+ # if we have more than 1000 documents, using the magical
46
+ # number 500 (which can be found in various documents)
47
+ # seems to be the best guess for now.
48
+ rank = 500
27
49
  end
28
- matrix
29
- end
30
-
31
- def dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
32
- (diagonal_matrix_length(matrix) - number_of_dimensions_to_reduce)...diagonal_matrix_length(matrix)
33
50
  end
34
51
 
35
- def diagonal_matrix_length(matrix)
36
- matrix.num_columns < matrix.num_rows ? matrix.num_columns : matrix.num_rows
52
+ def valid_rank(vector, rank)
53
+ if rank <= 0
54
+ # for zero or negative ranks, drop that many dimensions (keep size + rank)
55
+ rank = vector.size + rank
56
+ elsif rank > vector.size
57
+ # if the rank is > the vector size, limit it to that
58
+ rank = vector.size
59
+ else
60
+ rank
61
+ end
37
62
  end
38
63
 
39
64
  end
40
65
  end
41
66
  end
42
- end
67
+ end