rsemantic 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{README.txt → README.md} +19 -10
- data/lib/semantic.rb +8 -5
- data/lib/semantic/compare.rb +4 -1
- data/lib/semantic/corpus.rb +61 -0
- data/lib/semantic/document.rb +39 -0
- data/lib/semantic/matrix_transformer.rb +4 -5
- data/lib/semantic/parser.rb +22 -10
- data/lib/semantic/search.rb +22 -16
- data/lib/semantic/search_result.rb +16 -0
- data/lib/semantic/transform/lsa_transform.rb +47 -22
- data/lib/semantic/transform/tf_idf_transform.rb +12 -23
- data/lib/semantic/vector_space/builder.rb +29 -22
- data/lib/semantic/vector_space/model.rb +14 -13
- data/lib/semantic/version.rb +1 -1
- data/lib/tasks/rspec.rake +13 -0
- metadata +75 -107
- data/Manifest.txt +0 -38
- data/Rakefile +0 -9
- data/config/hoe.rb +0 -69
- data/config/requirements.rb +0 -15
- data/gem_tasks/deployment.rake +0 -34
- data/gem_tasks/environment.rake +0 -7
- data/gem_tasks/examples.rake +0 -29
- data/gem_tasks/fix_cr_lf.rake +0 -10
- data/gem_tasks/gemspec.rake +0 -6
- data/gem_tasks/rspec.rake +0 -33
- data/gem_tasks/website.rake +0 -17
- data/rsemantic.gemspec +0 -41
- data/spec/semantic/compare_spec.rb +0 -16
- data/spec/semantic/matrix_transformer_spec.rb +0 -51
- data/spec/semantic/parser_spec.rb +0 -34
- data/spec/semantic/search_spec.rb +0 -129
- data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
- data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
- data/spec/semantic/vector_space/builder_spec.rb +0 -44
- data/spec/semantic/vector_space/model_spec.rb +0 -22
- data/spec/spec.opts +0 -2
- data/spec/spec_helper.rb +0 -7
data/{README.txt → README.md}
RENAMED
@@ -1,8 +1,6 @@
|
|
1
|
-
|
1
|
+
# Rsemantic
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
== DESCRIPTION:
|
3
|
+
[Build Status](http://travis-ci.org/josephwilk/rsemantic)
|
6
4
|
|
7
5
|
A Ruby document vector search with flexible matrix transforms. Current supported transforms:
|
8
6
|
|
@@ -11,20 +9,31 @@ A Ruby document vector search with flexible matrix transforms. Current supported
|
|
11
9
|
|
12
10
|
Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
13
11
|
|
14
|
-
|
12
|
+
## Requirements:
|
15
13
|
|
16
|
-
*
|
14
|
+
* GSL - http://www.gnu.org/software/gsl
|
17
15
|
* stemmer - http://rubyforge.org/projects/stemmer/
|
18
16
|
|
19
|
-
|
17
|
+
## INSTALL:
|
18
|
+
|
19
|
+
Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
|
20
|
+
See here for details on how to do that with homebrew: http://bretthard.in/2012/03/getting-related_posts-lsi-and-gsl-to-work-in-jekyll/
|
21
|
+
|
22
|
+
<pre><code>git clone git://github.com/josephwilk/rsemantic.git
|
23
|
+
cd rsemantic
|
24
|
+
brew install GSL
|
25
|
+
bundle install
|
26
|
+
</code></pre>
|
20
27
|
|
21
|
-
|
28
|
+
## Contributors
|
29
|
+
* @josephwilk
|
30
|
+
* @dominikhonnef
|
22
31
|
|
23
|
-
|
32
|
+
## LICENSE
|
24
33
|
|
25
34
|
(The MIT License)
|
26
35
|
|
27
|
-
Copyright (c) 2008 Joseph Wilk
|
36
|
+
Copyright (c) 2008-2012 Joseph Wilk
|
28
37
|
|
29
38
|
Permission is hereby granted, free of charge, to any person obtaining
|
30
39
|
a copy of this software and associated documentation files (the
|
data/lib/semantic.rb
CHANGED
@@ -9,10 +9,13 @@ require "semantic/search"
|
|
9
9
|
require "semantic/transform"
|
10
10
|
require "semantic/version"
|
11
11
|
|
12
|
+
require "semantic/corpus"
|
13
|
+
require "semantic/document"
|
14
|
+
require "semantic/search_result"
|
15
|
+
|
12
16
|
require 'rubygems'
|
13
|
-
require '
|
14
|
-
|
15
|
-
#A processor for removing the commoner morphological and inflexional endings from words in English
|
17
|
+
require 'gsl'
|
18
|
+
|
16
19
|
require 'stemmer'
|
17
20
|
require 'logger'
|
18
21
|
|
@@ -21,7 +24,7 @@ module Semantic
|
|
21
24
|
class << self
|
22
25
|
attr_writer :logger
|
23
26
|
end
|
24
|
-
|
27
|
+
|
25
28
|
def self.logger
|
26
29
|
return @logger if @logger
|
27
30
|
@logger = Logger.new(STDOUT)
|
@@ -29,5 +32,5 @@ module Semantic
|
|
29
32
|
@logger.level = Logger::ERROR
|
30
33
|
@logger
|
31
34
|
end
|
32
|
-
|
35
|
+
|
33
36
|
end
|
data/lib/semantic/compare.rb
CHANGED
@@ -9,7 +9,10 @@ module Semantic
|
|
9
9
|
|
10
10
|
def cosine(vector1, vector2)
|
11
11
|
unless vector2.nil? or vector1.nil?
|
12
|
-
|
12
|
+
v1 = vector1.row
|
13
|
+
v2 = vector2
|
14
|
+
score = (v1 * v2) / (vector1.norm * vector2.norm)
|
15
|
+
score.nan? ? 0.0 : score
|
13
16
|
end
|
14
17
|
end
|
15
18
|
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Semantic
|
2
|
+
class Corpus
|
3
|
+
# @return [Array<Document>]
|
4
|
+
attr_reader :documents
|
5
|
+
|
6
|
+
# @param [Array<Document>] documents The {Document documents} to
|
7
|
+
# index
|
8
|
+
# @param [Hash] options
|
9
|
+
# TODO document options
|
10
|
+
def initialize(documents = [], options = {})
|
11
|
+
@documents = documents
|
12
|
+
@options = options
|
13
|
+
@search = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
# Adds a new {Document document} to the index.
|
17
|
+
#
|
18
|
+
# @param [Document] document
|
19
|
+
# @return [void]
|
20
|
+
def add_document(document)
|
21
|
+
@documents << document
|
22
|
+
document.corpora << self
|
23
|
+
end
|
24
|
+
alias_method :<<, :add_document
|
25
|
+
|
26
|
+
# Build the index. This is required to be able to search for words
|
27
|
+
# or compute related documents.
|
28
|
+
#
|
29
|
+
# If you add new documents, you have to rebuild the index.
|
30
|
+
#
|
31
|
+
# @return [void]
|
32
|
+
def build_index
|
33
|
+
@search = Semantic::Search.new(@documents.map(&:text), @options)
|
34
|
+
end
|
35
|
+
|
36
|
+
def search(*words)
|
37
|
+
# TODO raise if no index built yet
|
38
|
+
results = @search.search(words)
|
39
|
+
results.map.with_index { |result, index|
|
40
|
+
document = @documents[index]
|
41
|
+
Semantic::SearchResult.new(document, result)
|
42
|
+
}.sort
|
43
|
+
end
|
44
|
+
|
45
|
+
def find_related_document(document)
|
46
|
+
@search.related(@documents.index(document)).map.with_index { |result, index|
|
47
|
+
document = @documents[index]
|
48
|
+
Semantic::SearchResult.new(document, result)
|
49
|
+
}.sort
|
50
|
+
end
|
51
|
+
|
52
|
+
def find_keywords(document, num = 5)
|
53
|
+
# TODO allow limiting keywords to words that occur in this document
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
def to_s
|
58
|
+
"#<%s %d documents, @options=%s>" % [self.class.name, @documents.size, @options.inspect]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Semantic
|
2
|
+
class Document
|
3
|
+
attr_reader :text
|
4
|
+
attr_reader :attributes
|
5
|
+
attr_reader :corpora
|
6
|
+
def initialize(text, attributes = {})
|
7
|
+
if text.respond_to?(:read)
|
8
|
+
@text = text.read
|
9
|
+
else
|
10
|
+
@text = text
|
11
|
+
end
|
12
|
+
|
13
|
+
@attributes = attributes
|
14
|
+
@corpora = []
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_s
|
18
|
+
"#<%s @attributes=%s>" % [self.class.name, @attributes.inspect]
|
19
|
+
end
|
20
|
+
|
21
|
+
def [](key)
|
22
|
+
@attributes[key]
|
23
|
+
end
|
24
|
+
|
25
|
+
# @todo document that it has to be part of at least one corpus
|
26
|
+
def related
|
27
|
+
results = {}
|
28
|
+
@corpora.each do |corpus|
|
29
|
+
results[corpus] = corpus.find_related_document(self)
|
30
|
+
end
|
31
|
+
|
32
|
+
results
|
33
|
+
end
|
34
|
+
|
35
|
+
def keywords(num = 5)
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -1,9 +1,8 @@
|
|
1
1
|
module Semantic
|
2
2
|
class MatrixTransformer
|
3
3
|
|
4
|
-
def initialize(
|
5
|
-
@transforms =
|
6
|
-
@options = options
|
4
|
+
def initialize(transforms)
|
5
|
+
@transforms = transforms
|
7
6
|
end
|
8
7
|
|
9
8
|
def apply_transforms(vector_space_model)
|
@@ -11,9 +10,9 @@ module Semantic
|
|
11
10
|
begin
|
12
11
|
transform_class = Semantic::Transform.const_get(transform)
|
13
12
|
Semantic.logger.info("Applying #{transform} transform")
|
14
|
-
|
13
|
+
transform_class.transform!(vector_space_model.matrix)
|
15
14
|
Semantic.logger.info(vector_space_model)
|
16
|
-
rescue
|
15
|
+
rescue => e
|
17
16
|
Semantic.logger.error("Error: Cannot perform transform: #{transform}")
|
18
17
|
Semantic.logger.error(e)
|
19
18
|
end
|
data/lib/semantic/parser.rb
CHANGED
@@ -1,13 +1,17 @@
|
|
1
1
|
require 'stemmer'
|
2
|
-
|
2
|
+
require "set"
|
3
3
|
module Semantic
|
4
4
|
class Parser
|
5
5
|
|
6
|
-
def initialize
|
7
|
-
#English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
8
|
-
#TODO: nicer way to reference stop file location?
|
9
|
-
|
10
|
-
|
6
|
+
def initialize(options = {})
|
7
|
+
# English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
8
|
+
# TODO: nicer way to reference stop file location?
|
9
|
+
@filter_stop_words = options[:filter_stop_words]
|
10
|
+
@stem_words = options[:stem_words]
|
11
|
+
if @filter_stop_words
|
12
|
+
File.open(File.dirname(__FILE__) + '/../../resources/english.stop', 'r') do |file|
|
13
|
+
@stopwords = Set.new(file.read().split())
|
14
|
+
end
|
11
15
|
end
|
12
16
|
end
|
13
17
|
|
@@ -16,7 +20,7 @@ module Semantic
|
|
16
20
|
remove_stop_words(word_list)
|
17
21
|
end
|
18
22
|
|
19
|
-
#remove any nasty grammar tokens from string
|
23
|
+
# remove any nasty grammar tokens from string
|
20
24
|
def clean(string)
|
21
25
|
string = string.gsub(".","")
|
22
26
|
string = string.gsub(/\s+/," ")
|
@@ -24,16 +28,24 @@ module Semantic
|
|
24
28
|
return string
|
25
29
|
end
|
26
30
|
|
27
|
-
#stop words are common words which have no search value
|
31
|
+
# stop words are common words which have no search value
|
28
32
|
def remove_stop_words(list)
|
29
|
-
|
33
|
+
if @filter_stop_words
|
34
|
+
list.select {|word| !@stopwords.include?(word) }
|
35
|
+
else
|
36
|
+
list
|
37
|
+
end
|
30
38
|
end
|
31
39
|
|
32
40
|
def tokenise_and_stem(string)
|
33
41
|
string = clean(string)
|
34
42
|
words = string.split(" ")
|
35
43
|
|
36
|
-
|
44
|
+
if @stem_words
|
45
|
+
words.map {|word| Stemmer.stem_word(word) }
|
46
|
+
else
|
47
|
+
words
|
48
|
+
end
|
37
49
|
end
|
38
50
|
|
39
51
|
end
|
data/lib/semantic/search.rb
CHANGED
@@ -1,35 +1,41 @@
|
|
1
1
|
module Semantic
|
2
2
|
class Search
|
3
3
|
|
4
|
-
def initialize(documents, options={})
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
def initialize(documents, options = {})
|
5
|
+
options = {
|
6
|
+
:transforms => [:TFIDF, :LSA],
|
7
|
+
:verbose => false,
|
8
|
+
:filter_stop_words => true,
|
9
|
+
:stem_words => true,
|
10
|
+
}.merge(options)
|
11
|
+
Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
|
12
|
+
|
13
|
+
|
14
|
+
@builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
|
15
|
+
@matrix_transformer = MatrixTransformer.new(options[:transforms])
|
9
16
|
|
10
17
|
@vector_space_model = @builder.build_document_matrix(documents)
|
11
|
-
|
18
|
+
|
12
19
|
Semantic.logger.info(@vector_space_model)
|
13
|
-
|
20
|
+
|
14
21
|
@vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
|
15
22
|
end
|
16
|
-
|
17
|
-
def related(
|
23
|
+
|
24
|
+
def related(document_id)
|
18
25
|
ratings = []
|
19
|
-
|
20
|
-
ratings << Compare.similarity(@vector_space_model.column(
|
26
|
+
@vector_space_model.each_column do |column|
|
27
|
+
ratings << Compare.similarity(@vector_space_model.column(document_id), column)
|
21
28
|
end
|
22
29
|
ratings
|
23
30
|
end
|
24
31
|
|
25
|
-
def search(
|
32
|
+
def search(search_list)
|
26
33
|
ratings = []
|
27
|
-
query_vector = @builder.build_query_vector(
|
28
|
-
|
29
|
-
ratings << Compare.similarity(query_vector,
|
34
|
+
query_vector = @builder.build_query_vector(search_list)
|
35
|
+
@vector_space_model.each_column do |column|
|
36
|
+
ratings << Compare.similarity(query_vector.col, column)
|
30
37
|
end
|
31
38
|
ratings
|
32
39
|
end
|
33
|
-
|
34
40
|
end
|
35
41
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Semantic
|
2
|
+
class SearchResult
|
3
|
+
include Comparable
|
4
|
+
|
5
|
+
attr_reader :document
|
6
|
+
attr_reader :score
|
7
|
+
def initialize(document, score)
|
8
|
+
@document = document
|
9
|
+
@score = score
|
10
|
+
end
|
11
|
+
|
12
|
+
def <=>(other)
|
13
|
+
@score <=> other.score
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -4,39 +4,64 @@ module Semantic
|
|
4
4
|
|
5
5
|
class << self
|
6
6
|
|
7
|
-
def transform(matrix,
|
8
|
-
|
7
|
+
def transform!(matrix, rank = nil)
|
8
|
+
# TODO configurable rank
|
9
|
+
columns = matrix.size2
|
9
10
|
|
10
|
-
|
11
|
-
|
11
|
+
u, v, sigma = matrix.SV_decomp_mod
|
12
|
+
reduce_dimensions!(sigma, rank)
|
13
|
+
sigma = GSL::Matrix.diagonal(sigma)
|
12
14
|
|
13
|
-
|
15
|
+
GSL::Matrix.swap(matrix, u * sigma * v.transpose)
|
16
|
+
end
|
14
17
|
|
15
|
-
|
18
|
+
private
|
19
|
+
def reduce_dimensions!(vector, rank)
|
20
|
+
# the vector is already sorted (biggest to smallest), so we
|
21
|
+
# only have to zero the elements we do not want
|
22
|
+
|
23
|
+
if rank.nil?
|
24
|
+
rank = determine_rank(vector)
|
16
25
|
else
|
17
|
-
|
26
|
+
rank = valid_rank(vector, rank)
|
18
27
|
end
|
19
|
-
|
20
|
-
|
28
|
+
|
29
|
+
num_to_zero_out = vector.size - rank
|
30
|
+
vector[rank, num_to_zero_out] = 0
|
21
31
|
end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
32
|
+
|
33
|
+
def determine_rank(vector)
|
34
|
+
if vector.size <= 15
|
35
|
+
# for 15 or fewer documents, n-1 is usually the best we
|
36
|
+
# can do. LSA generally works better with bigger data
|
37
|
+
# sets.
|
38
|
+
rank = vector.size - 1
|
39
|
+
elsif vector.size <= 1000
|
40
|
+
# ~500 is a value known to work well for really big data sets,
|
41
|
+
# but for less than that, it probably is too big, so we
|
42
|
+
# go for n/3 in this case.
|
43
|
+
rank = vector.size / 3
|
44
|
+
else
|
45
|
+
# if we have more than 1000 documents, using the magical
|
46
|
+
# number 500 (which can be found in various documents)
|
47
|
+
# seems to be the best guess for now.
|
48
|
+
rank = 500
|
27
49
|
end
|
28
|
-
matrix
|
29
|
-
end
|
30
|
-
|
31
|
-
def dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
|
32
|
-
(diagonal_matrix_length(matrix) - number_of_dimensions_to_reduce)...diagonal_matrix_length(matrix)
|
33
50
|
end
|
34
51
|
|
35
|
-
def
|
36
|
-
|
52
|
+
def valid_rank(vector, rank)
|
53
|
+
if rank <= 0
|
54
|
+
# for zero or negative ranks, keep all but that many dimensions
|
55
|
+
rank = vector.size + rank
|
56
|
+
elsif rank > vector.size
|
57
|
+
# if the rank is > the vector size, limit it to that
|
58
|
+
rank = vector.size
|
59
|
+
else
|
60
|
+
rank
|
61
|
+
end
|
37
62
|
end
|
38
63
|
|
39
64
|
end
|
40
65
|
end
|
41
66
|
end
|
42
|
-
end
|
67
|
+
end
|