rsemantic 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/{README.txt → README.md} +19 -10
- data/lib/semantic.rb +8 -5
- data/lib/semantic/compare.rb +4 -1
- data/lib/semantic/corpus.rb +61 -0
- data/lib/semantic/document.rb +39 -0
- data/lib/semantic/matrix_transformer.rb +4 -5
- data/lib/semantic/parser.rb +22 -10
- data/lib/semantic/search.rb +22 -16
- data/lib/semantic/search_result.rb +16 -0
- data/lib/semantic/transform/lsa_transform.rb +47 -22
- data/lib/semantic/transform/tf_idf_transform.rb +12 -23
- data/lib/semantic/vector_space/builder.rb +29 -22
- data/lib/semantic/vector_space/model.rb +14 -13
- data/lib/semantic/version.rb +1 -1
- data/lib/tasks/rspec.rake +13 -0
- metadata +75 -107
- data/Manifest.txt +0 -38
- data/Rakefile +0 -9
- data/config/hoe.rb +0 -69
- data/config/requirements.rb +0 -15
- data/gem_tasks/deployment.rake +0 -34
- data/gem_tasks/environment.rake +0 -7
- data/gem_tasks/examples.rake +0 -29
- data/gem_tasks/fix_cr_lf.rake +0 -10
- data/gem_tasks/gemspec.rake +0 -6
- data/gem_tasks/rspec.rake +0 -33
- data/gem_tasks/website.rake +0 -17
- data/rsemantic.gemspec +0 -41
- data/spec/semantic/compare_spec.rb +0 -16
- data/spec/semantic/matrix_transformer_spec.rb +0 -51
- data/spec/semantic/parser_spec.rb +0 -34
- data/spec/semantic/search_spec.rb +0 -129
- data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
- data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
- data/spec/semantic/vector_space/builder_spec.rb +0 -44
- data/spec/semantic/vector_space/model_spec.rb +0 -22
- data/spec/spec.opts +0 -2
- data/spec/spec_helper.rb +0 -7
data/{README.txt → README.md}
RENAMED
@@ -1,8 +1,6 @@
|
|
1
|
-
|
1
|
+
# Rsemantic
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
== DESCRIPTION:
|
3
|
+
[![Build Status](https://secure.travis-ci.org/josephwilk/rsemantic.png?branch=master)](http://travis-ci.org/josephwilk/rsemantic)
|
6
4
|
|
7
5
|
A Ruby document vector search with flexible matrix transforms. Current supported transforms:
|
8
6
|
|
@@ -11,20 +9,31 @@ A Ruby document vector search with flexible matrix transforms. Current supported
|
|
11
9
|
|
12
10
|
Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
13
11
|
|
14
|
-
|
12
|
+
## Requirements:
|
15
13
|
|
16
|
-
*
|
14
|
+
* GSL - http://www.gnu.org/software/gsl
|
17
15
|
* stemmer - http://rubyforge.org/projects/stemmer/
|
18
16
|
|
19
|
-
|
17
|
+
## INSTALL:
|
18
|
+
|
19
|
+
Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
|
20
|
+
See here for details on how to do that with homebrew: http://bretthard.in/2012/03/getting-related_posts-lsi-and-gsl-to-work-in-jekyll/
|
21
|
+
|
22
|
+
<pre><code>git clone git://github.com/josephwilk/rsemantic.git
|
23
|
+
cd rsemantic
|
24
|
+
brew install GSL
|
25
|
+
bundle install
|
26
|
+
</code></pre>
|
20
27
|
|
21
|
-
|
28
|
+
## Contributors
|
29
|
+
* @josephwilk
|
30
|
+
* @dominikhonnef
|
22
31
|
|
23
|
-
|
32
|
+
## LICENSE
|
24
33
|
|
25
34
|
(The MIT License)
|
26
35
|
|
27
|
-
Copyright (c) 2008 Joseph Wilk
|
36
|
+
Copyright (c) 2008-2012 Joseph Wilk
|
28
37
|
|
29
38
|
Permission is hereby granted, free of charge, to any person obtaining
|
30
39
|
a copy of this software and associated documentation files (the
|
data/lib/semantic.rb
CHANGED
@@ -9,10 +9,13 @@ require "semantic/search"
|
|
9
9
|
require "semantic/transform"
|
10
10
|
require "semantic/version"
|
11
11
|
|
12
|
+
require "semantic/corpus"
|
13
|
+
require "semantic/document"
|
14
|
+
require "semantic/search_result"
|
15
|
+
|
12
16
|
require 'rubygems'
|
13
|
-
require '
|
14
|
-
|
15
|
-
#A processor for removing the commoner morphological and inflexional endings from words in English
|
17
|
+
require 'gsl'
|
18
|
+
|
16
19
|
require 'stemmer'
|
17
20
|
require 'logger'
|
18
21
|
|
@@ -21,7 +24,7 @@ module Semantic
|
|
21
24
|
class << self
|
22
25
|
attr_writer :logger
|
23
26
|
end
|
24
|
-
|
27
|
+
|
25
28
|
def self.logger
|
26
29
|
return @logger if @logger
|
27
30
|
@logger = Logger.new(STDOUT)
|
@@ -29,5 +32,5 @@ module Semantic
|
|
29
32
|
@logger.level = Logger::ERROR
|
30
33
|
@logger
|
31
34
|
end
|
32
|
-
|
35
|
+
|
33
36
|
end
|
data/lib/semantic/compare.rb
CHANGED
@@ -9,7 +9,10 @@ module Semantic
|
|
9
9
|
|
10
10
|
def cosine(vector1, vector2)
|
11
11
|
unless vector2.nil? or vector1.nil?
|
12
|
-
|
12
|
+
v1 = vector1.row
|
13
|
+
v2 = vector2
|
14
|
+
score = (v1 * v2) / (vector1.norm * vector2.norm)
|
15
|
+
score.nan? ? 0.0 : score
|
13
16
|
end
|
14
17
|
end
|
15
18
|
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Semantic
|
2
|
+
class Corpus
|
3
|
+
# @return [Array<Document>]
|
4
|
+
attr_reader :documents
|
5
|
+
|
6
|
+
# @param [Array<Document>] documents The {Document documents} to
|
7
|
+
# index
|
8
|
+
# @param [Hash] options
|
9
|
+
# TODO document options
|
10
|
+
def initialize(documents = [], options = {})
|
11
|
+
@documents = documents
|
12
|
+
@options = options
|
13
|
+
@search = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
# Adds a new {Document document} to the index.
|
17
|
+
#
|
18
|
+
# @param [Document] document
|
19
|
+
# @return [void]
|
20
|
+
def add_document(document)
|
21
|
+
@documents << document
|
22
|
+
document.corpora << self
|
23
|
+
end
|
24
|
+
alias_method :<<, :add_document
|
25
|
+
|
26
|
+
# Build the index. This is required to be able to search for words
|
27
|
+
# or compute related documents.
|
28
|
+
#
|
29
|
+
# If you add new documents, you have to rebuild the index.
|
30
|
+
#
|
31
|
+
# @return [void]
|
32
|
+
def build_index
|
33
|
+
@search = Semantic::Search.new(@documents.map(&:text), @options)
|
34
|
+
end
|
35
|
+
|
36
|
+
def search(*words)
|
37
|
+
# TODO raise if no index built yet
|
38
|
+
results = @search.search(words)
|
39
|
+
results.map.with_index { |result, index|
|
40
|
+
document = @documents[index]
|
41
|
+
Semantic::SearchResult.new(document, result)
|
42
|
+
}.sort
|
43
|
+
end
|
44
|
+
|
45
|
+
def find_related_document(document)
|
46
|
+
@search.related(@documents.index(document)).map.with_index { |result, index|
|
47
|
+
document = @documents[index]
|
48
|
+
Semantic::SearchResult.new(document, result)
|
49
|
+
}.sort
|
50
|
+
end
|
51
|
+
|
52
|
+
def find_keywords(document, num = 5)
|
53
|
+
# TODO allow limiting keywords to words that occur in this document
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
def to_s
|
58
|
+
"#<%s %d documents, @options=%s>" % [self.class.name, @documents.size, @options.inspect]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Semantic
|
2
|
+
class Document
|
3
|
+
attr_reader :text
|
4
|
+
attr_reader :attributes
|
5
|
+
attr_reader :corpora
|
6
|
+
def initialize(text, attributes = {})
|
7
|
+
if text.respond_to?(:read)
|
8
|
+
@text = text.read
|
9
|
+
else
|
10
|
+
@text = text
|
11
|
+
end
|
12
|
+
|
13
|
+
@attributes = attributes
|
14
|
+
@corpora = []
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_s
|
18
|
+
"#<%s @attributes=%s>" % [self.class.name, @attributes.inspect]
|
19
|
+
end
|
20
|
+
|
21
|
+
def [](key)
|
22
|
+
@attributes[key]
|
23
|
+
end
|
24
|
+
|
25
|
+
# @todo document that it has to be part of at least one corpus
|
26
|
+
def related
|
27
|
+
results = {}
|
28
|
+
@corpora.each do |corpus|
|
29
|
+
results[corpus] = corpus.find_related_document(self)
|
30
|
+
end
|
31
|
+
|
32
|
+
results
|
33
|
+
end
|
34
|
+
|
35
|
+
def keywords(num = 5)
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -1,9 +1,8 @@
|
|
1
1
|
module Semantic
|
2
2
|
class MatrixTransformer
|
3
3
|
|
4
|
-
def initialize(
|
5
|
-
@transforms =
|
6
|
-
@options = options
|
4
|
+
def initialize(transforms)
|
5
|
+
@transforms = transforms
|
7
6
|
end
|
8
7
|
|
9
8
|
def apply_transforms(vector_space_model)
|
@@ -11,9 +10,9 @@ module Semantic
|
|
11
10
|
begin
|
12
11
|
transform_class = Semantic::Transform.const_get(transform)
|
13
12
|
Semantic.logger.info("Applying #{transform} transform")
|
14
|
-
|
13
|
+
transform_class.transform!(vector_space_model.matrix)
|
15
14
|
Semantic.logger.info(vector_space_model)
|
16
|
-
rescue
|
15
|
+
rescue => e
|
17
16
|
Semantic.logger.error("Error: Cannot perform transform: #{transform}")
|
18
17
|
Semantic.logger.error(e)
|
19
18
|
end
|
data/lib/semantic/parser.rb
CHANGED
@@ -1,13 +1,17 @@
|
|
1
1
|
require 'stemmer'
|
2
|
-
|
2
|
+
require "set"
|
3
3
|
module Semantic
|
4
4
|
class Parser
|
5
5
|
|
6
|
-
def initialize
|
7
|
-
#English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
8
|
-
#TODO: nicer way to reference stop file location?
|
9
|
-
|
10
|
-
|
6
|
+
def initialize(options = {})
|
7
|
+
# English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
8
|
+
# TODO: nicer way to reference stop file location?
|
9
|
+
@filter_stop_words = options[:filter_stop_words]
|
10
|
+
@stem_words = options[:stem_words]
|
11
|
+
if @filter_stop_words
|
12
|
+
File.open(File.dirname(__FILE__) + '/../../resources/english.stop', 'r') do |file|
|
13
|
+
@stopwords = Set.new(file.read().split())
|
14
|
+
end
|
11
15
|
end
|
12
16
|
end
|
13
17
|
|
@@ -16,7 +20,7 @@ module Semantic
|
|
16
20
|
remove_stop_words(word_list)
|
17
21
|
end
|
18
22
|
|
19
|
-
#remove any nasty grammar tokens from string
|
23
|
+
# remove any nasty grammar tokens from string
|
20
24
|
def clean(string)
|
21
25
|
string = string.gsub(".","")
|
22
26
|
string = string.gsub(/\s+/," ")
|
@@ -24,16 +28,24 @@ module Semantic
|
|
24
28
|
return string
|
25
29
|
end
|
26
30
|
|
27
|
-
#stop words are common words which have no search value
|
31
|
+
# stop words are common words which have no search value
|
28
32
|
def remove_stop_words(list)
|
29
|
-
|
33
|
+
if @filter_stop_words
|
34
|
+
list.select {|word| !@stopwords.include?(word) }
|
35
|
+
else
|
36
|
+
list
|
37
|
+
end
|
30
38
|
end
|
31
39
|
|
32
40
|
def tokenise_and_stem(string)
|
33
41
|
string = clean(string)
|
34
42
|
words = string.split(" ")
|
35
43
|
|
36
|
-
|
44
|
+
if @stem_words
|
45
|
+
words.map {|word| Stemmer.stem_word(word) }
|
46
|
+
else
|
47
|
+
words
|
48
|
+
end
|
37
49
|
end
|
38
50
|
|
39
51
|
end
|
data/lib/semantic/search.rb
CHANGED
@@ -1,35 +1,41 @@
|
|
1
1
|
module Semantic
|
2
2
|
class Search
|
3
3
|
|
4
|
-
def initialize(documents, options={})
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
def initialize(documents, options = {})
|
5
|
+
options = {
|
6
|
+
:transforms => [:TFIDF, :LSA],
|
7
|
+
:verbose => false,
|
8
|
+
:filter_stop_words => true,
|
9
|
+
:stem_words => true,
|
10
|
+
}.merge(options)
|
11
|
+
Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
|
12
|
+
|
13
|
+
|
14
|
+
@builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
|
15
|
+
@matrix_transformer = MatrixTransformer.new(options[:transforms])
|
9
16
|
|
10
17
|
@vector_space_model = @builder.build_document_matrix(documents)
|
11
|
-
|
18
|
+
|
12
19
|
Semantic.logger.info(@vector_space_model)
|
13
|
-
|
20
|
+
|
14
21
|
@vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
|
15
22
|
end
|
16
|
-
|
17
|
-
def related(
|
23
|
+
|
24
|
+
def related(document_id)
|
18
25
|
ratings = []
|
19
|
-
|
20
|
-
ratings << Compare.similarity(@vector_space_model.column(
|
26
|
+
@vector_space_model.each_column do |column|
|
27
|
+
ratings << Compare.similarity(@vector_space_model.column(document_id), column)
|
21
28
|
end
|
22
29
|
ratings
|
23
30
|
end
|
24
31
|
|
25
|
-
def search(
|
32
|
+
def search(search_list)
|
26
33
|
ratings = []
|
27
|
-
query_vector = @builder.build_query_vector(
|
28
|
-
|
29
|
-
ratings << Compare.similarity(query_vector,
|
34
|
+
query_vector = @builder.build_query_vector(search_list)
|
35
|
+
@vector_space_model.each_column do |column|
|
36
|
+
ratings << Compare.similarity(query_vector.col, column)
|
30
37
|
end
|
31
38
|
ratings
|
32
39
|
end
|
33
|
-
|
34
40
|
end
|
35
41
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Semantic
|
2
|
+
class SearchResult
|
3
|
+
include Comparable
|
4
|
+
|
5
|
+
attr_reader :document
|
6
|
+
attr_reader :score
|
7
|
+
def initialize(document, score)
|
8
|
+
@document = document
|
9
|
+
@score = score
|
10
|
+
end
|
11
|
+
|
12
|
+
def <=>(other)
|
13
|
+
@score <=> other.score
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -4,39 +4,64 @@ module Semantic
|
|
4
4
|
|
5
5
|
class << self
|
6
6
|
|
7
|
-
def transform(matrix,
|
8
|
-
|
7
|
+
def transform!(matrix, rank = nil)
|
8
|
+
# TODO configurable rank
|
9
|
+
columns = matrix.size2
|
9
10
|
|
10
|
-
|
11
|
-
|
11
|
+
u, v, sigma = matrix.SV_decomp_mod
|
12
|
+
reduce_dimensions!(sigma, rank)
|
13
|
+
sigma = GSL::Matrix.diagonal(sigma)
|
12
14
|
|
13
|
-
|
15
|
+
GSL::Matrix.swap(matrix, u * sigma * v.transpose)
|
16
|
+
end
|
14
17
|
|
15
|
-
|
18
|
+
private
|
19
|
+
def reduce_dimensions!(vector, rank)
|
20
|
+
# the vector is already sorted (biggest to smallest), so we
|
21
|
+
# only have to zero the elements we do not want
|
22
|
+
|
23
|
+
if rank.nil?
|
24
|
+
rank = determine_rank(vector)
|
16
25
|
else
|
17
|
-
|
26
|
+
rank = valid_rank(vector, rank)
|
18
27
|
end
|
19
|
-
|
20
|
-
|
28
|
+
|
29
|
+
num_to_zero_out = vector.size - rank
|
30
|
+
vector[rank, num_to_zero_out] = 0
|
21
31
|
end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
32
|
+
|
33
|
+
def determine_rank(vector)
|
34
|
+
if vector.size <= 15
|
35
|
+
# for 15 or fewer documents, n-1 is usually the best we
|
36
|
+
# can do. LSA generally works better with bigger data
|
37
|
+
# sets.
|
38
|
+
rank = vector.size - 1
|
39
|
+
elsif vector.size <= 1000
|
40
|
+
# ~500 is a value known to work well for really big data sets,
|
41
|
+
# but for less than that, it probably is too big, so we
|
42
|
+
# go for n/3 in this case.
|
43
|
+
rank = vector.size / 3
|
44
|
+
else
|
45
|
+
# if we have more than 1000 documents, using the magical
|
46
|
+
# number 500 (which can be found in various documents)
|
47
|
+
# seems to be the best guess for now.
|
48
|
+
rank = 500
|
27
49
|
end
|
28
|
-
matrix
|
29
|
-
end
|
30
|
-
|
31
|
-
def dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
|
32
|
-
(diagonal_matrix_length(matrix) - number_of_dimensions_to_reduce)...diagonal_matrix_length(matrix)
|
33
50
|
end
|
34
51
|
|
35
|
-
def
|
36
|
-
|
52
|
+
def valid_rank(vector, rank)
|
53
|
+
if rank <= 0
|
54
|
+
# for zero or negative ranks, drop that many dimensions (keep vector.size - |rank|)
|
55
|
+
rank = vector.size + rank
|
56
|
+
elsif rank > vector.size
|
57
|
+
# if the rank is > the vector size, limit it to that
|
58
|
+
rank = vector.size
|
59
|
+
else
|
60
|
+
rank
|
61
|
+
end
|
37
62
|
end
|
38
63
|
|
39
64
|
end
|
40
65
|
end
|
41
66
|
end
|
42
|
-
end
|
67
|
+
end
|