similarity 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/similarity.rb +4 -0
- data/lib/similarity/corpus.rb +85 -0
- data/lib/similarity/document.rb +56 -0
- data/lib/similarity/term_document_matrix.rb +33 -0
- metadata +104 -0
data/lib/similarity.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'gsl'
|
2
|
+
|
3
|
+
# A collection of documents together with, for every term, the number of
# documents that contain it. The term/document matrix and the similarity
# matrix derived from it are built lazily and memoized; every operation that
# changes the documents or the term table discards the memoized matrices.
class Corpus
  attr_reader :terms, :documents

  def initialize
    @terms = {}      # term => number of documents containing the term
    @documents = []
    @term_document_matrix = nil
    @similarity_matrix = nil
  end

  # Number of documents added so far.
  def document_count
    @documents.size
  end

  # Adds a document and counts each of its distinct terms once.
  #
  # Returns the internal documents array (as it always has), so chaining
  # `corpus << a << b` would push `b` straight onto that array without
  # updating the term counts. Add documents one statement at a time.
  def <<(document)
    document.terms.uniq.each do |term|
      @terms[term] = document_count_for_term(term) + 1
    end
    # The memoized matrices were built from the previous document set.
    invalidate_caches
    @documents << document
  end

  # Drops every term whose document frequency (documents containing the term
  # divided by the total number of documents) is below +percentage+.
  def remove_infrequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) < percentage }
    invalidate_caches
  end

  # Drops every term whose document frequency is above +percentage+.
  def remove_frequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) > percentage }
    invalidate_caches
  end

  # Natural log of (document count / (1 + documents containing +term+)).
  # The +1 keeps the denominator non-zero for terms the corpus has never seen.
  def inverse_document_frequency(term)
    puts "#{document_count} / (1 + #{document_count_for_term(term)})" if $DEBUG
    Math.log(document_count.to_f / (1 + document_count_for_term(term)))
  end

  # Number of documents containing +term+; 0 for an unknown term.
  def document_count_for_term(term)
    @terms.fetch(term, 0)
  end

  # Memoized similarity matrix obtained from the term/document matrix.
  def similarity_matrix
    @similarity_matrix ||= term_document_matrix.similarity_matrix
  end

  # Memoized TermDocumentMatrix built from the current corpus state.
  def term_document_matrix
    @term_document_matrix ||= TermDocumentMatrix.new(self)
  end

  # Returns [term, weight] pairs for +document+, taken from its column of the
  # term/document matrix, without zero weights and sorted by descending weight.
  #
  # Raises ArgumentError when +document+ was never added to this corpus.
  def weights(document)
    idx = @documents.index(document)
    raise ArgumentError, "document is not part of this corpus" if idx.nil?

    column = term_document_matrix.col(idx).to_a

    # Matrix rows follow the iteration order of @terms, so keys and column
    # entries line up position by position.
    @terms.keys.zip(column).
      reject { |_term, weight| weight.zero? }.
      sort_by { |_term, weight| -weight }
  end

  private

  # Forget the memoized matrices; they are rebuilt on next access.
  def invalidate_caches
    @term_document_matrix = nil
    @similarity_matrix = nil
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# A piece of text with an identifier, exposing its terms (lower-cased words)
# and their relative frequencies within the text.
class Document
  attr_reader :content, :id

  # hash_args[:content] - the document text; must be present and non-empty.
  # hash_args[:id]      - optional identifier; defaults to this object's
  #                       object_id.
  #
  # Raises ArgumentError when the content is nil or empty.
  def initialize(hash_args)
    content = hash_args[:content]
    unless content && !content.empty?
      raise ArgumentError, "text cannot be nil or blank"
    end

    @content = content
    @terms = nil
    @term_frequencies = nil
    @id = hash_args[:id] || object_id
  end

  # Memoized list of terms in order of appearance, duplicates included.
  # Runs of digits, whitespace and non-word characters act as separators.
  def terms
    # split(' ') ignores leading whitespace, so content that starts with a
    # separator does not produce an empty-string term.
    @terms ||=
      @content.gsub(/(\d|\s|\W)+/, ' ').
        split(' ').map { |term| term.downcase }
  end

  # Memoized hash of term => occurrences / total number of terms.
  def term_frequencies
    @term_frequencies ||= calculate_term_frequencies
  end

  # Builds the term => relative frequency hash from scratch.
  def calculate_term_frequencies
    counts = Hash.new(0)
    terms.each { |term| counts[term] += 1 }

    total_number_of_terms = terms.size.to_f
    # Return a plain hash so that looking up an unknown term still yields nil.
    frequencies = {}
    counts.each_pair { |term, count| frequencies[term] = count / total_number_of_terms }
    frequencies
  end

  # Relative frequency of +term+ in this document; 0 when it does not occur.
  def term_frequency(term)
    term_frequencies[term] || 0
  end

  # True when +term+ (already lower-cased) occurs in this document.
  def has_term?(term)
    terms.include? term
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'gsl'
|
2
|
+
|
3
|
+
# Term-by-document matrix of TF-IDF weights: one row per corpus term, one
# column per corpus document, each non-zero column scaled to unit norm.
class TermDocumentMatrix
  attr_reader :matrix, :labels

  # Builds the matrix from +corpus+, which must respond to #terms (a hash
  # keyed by term), #documents, #document_count and
  # #inverse_document_frequency(term). Each document must respond to
  # #term_frequency(term).
  def initialize(corpus)
    # Row labels, in the same order the rows are filled below.
    @labels = corpus.terms.keys
    @matrix = GSL::Matrix.alloc(@labels.size, corpus.document_count)

    # IDF depends only on the term, so compute it once per term rather than
    # once per (document, term) pair.
    idfs = @labels.map { |term| corpus.inverse_document_frequency(term) }

    corpus.documents.each_with_index do |document, document_index|
      @labels.each_with_index do |term, term_index|
        weight = document.term_frequency(term) * idfs[term_index]
        @matrix[term_index, document_index] = weight
      end
    end

    # Normalize each column. A document sharing no term with the corpus has
    # an all-zero column; dividing it by its zero norm would fill it with NaN
    # and contaminate the similarity matrix, so such columns stay zero.
    @matrix.each_col do |col|
      norm = col.norm
      col.div!(norm) unless norm.zero?
    end
  end

  # Document-by-document matrix of dot products between the columns
  # (transpose of the matrix multiplied by the matrix).
  def similarity_matrix
    matrix.transpose * matrix
  end

  # Column +idx+ of the matrix, i.e. the weights of one document.
  def col(idx)
    @matrix.col(idx)
  end

  # The matrix converted with the underlying matrix object's #to_a.
  def to_a
    @matrix.to_a
  end
end
|
metadata
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: similarity
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.2.3
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Chris Lowis
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-05-25 00:00:00 +01:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: gsl
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: "0"
|
36
|
+
type: :development
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: faker
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: ruby-graphviz
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: "0"
|
58
|
+
type: :development
|
59
|
+
version_requirements: *id004
|
60
|
+
description: |
|
61
|
+
Pure-ruby document similarity calculations using cosine similarity and TF-IDF weights
|
62
|
+
|
63
|
+
email: chris.lowis@bbc.co.uk
|
64
|
+
executables: []
|
65
|
+
|
66
|
+
extensions: []
|
67
|
+
|
68
|
+
extra_rdoc_files: []
|
69
|
+
|
70
|
+
files:
|
71
|
+
- lib/similarity/corpus.rb
|
72
|
+
- lib/similarity/document.rb
|
73
|
+
- lib/similarity/term_document_matrix.rb
|
74
|
+
- lib/similarity.rb
|
75
|
+
has_rdoc: true
|
76
|
+
homepage: ""
|
77
|
+
licenses: []
|
78
|
+
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
96
|
+
requirements: []
|
97
|
+
|
98
|
+
rubyforge_project: similarity
|
99
|
+
rubygems_version: 1.5.0
|
100
|
+
signing_key:
|
101
|
+
specification_version: 3
|
102
|
+
summary: Document similarity calculations using cosine similarity and TF-IDF weights
|
103
|
+
test_files: []
|
104
|
+
|