similarity 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/similarity.rb +4 -0
- data/lib/similarity/corpus.rb +85 -0
- data/lib/similarity/document.rb +56 -0
- data/lib/similarity/term_document_matrix.rb +33 -0
- metadata +104 -0
data/lib/similarity.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'gsl'
|
2
|
+
|
3
|
+
# A collection of documents together with the vocabulary seen across them.
#
# For every term the corpus records how many documents contain it, and it
# lazily builds (and caches) a TermDocumentMatrix plus the similarity matrix
# derived from it. Any change to the documents or the vocabulary discards the
# cached matrices so they are rebuilt on next access.
class Corpus
  # terms     - Hash of term => number of documents containing that term.
  # documents - Array of the documents added so far, in insertion order.
  attr_reader :terms, :documents

  def initialize
    @terms = {}
    @documents = []
    reset_cached_matrices
  end

  # Returns the number of documents in the corpus.
  def document_count
    @documents.size
  end

  # Adds a document to the corpus and updates the per-term document counts.
  #
  # document - any object responding to #terms with an Array of terms.
  #
  # Each distinct term of the document is counted once, however often it
  # occurs inside the document. Cached matrices are discarded because they no
  # longer reflect the corpus contents.
  #
  # Returns the internal documents Array (not the corpus itself).
  def <<(document)
    document.terms.uniq.each do |term|
      @terms[term] = document_count_for_term(term) + 1
    end
    reset_cached_matrices
    @documents << document
  end

  # Drops every term that occurs in less than the given fraction of documents.
  #
  # percentage - fraction (compared against count / document_count).
  #
  # Returns nil.
  def remove_infrequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) < percentage }
    reset_cached_matrices
  end

  # Drops every term that occurs in more than the given fraction of documents.
  #
  # percentage - fraction (compared against count / document_count).
  #
  # Returns nil.
  def remove_frequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) > percentage }
    reset_cached_matrices
  end

  # Natural log of document_count / (1 + documents containing the term).
  # The added 1 keeps the denominator non-zero for terms unknown to the corpus.
  def inverse_document_frequency(term)
    puts "#{document_count} / (1 + #{document_count_for_term(term)})" if $DEBUG
    Math.log(document_count.to_f / (1 + document_count_for_term(term)))
  end

  # Returns how many documents contain the term, or 0 for an unknown term.
  def document_count_for_term(term)
    @terms[term] || 0
  end

  # Returns the (cached) similarity matrix of the term-document matrix.
  def similarity_matrix
    @similarity_matrix ||= term_document_matrix.similarity_matrix
  end

  # Returns the (cached) TermDocumentMatrix built from this corpus.
  def term_document_matrix
    @term_document_matrix ||= TermDocumentMatrix.new(self)
  end

  # Lists the non-zero term weights of one document.
  #
  # document - a document previously added to this corpus.
  #
  # Returns an Array of [term, weight] pairs sorted by descending weight.
  # Raises ArgumentError if the document is not part of the corpus.
  def weights(document)
    idx = @documents.index(document)
    raise ArgumentError, "document is not part of this corpus" if idx.nil?

    # Pair each vocabulary term with the matching entry of the document's column.
    term_weight_pairs = @terms.keys.zip(term_document_matrix.col(idx).to_a)

    # remove zero weights
    term_weight_pairs.reject! { |pair| pair[1].zero? }

    # sort in descending order
    term_weight_pairs.sort { |x, y| y[1] <=> x[1] }
  end

  private

  # Forgets the cached matrices so that they are rebuilt on next access.
  def reset_cached_matrices
    @term_document_matrix = nil
    @similarity_matrix = nil
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# A single text document that can be tokenised into terms and queried for
# relative term frequencies.
class Document
  # content - the raw text the document was created with.
  # id      - the caller-supplied id, or the object's object_id if none given.
  attr_reader :content, :id

  # hash_args - Hash with:
  #             :content - the document text (required, must not be empty).
  #             :id      - optional identifier; defaults to object_id.
  #
  # Raises ArgumentError if :content is missing or empty.
  def initialize(hash_args)
    content = hash_args[:content]
    raise ArgumentError, "text cannot be nil or blank" if !content || content.empty?

    @content = content
    @term_frequencies = nil
    @terms = nil
    @id = hash_args[:id] || object_id
  end

  # Returns the document's terms as an Array of lower-cased Strings, in order
  # of appearance and including repeats. The result is memoised.
  #
  # Runs of digits, whitespace and non-word characters are collapsed to one
  # space. Splitting on ' ' (awk-style) then ignores a leading space, so
  # content that starts with a digit or punctuation does not yield an empty
  # "" term.
  def terms
    @terms ||=
      @content.gsub(/(\d|\s|\W)+/, ' ').
        split(' ').map { |term| term.downcase }
  end

  # Returns a memoised Hash of term => relative frequency within the document.
  def term_frequencies
    @term_frequencies ||= calculate_term_frequencies
  end

  # Builds the Hash of term => (occurrences / total number of terms).
  # Values are Floats; terms absent from the document have no entry.
  def calculate_term_frequencies
    total_number_of_terms = terms.size.to_f

    counts = terms.each_with_object({}) do |term, acc|
      acc[term] = acc.fetch(term, 0) + 1
    end

    counts.each_with_object({}) do |(term, count), tf|
      tf[term] = count / total_number_of_terms
    end
  end

  # Returns the relative frequency of the term, or 0 if the document does not
  # contain it.
  def term_frequency(term)
    term_frequencies[term] || 0
  end

  # Returns true if the (lower-cased) term occurs in the document.
  def has_term?(term)
    terms.include? term
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'gsl'
|
2
|
+
|
3
|
+
# TF-IDF weighted term-document matrix for a corpus.
#
# Rows correspond to the corpus terms (see #labels), columns to the corpus
# documents in corpus order. Every column is scaled by its norm, so the
# product of the transposed matrix with itself (see #similarity_matrix)
# holds the pairwise dot products of the scaled document columns.
class TermDocumentMatrix
  # matrix - the underlying GSL::Matrix (terms x documents).
  # labels - Array of terms, one per matrix row, in row order.
  attr_reader :matrix, :labels

  # corpus - object providing #terms (Hash keyed by term), #documents,
  #          #document_count and #inverse_document_frequency(term); each
  #          document must respond to #term_frequency(term).
  def initialize(corpus)
    @labels = corpus.terms.keys
    @matrix = GSL::Matrix.alloc(@labels.size, corpus.document_count)

    @labels.each_with_index do |term, term_index|
      # The IDF depends only on the term, so compute it once per row rather
      # than once per matrix cell.
      idf = corpus.inverse_document_frequency(term)

      corpus.documents.each_with_index do |document, document_index|
        @matrix[term_index, document_index] = document.term_frequency(term) * idf
      end
    end

    # Scale each document column by its norm. A column whose norm is zero
    # (no weight for any term in the vocabulary) is left untouched: dividing
    # it would fill the column, and then the similarity matrix, with NaN.
    # NOTE(review): relies on each_col yielding views that write through to
    # the matrix, exactly as the original in-place div! did.
    @matrix.each_col do |col|
      norm = col.norm
      col.div!(norm) unless norm.zero?
    end
  end

  # Returns the documents x documents matrix: transpose(matrix) * matrix.
  def similarity_matrix
    self.matrix.transpose * self.matrix
  end

  # Returns the matrix column for the document at index idx.
  def col(idx)
    @matrix.col(idx)
  end

  # Returns the matrix converted with the underlying matrix's #to_a.
  def to_a
    @matrix.to_a
  end
end
|
metadata
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: similarity
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.2.3
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Chris Lowis
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-05-25 00:00:00 +01:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: gsl
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: "0"
|
36
|
+
type: :development
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: faker
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: ruby-graphviz
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: "0"
|
58
|
+
type: :development
|
59
|
+
version_requirements: *id004
|
60
|
+
description: |
|
61
|
+
Document similarity calculations using cosine similarity and TF-IDF weights (uses the GSL bindings for matrix operations)
|
62
|
+
|
63
|
+
email: chris.lowis@bbc.co.uk
|
64
|
+
executables: []
|
65
|
+
|
66
|
+
extensions: []
|
67
|
+
|
68
|
+
extra_rdoc_files: []
|
69
|
+
|
70
|
+
files:
|
71
|
+
- lib/similarity/corpus.rb
|
72
|
+
- lib/similarity/document.rb
|
73
|
+
- lib/similarity/term_document_matrix.rb
|
74
|
+
- lib/similarity.rb
|
75
|
+
has_rdoc: true
|
76
|
+
homepage: ""
|
77
|
+
licenses: []
|
78
|
+
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
96
|
+
requirements: []
|
97
|
+
|
98
|
+
rubyforge_project: similarity
|
99
|
+
rubygems_version: 1.5.0
|
100
|
+
signing_key:
|
101
|
+
specification_version: 3
|
102
|
+
summary: Document similarity calculations using cosine similarity and TF-IDF weights
|
103
|
+
test_files: []
|
104
|
+
|