similarity 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/similarity.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'similarity/corpus'
2
+ require 'similarity/document'
3
+ require 'similarity/term_document_matrix'
4
+
@@ -0,0 +1,85 @@
1
+ require 'gsl'
2
+
3
# A collection of documents plus, for every term, the number of documents
# that contain it. The TF-IDF term-document matrix and the derived
# similarity matrix are built lazily and memoized until the corpus changes.
class Corpus
  # terms     - Hash mapping each term to the number of documents containing it.
  # documents - Array of the documents added so far, in insertion order.
  attr_reader :terms, :documents

  def initialize
    @terms = {}
    @documents = []
    @term_document_matrix = nil
    @similarity_matrix = nil
  end

  # Returns the number of documents in the corpus.
  def document_count
    @documents.size
  end

  # Adds a document to the corpus and bumps the document count of every
  # distinct term it contains.
  #
  # document - any object responding to #terms with an Array of terms.
  #
  # Returns the corpus itself so that `corpus << a << b` routes every
  # document through this method (returning the internal array would make
  # the second append bypass the term counting).
  def <<(document)
    document.terms.uniq.each do |term|
      @terms[term] = document_count_for_term(term) + 1
    end
    @documents << document

    # Any matrix built before this point no longer covers all documents.
    invalidate_caches
    self
  end

  # Drops every term that occurs in less than the given share of documents.
  #
  # percentage - threshold compared against count / document_count, i.e. a
  #              fraction such as 0.1 rather than 10.
  #
  # Returns nil.
  def remove_infrequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) < percentage }
    invalidate_caches
  end

  # Drops every term that occurs in more than the given share of documents.
  #
  # percentage - threshold compared against count / document_count, i.e. a
  #              fraction such as 0.9 rather than 90.
  #
  # Returns nil.
  def remove_frequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) > percentage }
    invalidate_caches
  end

  # Returns log(N / (1 + df)) where N is the number of documents and df the
  # number of documents containing the term. The +1 keeps the denominator
  # non-zero for unknown terms.
  def inverse_document_frequency(term)
    puts "#{document_count} / (1 + #{document_count_for_term(term)})" if $DEBUG
    Math.log(document_count.to_f / (1 + document_count_for_term(term)))
  end

  # Returns the number of documents containing the term, 0 if it is unknown.
  def document_count_for_term(term)
    @terms[term] || 0
  end

  # Returns the memoized document-by-document similarity matrix, building it
  # from the term-document matrix on first use.
  def similarity_matrix
    @similarity_matrix ||= term_document_matrix.similarity_matrix
  end

  # Returns the memoized TermDocumentMatrix for this corpus, building it on
  # first use.
  def term_document_matrix
    @term_document_matrix ||= TermDocumentMatrix.new(self)
  end

  # Returns the non-zero term weights of a document as an Array of
  # [term, weight] pairs sorted by descending weight.
  #
  # Raises ArgumentError if the document has not been added to this corpus.
  def weights(document)
    idx = @documents.index(document)
    raise ArgumentError, "document is not part of this corpus" if idx.nil?

    column = term_document_matrix.col(idx).to_a

    # Matrix rows follow the iteration order of @terms, so keys line up
    # with the column entries.
    pairs = @terms.keys.zip(column).reject { |_term, weight| weight.zero? }
    pairs.sort { |x, y| y[1] <=> x[1] }
  end

  private

  # Forgets the memoized matrices so they are rebuilt on next access.
  # Returns nil.
  def invalidate_caches
    @term_document_matrix = nil
    @similarity_matrix = nil
  end
end
@@ -0,0 +1,56 @@
1
# A piece of text together with its tokenized terms and their relative
# frequencies within the text.
class Document
  # content - the raw text passed to the constructor.
  # id      - the caller-supplied identifier, or this object's object_id.
  attr_reader :content, :id

  # hash_args - Hash with:
  #             :content - non-empty text of the document (required).
  #             :id      - optional identifier; defaults to object_id.
  #
  # Raises ArgumentError if :content is nil, false or empty.
  def initialize(hash_args)
    content = hash_args[:content]
    if !content || content.empty?
      raise ArgumentError, "text cannot be nil or blank"
    end

    @content = content
    @terms = nil
    # Same name as the memo used by #term_frequencies.
    @term_frequencies = nil
    @id = hash_args[:id] || object_id
  end

  # Returns the lower-cased terms of the content, in order of appearance.
  # Every run of digits, whitespace and non-word characters acts as a
  # separator.
  def terms
    # split(' ') ignores leading whitespace, so content that starts with a
    # separator (e.g. a quote or a digit) does not produce an empty "" term.
    @terms ||=
      @content.gsub(/(\d|\s|\W)+/, ' ').
        split(' ').map { |term| term.downcase }
  end

  # Returns the memoized Hash of term => relative frequency.
  def term_frequencies
    @term_frequencies ||= calculate_term_frequencies
  end

  # Builds a Hash mapping each term to (occurrences / total number of terms).
  def calculate_term_frequencies
    counts = {}
    terms.each { |term| counts[term] = (counts[term] || 0) + 1 }

    total_number_of_terms = terms.size.to_f
    frequencies = {}
    counts.each_pair { |term, count| frequencies[term] = count / total_number_of_terms }
    frequencies
  end

  # Returns the relative frequency of the term, or 0 if it does not occur.
  def term_frequency(term)
    term_frequencies[term] || 0
  end

  # Returns true if the (lower-cased) term occurs in the document.
  def has_term?(term)
    terms.include? term
  end
end
@@ -0,0 +1,33 @@
1
+ require 'gsl'
2
+
3
# TF-IDF weighted term-document matrix: one row per corpus term, one column
# per document, with every non-zero column scaled to unit length.
class TermDocumentMatrix
  # matrix - the GSL::Matrix of weights (terms x documents).
  # labels - Array of terms, in the same order as the matrix rows.
  attr_reader :matrix, :labels

  # corpus - object responding to #terms (Hash keyed by term), #documents,
  #          #document_count and #inverse_document_frequency(term); each
  #          document must respond to #term_frequency(term).
  def initialize(corpus)
    @labels = corpus.terms.keys
    @matrix = GSL::Matrix.alloc(corpus.terms.size, corpus.document_count)

    # IDF depends only on the term, so compute it once per term instead of
    # once per (document, term) pair.
    idfs = @labels.map { |term| corpus.inverse_document_frequency(term) }

    corpus.documents.each_with_index do |document, document_index|
      @labels.each_with_index do |term, term_index|
        weight = document.term_frequency(term) * idfs[term_index]
        @matrix[term_index, document_index] = weight
      end
    end

    # Scale each column to unit length so that the product in
    # #similarity_matrix yields cosine similarities. A document that shares
    # no term with the vocabulary has an all-zero column; dividing it by its
    # zero norm would fill it with NaN, so it is left untouched.
    @matrix.each_col do |col|
      norm = col.norm
      col.div!(norm) unless norm.zero?
    end
  end

  # Returns the documents x documents matrix of column dot products.
  def similarity_matrix
    matrix.transpose * matrix
  end

  # Returns the weight column of the document at index idx.
  def col(idx)
    @matrix.col(idx)
  end

  # Returns the matrix converted with GSL's #to_a.
  def to_a
    @matrix.to_a
  end
end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: similarity
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.2.3
6
+ platform: ruby
7
+ authors:
8
+ - Chris Lowis
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-05-25 00:00:00 +01:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: gsl
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: "0"
36
+ type: :development
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: faker
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ type: :development
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: ruby-graphviz
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ type: :development
59
+ version_requirements: *id004
60
+ description: |
61
+ Pure-ruby document similarity calculations using cosine similarity and TF-IDF weights
62
+
63
+ email: chris.lowis@bbc.co.uk
64
+ executables: []
65
+
66
+ extensions: []
67
+
68
+ extra_rdoc_files: []
69
+
70
+ files:
71
+ - lib/similarity/corpus.rb
72
+ - lib/similarity/document.rb
73
+ - lib/similarity/term_document_matrix.rb
74
+ - lib/similarity.rb
75
+ has_rdoc: true
76
+ homepage: ""
77
+ licenses: []
78
+
79
+ post_install_message:
80
+ rdoc_options: []
81
+
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
96
+ requirements: []
97
+
98
+ rubyforge_project: similarity
99
+ rubygems_version: 1.5.0
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: Document similarity calculations using cosine similarity and TF-IDF weights
103
+ test_files: []
104
+