similarity 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/similarity.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'similarity/corpus'
2
+ require 'similarity/document'
3
+ require 'similarity/term_document_matrix'
4
+
@@ -0,0 +1,85 @@
1
+ require 'gsl'
2
+
3
# A collection of documents together with, for every term, the number of
# documents that contain it. The term-document matrix and the similarity
# matrix derived from it are built lazily and memoised; every mutation of the
# corpus (adding a document, pruning terms) discards the memoised matrices.
class Corpus
  attr_reader :terms, :documents

  def initialize
    @terms = {}      # term => number of documents containing the term
    @documents = []
    reset_cached_matrices
  end

  # @return [Integer] number of documents added so far
  def document_count
    @documents.size
  end

  # Adds a document and bumps the document count of each distinct term in it.
  #
  # @param document [#terms] object whose +terms+ returns an array of terms
  # @return [Array] the corpus' internal document list (as before; note that
  #   appending to the returned array directly bypasses term counting)
  def <<(document)
    document.terms.uniq.each do |term|
      @terms[term] = document_count_for_term(term) + 1
    end
    # Matrices built before this document was added no longer describe the
    # corpus, so they must be rebuilt on next access.
    reset_cached_matrices
    @documents << document
  end

  # Drops every term that occurs in less than +percentage+ (a fraction
  # between 0 and 1) of the documents.
  #
  # @return [nil]
  def remove_infrequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) < percentage }
    reset_cached_matrices
  end

  # Drops every term that occurs in more than +percentage+ (a fraction
  # between 0 and 1) of the documents.
  #
  # @return [nil]
  def remove_frequent_terms!(percentage)
    number_of_docs = document_count.to_f
    @terms.delete_if { |_term, count| (count.to_f / number_of_docs) > percentage }
    reset_cached_matrices
  end

  # Computes log(N / (1 + df)) where N is the number of documents and df the
  # number of documents containing +term+. The +1 keeps the denominator
  # non-zero for unknown terms; as a consequence the value is negative for a
  # term present in every document.
  #
  # @return [Float]
  def inverse_document_frequency(term)
    puts "#{document_count} / (1 + #{document_count_for_term(term)})" if $DEBUG
    Math.log(document_count.to_f / (1 + document_count_for_term(term)))
  end

  # @return [Integer] number of documents containing +term+ (0 if unknown)
  def document_count_for_term(term)
    @terms[term] || 0
  end

  # @return the memoised result of +term_document_matrix.similarity_matrix+
  def similarity_matrix
    @similarity_matrix ||= term_document_matrix.similarity_matrix
  end

  # @return [TermDocumentMatrix] memoised matrix built from this corpus
  def term_document_matrix
    @term_document_matrix ||= TermDocumentMatrix.new(self)
  end

  # Lists the non-zero term weights of +document+, heaviest first.
  #
  # @param document a document previously added with +<<+
  # @return [Array<Array>] +[term, weight]+ pairs in descending weight order
  # @raise [ArgumentError] if +document+ is not part of this corpus
  def weights(document)
    idx = @documents.index(document)
    raise ArgumentError, "document is not part of this corpus" if idx.nil?

    column = term_document_matrix.col(idx).to_a

    # Pair each term with its weight; this relies on the hash iterating in
    # the same order as when the matrix rows were filled.
    term_weight_pairs = @terms.keys.zip(column)
    non_zero_pairs = term_weight_pairs.reject { |_term, weight| weight.zero? }
    non_zero_pairs.sort { |x, y| y[1] <=> x[1] }
  end

  private

  # Forgets both memoised matrices so they are rebuilt on next access.
  def reset_cached_matrices
    @term_document_matrix = nil
    @similarity_matrix = nil
  end
end
@@ -0,0 +1,56 @@
1
# A single text document: the raw content, an identifier, and lazily
# computed term statistics.
class Document
  attr_reader :content, :id

  # @param hash_args [Hash] +:content+ (required, non-empty) and +:id+
  #   (optional; defaults to this object's +object_id+)
  # @raise [ArgumentError] if +:content+ is missing or empty
  def initialize(hash_args)
    content = hash_args[:content]
    raise ArgumentError, "text cannot be nil or blank" unless content && !content.empty?

    @content = content
    @terms = nil
    @term_frequencies = nil
    @id = hash_args[:id] || object_id
  end

  # Splits the content into lower-cased terms. Every run of digits,
  # whitespace and non-word characters acts as a single separator.
  #
  # @return [Array<String>] terms in order of appearance, duplicates kept
  def terms
    # A bare +split+ discards the empty leading field that would otherwise
    # appear when the content starts with a separator (e.g. " foo" or "1. foo").
    @terms ||= @content.gsub(/(\d|\s|\W)+/, ' ').split.map { |term| term.downcase }
  end

  # @return [Hash{String => Float}] memoised relative frequency of each term
  def term_frequencies
    @term_frequencies ||= calculate_term_frequencies
  end

  # Counts each term and divides by the total number of terms.
  #
  # @return [Hash{String => Float}] empty when the document has no terms
  def calculate_term_frequencies
    total_number_of_terms = terms.size.to_f
    counts = Hash.new(0)
    terms.each { |term| counts[term] += 1 }
    # Build a plain hash (no default value) so unknown terms still read as nil.
    counts.each_with_object({}) do |(term, count), tf|
      tf[term] = count / total_number_of_terms
    end
  end

  # @return [Float, Integer] relative frequency of +term+, or 0 if absent
  def term_frequency(term)
    term_frequencies[term] || 0
  end

  # @return [Boolean] whether +term+ (compared as given, without
  #   down-casing) occurs in the document
  def has_term?(term)
    terms.include? term
  end
end
@@ -0,0 +1,33 @@
1
+ require 'gsl'
2
+
3
# TF-IDF weighted term-by-document matrix for a corpus: one row per corpus
# term, one column per document, each non-empty column scaled to unit length.
class TermDocumentMatrix
  attr_reader :matrix, :labels

  # @param corpus object providing +terms+ (enumerable of term/count pairs),
  #   +documents+, +document_count+ and +inverse_document_frequency(term)+;
  #   each document must respond to +term_frequency(term)+
  def initialize(corpus)
    @labels = corpus.terms.map { |entry| entry.first }
    @matrix = GSL::Matrix.alloc(@labels.size, corpus.document_count)

    # The IDF depends only on the term, so compute it once per term instead
    # of once per (document, term) pair.
    idfs = @labels.map { |term| corpus.inverse_document_frequency(term) }

    corpus.documents.each_with_index do |document, document_index|
      @labels.each_with_index do |term, term_index|
        @matrix[term_index, document_index] = document.term_frequency(term) * idfs[term_index]
      end
    end

    @matrix.each_col do |col|
      norm = col.norm
      # A document sharing no term with the corpus has an all-zero column;
      # dividing it by its zero norm would fill the column with NaN.
      col.div!(norm) unless norm.zero?
    end
  end

  # Product of the transposed matrix with the matrix itself; with unit-length
  # columns each entry is the cosine similarity of two documents.
  def similarity_matrix
    matrix.transpose * matrix
  end

  # @param idx [Integer] document (column) index
  # @return the column of weights for that document
  def col(idx)
    @matrix.col(idx)
  end

  # @return the underlying matrix converted with its own +to_a+
  def to_a
    @matrix.to_a
  end
end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: similarity
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.2.3
6
+ platform: ruby
7
+ authors:
8
+ - Chris Lowis
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-05-25 00:00:00 +01:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: gsl
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: "0"
36
+ type: :development
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: faker
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ type: :development
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: ruby-graphviz
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ type: :development
59
+ version_requirements: *id004
60
+ description: |
61
+ Pure-ruby document similarity calculations using cosine similarity and TF-IDF weights
62
+
63
+ email: chris.lowis@bbc.co.uk
64
+ executables: []
65
+
66
+ extensions: []
67
+
68
+ extra_rdoc_files: []
69
+
70
+ files:
71
+ - lib/similarity/corpus.rb
72
+ - lib/similarity/document.rb
73
+ - lib/similarity/term_document_matrix.rb
74
+ - lib/similarity.rb
75
+ has_rdoc: true
76
+ homepage: ""
77
+ licenses: []
78
+
79
+ post_install_message:
80
+ rdoc_options: []
81
+
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
96
+ requirements: []
97
+
98
+ rubyforge_project: similarity
99
+ rubygems_version: 1.5.0
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: Document similarity calculations using cosine similarity and TF-IDF weights
103
+ test_files: []
104
+