similarity 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,6 +68,17 @@ class Corpus
68
68
  end
69
69
  end
70
70
 
71
+ def similar_documents(document)
72
+ index = documents.index(document)
73
+ return nil if index.nil?
74
+
75
+ results = documents.each_with_index.map do |doc, doc_index|
76
+ similarity = similarity_matrix[index, doc_index]
77
+ [doc, similarity]
78
+ end
79
+ results.sort { |a,b| b.last <=> a.last }
80
+ end
81
+
71
82
  def weights(document)
72
83
  idx = @documents.index(document)
73
84
  terms = @terms.to_a.map {|term| term.first}
@@ -1,17 +1,22 @@
1
1
  require 'gsl'
2
2
 
3
3
  class TermDocumentMatrix
4
- attr_reader :matrix, :labels
4
+ attr_reader :matrix, :labels, :number_of_terms, :number_of_documents, :non_zeros
5
5
 
6
6
  def initialize(corpus)
7
7
  @matrix = GSL::Matrix.alloc(corpus.terms.size, corpus.document_count)
8
+ @non_zeros = 0
9
+ @number_of_terms = corpus.terms.size
10
+ @number_of_documents = corpus.documents.size
8
11
 
9
- corpus.documents.each_with_index do |document, document_index|
10
- corpus.terms.each_with_index do |term, term_index|
11
- term = term.first
12
- idf = corpus.inverse_document_frequency(term)
12
+ corpus.terms.each_with_index do |term, term_index|
13
+ term = term.first
14
+ idf = corpus.inverse_document_frequency(term)
15
+
16
+ corpus.documents.each_with_index do |document, document_index|
13
17
  weight = document.term_frequency(term) * idf
14
18
  @matrix[term_index, document_index] = weight
19
+ @non_zeros += 1 unless weight.zero?
15
20
  end
16
21
  end
17
22
 
metadata CHANGED
@@ -1,104 +1,115 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: similarity
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.5
4
5
  prerelease:
5
- version: 0.2.4
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Chris Lowis
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-05-25 00:00:00 +01:00
14
- default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
12
+ date: 2011-05-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
17
15
  name: gsl
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
20
17
  none: false
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
24
- version: "0"
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
25
22
  type: :runtime
26
- version_requirements: *id001
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
23
  prerelease: false
30
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
31
33
  none: false
32
- requirements:
33
- - - ">="
34
- - !ruby/object:Gem::Version
35
- version: "0"
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
36
38
  type: :development
37
- version_requirements: *id002
38
- - !ruby/object:Gem::Dependency
39
- name: faker
40
39
  prerelease: false
41
- requirement: &id003 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: faker
48
+ requirement: !ruby/object:Gem::Requirement
42
49
  none: false
43
- requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: "0"
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
47
54
  type: :development
48
- version_requirements: *id003
49
- - !ruby/object:Gem::Dependency
50
- name: ruby-graphviz
51
55
  prerelease: false
52
- requirement: &id004 !ruby/object:Gem::Requirement
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: ruby-graphviz
64
+ requirement: !ruby/object:Gem::Requirement
53
65
  none: false
54
- requirements:
55
- - - ">="
56
- - !ruby/object:Gem::Version
57
- version: "0"
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
58
70
  type: :development
59
- version_requirements: *id004
60
- description: |
61
- Document similarity calculations using cosine similarity and TF-IDF weights
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: ! 'Document similarity calculations using cosine similarity and TF-IDF
79
+ weights
62
80
 
81
+ '
63
82
  email: chris.lowis@bbc.co.uk
64
83
  executables: []
65
-
66
84
  extensions: []
67
-
68
85
  extra_rdoc_files: []
69
-
70
- files:
86
+ files:
71
87
  - lib/similarity/corpus.rb
72
88
  - lib/similarity/document.rb
73
89
  - lib/similarity/term_document_matrix.rb
74
90
  - lib/similarity.rb
75
- has_rdoc: true
76
- homepage: ""
91
+ homepage: ''
77
92
  licenses: []
78
-
79
93
  post_install_message:
80
94
  rdoc_options: []
81
-
82
- require_paths:
95
+ require_paths:
83
96
  - lib
84
- required_ruby_version: !ruby/object:Gem::Requirement
97
+ required_ruby_version: !ruby/object:Gem::Requirement
85
98
  none: false
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: "0"
90
- required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
104
  none: false
92
- requirements:
93
- - - ">="
94
- - !ruby/object:Gem::Version
95
- version: "0"
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
96
109
  requirements: []
97
-
98
110
  rubyforge_project: similarity
99
- rubygems_version: 1.5.0
111
+ rubygems_version: 1.8.23
100
112
  signing_key:
101
113
  specification_version: 3
102
114
  summary: Document similarity calculations using cosine similarity and TF-IDF weights
103
115
  test_files: []
104
-