similarity 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -68,6 +68,17 @@ class Corpus
68
68
  end
69
69
  end
70
70
 
71
# Ranks every document in the corpus by its similarity to +document+.
#
# The result includes +document+ itself, paired with its self-similarity
# score as read from the similarity matrix.
#
# @param document [Object] a document expected to be present in +documents+
# @return [Array<Array>, nil] +[doc, similarity]+ pairs ordered from the
#   highest score to the lowest, or +nil+ when +document+ is not part of
#   the corpus
def similar_documents(document)
  index = documents.index(document)
  return nil if index.nil?

  # Look the matrix up once instead of once per document.
  matrix = similarity_matrix
  scored = documents.each_with_index.map do |doc, doc_index|
    [doc, matrix[index, doc_index]]
  end

  # A NaN score cannot be ordered: `<=>` returns nil and sorting would raise
  # ArgumentError. Such pairs are kept, but placed after every comparable one.
  # NOTE(review): presumably NaN shows up for documents whose weight vector
  # is all zeros - confirm against the similarity matrix construction.
  undefined, ranked = scored.partition do |_, score|
    score.respond_to?(:nan?) && score.nan?
  end

  # Sort descending by score; the original position breaks ties so the
  # ordering is deterministic (Array#sort alone is not stable).
  ordered = ranked.each_with_index.sort_by do |(_, score), position|
    [-score, position]
  end

  ordered.map(&:first) + undefined
end
81
+
71
82
  def weights(document)
72
83
  idx = @documents.index(document)
73
84
  terms = @terms.to_a.map {|term| term.first}
@@ -1,17 +1,22 @@
1
1
  require 'gsl'
2
2
 
3
3
  class TermDocumentMatrix
4
- attr_reader :matrix, :labels
4
+ attr_reader :matrix, :labels, :number_of_terms, :number_of_documents, :non_zeros
5
5
 
6
6
  def initialize(corpus)
7
7
  @matrix = GSL::Matrix.alloc(corpus.terms.size, corpus.document_count)
8
+ @non_zeros = 0
9
+ @number_of_terms = corpus.terms.size
10
+ @number_of_documents = corpus.documents.size
8
11
 
9
- corpus.documents.each_with_index do |document, document_index|
10
- corpus.terms.each_with_index do |term, term_index|
11
- term = term.first
12
- idf = corpus.inverse_document_frequency(term)
12
+ corpus.terms.each_with_index do |term, term_index|
13
+ term = term.first
14
+ idf = corpus.inverse_document_frequency(term)
15
+
16
+ corpus.documents.each_with_index do |document, document_index|
13
17
  weight = document.term_frequency(term) * idf
14
18
  @matrix[term_index, document_index] = weight
19
+ @non_zeros += 1 unless weight.zero?
15
20
  end
16
21
  end
17
22
 
metadata CHANGED
@@ -1,104 +1,115 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: similarity
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.5
4
5
  prerelease:
5
- version: 0.2.4
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Chris Lowis
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-05-25 00:00:00 +01:00
14
- default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
12
+ date: 2011-05-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
17
15
  name: gsl
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
20
17
  none: false
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
24
- version: "0"
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
25
22
  type: :runtime
26
- version_requirements: *id001
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
23
  prerelease: false
30
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
31
33
  none: false
32
- requirements:
33
- - - ">="
34
- - !ruby/object:Gem::Version
35
- version: "0"
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
36
38
  type: :development
37
- version_requirements: *id002
38
- - !ruby/object:Gem::Dependency
39
- name: faker
40
39
  prerelease: false
41
- requirement: &id003 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: faker
48
+ requirement: !ruby/object:Gem::Requirement
42
49
  none: false
43
- requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: "0"
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
47
54
  type: :development
48
- version_requirements: *id003
49
- - !ruby/object:Gem::Dependency
50
- name: ruby-graphviz
51
55
  prerelease: false
52
- requirement: &id004 !ruby/object:Gem::Requirement
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: ruby-graphviz
64
+ requirement: !ruby/object:Gem::Requirement
53
65
  none: false
54
- requirements:
55
- - - ">="
56
- - !ruby/object:Gem::Version
57
- version: "0"
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
58
70
  type: :development
59
- version_requirements: *id004
60
- description: |
61
- Document similarity calculations using cosine similarity and TF-IDF weights
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: ! 'Document similarity calculations using cosine similarity and TF-IDF
79
+ weights
62
80
 
81
+ '
63
82
  email: chris.lowis@bbc.co.uk
64
83
  executables: []
65
-
66
84
  extensions: []
67
-
68
85
  extra_rdoc_files: []
69
-
70
- files:
86
+ files:
71
87
  - lib/similarity/corpus.rb
72
88
  - lib/similarity/document.rb
73
89
  - lib/similarity/term_document_matrix.rb
74
90
  - lib/similarity.rb
75
- has_rdoc: true
76
- homepage: ""
91
+ homepage: ''
77
92
  licenses: []
78
-
79
93
  post_install_message:
80
94
  rdoc_options: []
81
-
82
- require_paths:
95
+ require_paths:
83
96
  - lib
84
- required_ruby_version: !ruby/object:Gem::Requirement
97
+ required_ruby_version: !ruby/object:Gem::Requirement
85
98
  none: false
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: "0"
90
- required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
104
  none: false
92
- requirements:
93
- - - ">="
94
- - !ruby/object:Gem::Version
95
- version: "0"
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
96
109
  requirements: []
97
-
98
110
  rubyforge_project: similarity
99
- rubygems_version: 1.5.0
111
+ rubygems_version: 1.8.23
100
112
  signing_key:
101
113
  specification_version: 3
102
114
  summary: Document similarity calculations using cosine similarity and TF-IDF weights
103
115
  test_files: []
104
-