tf-idf-similarity 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ require 'spec_helper'
3
+
4
# Specs for TfIdfSimilarity::Token: validity detection and the lowercase and
# classic filters.
describe TfIdfSimilarity::Token do
  # Wraps raw text in the class under test so each example stays terse.
  def token(text)
    TfIdfSimilarity::Token.new(text)
  end

  describe '#valid?' do
    it 'should return false if all of its characters are numbers, punctuation or whitespace characters' do
      without_letters = token('1 2 3 ! @ #')
      without_letters.valid?.should == false
    end

    it 'should return true if not all of its characters are numbers, punctuation or whitespace characters' do
      with_a_letter = token('1 2 3 ! @ # a')
      with_a_letter.valid?.should == true
    end
  end

  describe '#lowercase_filter' do
    it 'should lowercase the token' do
      # Accented capitals check that lowercasing is not limited to ASCII.
      uppercase = token('HÉTÉROGÉNÉITÉ')
      uppercase.lowercase_filter.should == 'hétérogénéité'
    end
  end

  describe '#classic_filter' do
    it 'should remove all periods' do
      acronym = token('X.Y.Z.')
      acronym.classic_filter.should == 'XYZ'
    end

    it 'should remove ending possessives' do
      possessive = token("foo's")
      possessive.classic_filter.should == 'foo'
    end

    it 'should not remove infix possessives' do
      phrase = token("foo's bar")
      phrase.classic_filter.should == "foo's bar"
    end
  end
end
@@ -1,6 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
3
- require "tf-idf-similarity/version"
2
+ require File.expand_path('../lib/tf-idf-similarity/version', __FILE__)
4
3
 
5
4
  Gem::Specification.new do |s|
6
5
  s.name = "tf-idf-similarity"
@@ -16,7 +15,8 @@ Gem::Specification.new do |s|
16
15
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
16
  s.require_paths = ["lib"]
18
17
 
19
- s.add_runtime_dependency('unicode_utils')
18
+ s.add_runtime_dependency('unicode_utils') unless RUBY_VERSION < '1.9'
20
19
  s.add_development_dependency('rspec', '~> 2.10')
21
20
  s.add_development_dependency('rake')
21
+ s.add_development_dependency('coveralls')
22
22
  end
metadata CHANGED
@@ -1,71 +1,76 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: tf-idf-similarity
3
- version: !ruby/object:Gem::Version
4
- version: 0.0.9
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Open North
9
14
  autorequire:
10
15
  bindir: bin
11
16
  cert_chain: []
12
- date: 2013-01-07 00:00:00.000000000 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: unicode_utils
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '0'
22
- type: :runtime
23
- prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: '0'
30
- - !ruby/object:Gem::Dependency
17
+
18
+ date: 2013-06-03 00:00:00 -04:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
31
22
  name: rspec
32
- requirement: !ruby/object:Gem::Requirement
33
- none: false
34
- requirements:
35
- - - ~>
36
- - !ruby/object:Gem::Version
37
- version: '2.10'
38
- type: :development
39
23
  prerelease: false
40
- version_requirements: !ruby/object:Gem::Requirement
24
+ requirement: &id001 !ruby/object:Gem::Requirement
41
25
  none: false
42
- requirements:
26
+ requirements:
43
27
  - - ~>
44
- - !ruby/object:Gem::Version
45
- version: '2.10'
46
- - !ruby/object:Gem::Dependency
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 2
32
+ - 10
33
+ version: "2.10"
34
+ type: :development
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
47
37
  name: rake
48
- requirement: !ruby/object:Gem::Requirement
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
49
40
  none: false
50
- requirements:
51
- - - ! '>='
52
- - !ruby/object:Gem::Version
53
- version: '0'
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 3
45
+ segments:
46
+ - 0
47
+ version: "0"
54
48
  type: :development
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: coveralls
55
52
  prerelease: false
56
- version_requirements: !ruby/object:Gem::Requirement
53
+ requirement: &id003 !ruby/object:Gem::Requirement
57
54
  none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ hash: 3
59
+ segments:
60
+ - 0
61
+ version: "0"
62
+ type: :development
63
+ version_requirements: *id003
62
64
  description:
63
- email:
65
+ email:
64
66
  - info@opennorth.ca
65
67
  executables: []
68
+
66
69
  extensions: []
70
+
67
71
  extra_rdoc_files: []
68
- files:
72
+
73
+ files:
69
74
  - .gitignore
70
75
  - .travis.yml
71
76
  - .yardopts
@@ -75,36 +80,59 @@ files:
75
80
  - Rakefile
76
81
  - USAGE
77
82
  - lib/tf-idf-similarity.rb
78
- - lib/tf-idf-similarity/collection.rb
79
83
  - lib/tf-idf-similarity/document.rb
80
- - lib/tf-idf-similarity/extras/collection.rb
81
84
  - lib/tf-idf-similarity/extras/document.rb
85
+ - lib/tf-idf-similarity/extras/tf_idf_model.rb
86
+ - lib/tf-idf-similarity/matrix_methods.rb
87
+ - lib/tf-idf-similarity/term_count_model.rb
88
+ - lib/tf-idf-similarity/tf_idf_model.rb
82
89
  - lib/tf-idf-similarity/token.rb
83
90
  - lib/tf-idf-similarity/version.rb
91
+ - spec/document_spec.rb
92
+ - spec/extras/tf_idf_model_spec.rb
93
+ - spec/spec_helper.rb
94
+ - spec/term_count_model_spec.rb
95
+ - spec/tf_idf_model_spec.rb
96
+ - spec/token_spec.rb
84
97
  - td-idf-similarity.gemspec
98
+ has_rdoc: true
85
99
  homepage: http://github.com/opennorth/tf-idf-similarity
86
100
  licenses: []
101
+
87
102
  post_install_message:
88
103
  rdoc_options: []
89
- require_paths:
104
+
105
+ require_paths:
90
106
  - lib
91
- required_ruby_version: !ruby/object:Gem::Requirement
107
+ required_ruby_version: !ruby/object:Gem::Requirement
92
108
  none: false
93
- requirements:
94
- - - ! '>='
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- required_rubygems_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
117
  none: false
99
- requirements:
100
- - - ! '>='
101
- - !ruby/object:Gem::Version
102
- version: '0'
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
124
+ version: "0"
103
125
  requirements: []
126
+
104
127
  rubyforge_project:
105
- rubygems_version: 1.8.24
128
+ rubygems_version: 1.6.2
106
129
  signing_key:
107
130
  specification_version: 3
108
131
  summary: Calculates the similarity between texts using tf*idf
109
- test_files: []
110
- has_rdoc:
132
+ test_files:
133
+ - spec/document_spec.rb
134
+ - spec/extras/tf_idf_model_spec.rb
135
+ - spec/spec_helper.rb
136
+ - spec/term_count_model_spec.rb
137
+ - spec/tf_idf_model_spec.rb
138
+ - spec/token_spec.rb
@@ -1,205 +0,0 @@
1
- # @todo Do speed comparison between these gsl and narray, to load fastest first.
2
- begin
3
- require 'gsl'
4
- rescue LoadError
5
- begin
6
- require 'narray'
7
- rescue LoadError
8
- require 'matrix'
9
- end
10
- end
11
-
12
# A collection of documents from which tf*idf (or BM25) weights and a
# document-by-document similarity matrix can be computed.
#
# The matrix backend is chosen at runtime from whichever of GSL, NArray,
# NMatrix or the standard library's Matrix is defined.
class TfIdfSimilarity::Collection
  # Raised when an operation needs at least one document but there is none.
  class CollectionError < StandardError; end

  # The documents in the collection.
  attr_reader :documents
  # The number of times each term appears in all documents.
  attr_reader :term_counts
  # The number of documents each term appears in.
  attr_reader :document_counts

  def initialize
    @documents = []
    @term_counts = Hash.new 0
    @document_counts = Hash.new 0
  end

  # Adds a document to the collection and updates the term statistics.
  #
  # @param [Document] document a document responding to `term_counts`
  # @return [Array<Document>] the documents in the collection
  def <<(document)
    document.term_counts.each do |term, count|
      @term_counts[term] += count
      @document_counts[term] += 1
    end
    # The memoized average is stale as soon as the collection grows, so drop
    # it here instead of relying on callers to reset it by hand.
    @average_document_size = nil
    @documents << document
  end

  # @return [Array<String>] the set of the collection's terms with no duplicates
  def terms
    term_counts.keys
  end

  # Builds the term-document matrix, normalizes every document vector to unit
  # length and multiplies the transposed matrix by the matrix, which yields
  # the cosine similarity of every pair of documents.
  #
  # @param [Hash] opts optional arguments
  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
  # @return [GSL::Matrix,NMatrix,Matrix] the document similarity matrix
  # @raise [CollectionError] if the collection has no documents
  #
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
  # @see http://en.wikipedia.org/wiki/Vector_space_model
  # @see http://en.wikipedia.org/wiki/Document-term_matrix
  # @see http://en.wikipedia.org/wiki/Cosine_similarity
  # @see http://en.wikipedia.org/wiki/Okapi_BM25
  def similarity_matrix(opts = {})
    raise CollectionError, "No documents in collection" if documents.empty?

    # Calculate tf*idf.
    if stdlib?
      idf = []
      matrix = Matrix.build(terms.size, documents.size) do |i, j|
        # The idf depends only on the term (row), so compute it once per row.
        idf[i] ||= inverse_document_frequency(terms[i], opts)
        idf[i] * term_frequency(documents[j], terms[i], opts)
      end
    else
      matrix = initialize_matrix
      terms.each_with_index do |term, i|
        idf = inverse_document_frequency(term, opts)
        documents.each_with_index do |document, j|
          value = idf * term_frequency(document, term, opts)
          # NArray puts the dimensions in a different order.
          # @see http://narray.rubyforge.org/SPEC.en
          if narray?
            matrix[j, i] = value
          else
            matrix[i, j] = value
          end
        end
      end
    end

    # Columns are normalized to unit vectors, so we can calculate the cosine
    # similarity of all document vectors. BM25 doesn't normalize columns, but
    # BM25 wasn't written with this use case in mind.
    matrix = normalize(matrix)

    if nmatrix?
      matrix.transpose.dot matrix
    else
      matrix.transpose * matrix
    end
  end

  # @param [Document] document a document
  # @param [String] term a term
  # @param [Hash] opts optional arguments
  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
  # @return [Float] the term's inverse document frequency multiplied by its
  #   frequency in the document
  def term_frequency_inverse_document_frequency(document, term, opts = {})
    inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
  end
  alias_method :tfidf, :term_frequency_inverse_document_frequency

  # @param [String] term a term
  # @param [Hash] opts optional arguments
  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
  # @return [Float] the term's inverse document frequency
  def inverse_document_frequency(term, opts = {})
    if opts[:function] == :bm25
      # The whole quotient is the argument of the logarithm.
      Math.log((documents.size - document_counts[term] + 0.5) / (document_counts[term] + 0.5))
    else
      1 + Math.log(documents.size / (document_counts[term].to_f + 1))
    end
  end
  alias_method :idf, :inverse_document_frequency

  # @param [Document] document a document
  # @param [String] term a term
  # @param [Hash] opts optional arguments
  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
  # @return [Float] the term's frequency in the document
  #
  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
  def term_frequency(document, term, opts = {})
    if opts[:function] == :bm25
      # With k1 = 1.2 and b = 0.75: k1 + 1 = 2.2, k1 * (1 - b) = 0.3 and
      # k1 * b = 0.9.
      (document.term_counts[term].to_i * 2.2) / (document.term_counts[term].to_i + 0.3 + 0.9 * document.size / average_document_size)
    else
      document.term_frequency term
    end
  end
  alias_method :tf, :term_frequency

  # @return [Float] the average document size, in terms
  # @raise [CollectionError] if the collection has no documents
  def average_document_size
    raise CollectionError, "No documents in collection" if documents.empty?

    @average_document_size ||= documents.map(&:size).reduce(:+) / documents.size.to_f
  end

  # Resets the average document size.
  #
  # Adding a document with `<<` already discards the cached value; this
  # method remains for callers that change documents in some other way.
  def reset_average_document_size!
    @average_document_size = nil
  end

  # @param [GSL::Matrix,NMatrix,Matrix] matrix a term-document matrix
  # @return [GSL::Matrix,NMatrix,Matrix] a matrix in which all document vectors are unit vectors
  #
  # @note Lucene normalizes document length differently.
  def normalize(matrix)
    if gsl?
      matrix.each_col(&:normalize!)
    elsif narray?
      # @see https://github.com/masa16/narray/issues/21
      NMatrix.refer(matrix / NMath.sqrt((matrix ** 2).sum(1).reshape(documents.size, 1)))
    elsif nmatrix?
      # @see https://github.com/SciRuby/nmatrix/issues/38
      (0...matrix.shape[1]).each do |j|
        # @see https://github.com/SciRuby/nmatrix/pull/46
        column = matrix.column(j)
        norm = Math.sqrt(column.transpose.dot(column)[0, 0])
        # Divide every entry of column j by the column's Euclidean norm.
        (0...matrix.shape[0]).each do |i|
          matrix[i, j] /= norm
        end
      end
      matrix.cast :yale, :float64
    else
      Matrix.columns matrix.column_vectors.map(&:normalize)
    end
  end

  private

  # @return a matrix sized for the collection's terms and documents, or nil
  #   if none of GSL, NArray or NMatrix is in use
  def initialize_matrix
    if gsl?
      GSL::Matrix.alloc terms.size, documents.size
    elsif narray?
      # NArray takes its dimensions in the opposite order.
      NArray.float documents.size, terms.size
    elsif nmatrix?
      NMatrix.new(:list, [terms.size, documents.size], :float64)
    end
  end

  # @return [Boolean] whether to use the GSL gem
  def gsl?
    @gsl ||= Object.const_defined?(:GSL)
  end

  # @return [Boolean] whether to use the NArray gem
  def narray?
    @narray ||= Object.const_defined?(:NArray) && !gsl?
  end

  # @return [Boolean] whether to use the NMatrix gem
  def nmatrix?
    @nmatrix ||= Object.const_defined?(:NMatrix) && !gsl? && !narray?
  end

  # @return [Boolean] whether to use the standard library
  def stdlib?
    @matrix ||= Object.const_defined?(:Matrix)
  end
end