tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe TfIdfSimilarity::Token do
5
+ describe '#valid?' do
6
+ it 'should return false if all of its characters are numbers, punctuation or whitespace characters' do
7
+ TfIdfSimilarity::Token.new('1 2 3 ! @ #').valid?.should == false
8
+ end
9
+
10
+ it 'should return true if not all of its characters are numbers, punctuation or whitespace characters' do
11
+ TfIdfSimilarity::Token.new('1 2 3 ! @ # a').valid?.should == true
12
+ end
13
+ end
14
+
15
+ describe '#lowercase_filter' do
16
+ it 'should lowercase the token' do
17
+ TfIdfSimilarity::Token.new('HÉTÉROGÉNÉITÉ').lowercase_filter.should == 'hétérogénéité'
18
+ end
19
+ end
20
+
21
+ describe '#classic_filter' do
22
+ it 'should remove all periods' do
23
+ TfIdfSimilarity::Token.new('X.Y.Z.').classic_filter.should == 'XYZ'
24
+ end
25
+
26
+ it 'should remove ending possessives' do
27
+ TfIdfSimilarity::Token.new("foo's").classic_filter.should == 'foo'
28
+ end
29
+
30
+ it 'should not remove infix possessives' do
31
+ TfIdfSimilarity::Token.new("foo's bar").classic_filter.should == "foo's bar"
32
+ end
33
+ end
34
+ end
@@ -1,6 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
3
- require "tf-idf-similarity/version"
2
+ require File.expand_path('../lib/tf-idf-similarity/version', __FILE__)
4
3
 
5
4
  Gem::Specification.new do |s|
6
5
  s.name = "tf-idf-similarity"
@@ -16,7 +15,8 @@ Gem::Specification.new do |s|
16
15
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
16
  s.require_paths = ["lib"]
18
17
 
19
- s.add_runtime_dependency('unicode_utils')
18
+ s.add_runtime_dependency('unicode_utils') unless RUBY_VERSION < '1.9'
20
19
  s.add_development_dependency('rspec', '~> 2.10')
21
20
  s.add_development_dependency('rake')
21
+ s.add_development_dependency('coveralls')
22
22
  end
metadata CHANGED
@@ -1,71 +1,76 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: tf-idf-similarity
3
- version: !ruby/object:Gem::Version
4
- version: 0.0.9
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Open North
9
14
  autorequire:
10
15
  bindir: bin
11
16
  cert_chain: []
12
- date: 2013-01-07 00:00:00.000000000 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: unicode_utils
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '0'
22
- type: :runtime
23
- prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: '0'
30
- - !ruby/object:Gem::Dependency
17
+
18
+ date: 2013-06-03 00:00:00 -04:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
31
22
  name: rspec
32
- requirement: !ruby/object:Gem::Requirement
33
- none: false
34
- requirements:
35
- - - ~>
36
- - !ruby/object:Gem::Version
37
- version: '2.10'
38
- type: :development
39
23
  prerelease: false
40
- version_requirements: !ruby/object:Gem::Requirement
24
+ requirement: &id001 !ruby/object:Gem::Requirement
41
25
  none: false
42
- requirements:
26
+ requirements:
43
27
  - - ~>
44
- - !ruby/object:Gem::Version
45
- version: '2.10'
46
- - !ruby/object:Gem::Dependency
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 2
32
+ - 10
33
+ version: "2.10"
34
+ type: :development
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
47
37
  name: rake
48
- requirement: !ruby/object:Gem::Requirement
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
49
40
  none: false
50
- requirements:
51
- - - ! '>='
52
- - !ruby/object:Gem::Version
53
- version: '0'
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 3
45
+ segments:
46
+ - 0
47
+ version: "0"
54
48
  type: :development
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: coveralls
55
52
  prerelease: false
56
- version_requirements: !ruby/object:Gem::Requirement
53
+ requirement: &id003 !ruby/object:Gem::Requirement
57
54
  none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ hash: 3
59
+ segments:
60
+ - 0
61
+ version: "0"
62
+ type: :development
63
+ version_requirements: *id003
62
64
  description:
63
- email:
65
+ email:
64
66
  - info@opennorth.ca
65
67
  executables: []
68
+
66
69
  extensions: []
70
+
67
71
  extra_rdoc_files: []
68
- files:
72
+
73
+ files:
69
74
  - .gitignore
70
75
  - .travis.yml
71
76
  - .yardopts
@@ -75,36 +80,59 @@ files:
75
80
  - Rakefile
76
81
  - USAGE
77
82
  - lib/tf-idf-similarity.rb
78
- - lib/tf-idf-similarity/collection.rb
79
83
  - lib/tf-idf-similarity/document.rb
80
- - lib/tf-idf-similarity/extras/collection.rb
81
84
  - lib/tf-idf-similarity/extras/document.rb
85
+ - lib/tf-idf-similarity/extras/tf_idf_model.rb
86
+ - lib/tf-idf-similarity/matrix_methods.rb
87
+ - lib/tf-idf-similarity/term_count_model.rb
88
+ - lib/tf-idf-similarity/tf_idf_model.rb
82
89
  - lib/tf-idf-similarity/token.rb
83
90
  - lib/tf-idf-similarity/version.rb
91
+ - spec/document_spec.rb
92
+ - spec/extras/tf_idf_model_spec.rb
93
+ - spec/spec_helper.rb
94
+ - spec/term_count_model_spec.rb
95
+ - spec/tf_idf_model_spec.rb
96
+ - spec/token_spec.rb
84
97
  - td-idf-similarity.gemspec
98
+ has_rdoc: true
85
99
  homepage: http://github.com/opennorth/tf-idf-similarity
86
100
  licenses: []
101
+
87
102
  post_install_message:
88
103
  rdoc_options: []
89
- require_paths:
104
+
105
+ require_paths:
90
106
  - lib
91
- required_ruby_version: !ruby/object:Gem::Requirement
107
+ required_ruby_version: !ruby/object:Gem::Requirement
92
108
  none: false
93
- requirements:
94
- - - ! '>='
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- required_rubygems_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
117
  none: false
99
- requirements:
100
- - - ! '>='
101
- - !ruby/object:Gem::Version
102
- version: '0'
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
124
+ version: "0"
103
125
  requirements: []
126
+
104
127
  rubyforge_project:
105
- rubygems_version: 1.8.24
128
+ rubygems_version: 1.6.2
106
129
  signing_key:
107
130
  specification_version: 3
108
131
  summary: Calculates the similarity between texts using tf*idf
109
- test_files: []
110
- has_rdoc:
132
+ test_files:
133
+ - spec/document_spec.rb
134
+ - spec/extras/tf_idf_model_spec.rb
135
+ - spec/spec_helper.rb
136
+ - spec/term_count_model_spec.rb
137
+ - spec/tf_idf_model_spec.rb
138
+ - spec/token_spec.rb
@@ -1,205 +0,0 @@
1
- # @todo Do speed comparison between these gsl and narray, to load fastest first.
2
- begin
3
- require 'gsl'
4
- rescue LoadError
5
- begin
6
- require 'narray'
7
- rescue LoadError
8
- require 'matrix'
9
- end
10
- end
11
-
12
- class TfIdfSimilarity::Collection
13
- class CollectionError < StandardError; end
14
-
15
- # The documents in the collection.
16
- attr_reader :documents
17
- # The number of times each term appears in all documents.
18
- attr_reader :term_counts
19
- # The number of documents each term appears in.
20
- attr_reader :document_counts
21
-
22
- def initialize
23
- @documents = []
24
- @term_counts = Hash.new 0
25
- @document_counts = Hash.new 0
26
- end
27
-
28
- def <<(document)
29
- document.term_counts.each do |term,count|
30
- @term_counts[term] += count
31
- @document_counts[term] += 1
32
- end
33
- @documents << document
34
- end
35
-
36
- # @return [Array<String>] the set of the collection's terms with no duplicates
37
- def terms
38
- term_counts.keys
39
- end
40
-
41
- # @param [Hash] opts optional arguments
42
- # @option opts [Symbol] :function one of :tfidf (default) or :bm25
43
- #
44
- # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
45
- # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
46
- # @see http://en.wikipedia.org/wiki/Vector_space_model
47
- # @see http://en.wikipedia.org/wiki/Document-term_matrix
48
- # @see http://en.wikipedia.org/wiki/Cosine_similarity
49
- # @see http://en.wikipedia.org/wiki/Okapi_BM25
50
- def similarity_matrix(opts = {})
51
- if documents.empty?
52
- raise CollectionError, "No documents in collection"
53
- end
54
-
55
- # Calculate tf*idf.
56
- if stdlib?
57
- idf = []
58
- matrix = Matrix.build(terms.size, documents.size) do |i,j|
59
- idf[i] ||= inverse_document_frequency(terms[i], opts)
60
- idf[i] * term_frequency(documents[j], terms[i], opts)
61
- end
62
- else
63
- matrix = initialize_matrix
64
- terms.each_with_index do |term,i|
65
- idf = inverse_document_frequency(term, opts)
66
- documents.each_with_index do |document,j|
67
- value = idf * term_frequency(document, term, opts)
68
- # NArray puts the dimensions in a different order.
69
- # @see http://narray.rubyforge.org/SPEC.en
70
- if narray?
71
- matrix[j, i] = value
72
- else
73
- matrix[i, j] = value
74
- end
75
- end
76
- end
77
- end
78
-
79
- # Columns are normalized to unit vectors, so we can calculate the cosine
80
- # similarity of all document vectors. BM25 doesn't normalize columns, but
81
- # BM25 wasn't written with this use case in mind.
82
- matrix = normalize matrix
83
-
84
- if nmatrix?
85
- matrix.transpose.dot matrix
86
- else
87
- matrix.transpose * matrix
88
- end
89
- end
90
-
91
- # @param [Document] document a document
92
- # @param [String] term a term
93
- # @param [Hash] opts optional arguments
94
- # @option opts [Symbol] :function one of :tfidf (default) or :bm25
95
- # @return [Float] the term's frequency in the document
96
- def term_frequency_inverse_document_frequency(document, term, opts = {})
97
- inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
98
- end
99
- alias_method :tfidf, :term_frequency_inverse_document_frequency
100
-
101
- # @param [String] term a term
102
- # @param [Hash] opts optional arguments
103
- # @option opts [Symbol] :function one of :tfidf (default) or :bm25
104
- # @return [Float] the term's inverse document frequency
105
- def inverse_document_frequency(term, opts = {})
106
- if opts[:function] == :bm25
107
- Math.log (documents.size - document_counts[term] + 0.5) / (document_counts[term] + 0.5)
108
- else
109
- 1 + Math.log(documents.size / (document_counts[term].to_f + 1))
110
- end
111
- end
112
- alias_method :idf, :inverse_document_frequency
113
-
114
- # @param [Document] document a document
115
- # @param [String] term a term
116
- # @param [Hash] opts optional arguments
117
- # @option opts [Symbol] :function one of :tfidf (default) or :bm25
118
- # @return [Float] the term's frequency in the document
119
- #
120
- # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
121
- def term_frequency(document, term, opts = {})
122
- if opts[:function] == :bm25
123
- (document.term_counts[term].to_i * 2.2) / (document.term_counts[term].to_i + 0.3 + 0.9 * document.size / average_document_size)
124
- else
125
- document.term_frequency term
126
- end
127
- end
128
- alias_method :tf, :term_frequency
129
-
130
- # @return [Float] the average document size, in terms
131
- def average_document_size
132
- if documents.empty?
133
- raise CollectionError, "No documents in collection"
134
- end
135
-
136
- @average_document_size ||= documents.map(&:size).reduce(:+) / documents.size.to_f
137
- end
138
-
139
- # Resets the average document size.
140
- #
141
- # If you have already made a similarity matrix and are adding more documents,
142
- # call this method before creating a new similarity matrix.
143
- def reset_average_document_size!
144
- @average_document_size = nil
145
- end
146
-
147
- # @param [Document] matrix a term-document matrix
148
- # @return [GSL::Matrix,NMatrix,Matrix] a matrix in which all document vectors are unit vectors
149
- #
150
- # @note Lucene normalizes document length differently.
151
- def normalize(matrix)
152
- if gsl?
153
- matrix.each_col(&:normalize!)
154
- elsif narray?
155
- # @see https://github.com/masa16/narray/issues/21
156
- NMatrix.refer(matrix / NMath.sqrt((matrix ** 2).sum(1).reshape(documents.size, 1)))
157
- elsif nmatrix?
158
- # @see https://github.com/SciRuby/nmatrix/issues/38
159
- (0...matrix.shape[1]).each do |j|
160
- # @see https://github.com/SciRuby/nmatrix/pull/46
161
- column = matrix.column(j)
162
- norm = Math.sqrt(column.transpose.dot(column)[0, 0])
163
- (0...m.shape[0]).each do |i|
164
- m[i, j] /= norm
165
- end
166
- end
167
- matrix.cast :yale, :float64
168
- else
169
- Matrix.columns matrix.column_vectors.map(&:normalize)
170
- end
171
- end
172
-
173
- private
174
-
175
- # @return a matrix
176
- def initialize_matrix
177
- if gsl?
178
- GSL::Matrix.alloc terms.size, documents.size
179
- elsif narray?
180
- NArray.float documents.size, terms.size
181
- elsif nmatrix?
182
- NMatrix.new(:list, [terms.size, documents.size], :float64)
183
- end
184
- end
185
-
186
- # @return [Boolean] whether to use the GSL gem
187
- def gsl?
188
- @gsl ||= Object.const_defined?(:GSL)
189
- end
190
-
191
- # @return [Boolean] whether to use the NArray gem
192
- def narray?
193
- @narray ||= Object.const_defined?(:NArray) && !gsl?
194
- end
195
-
196
- # @return [Boolean] whether to use the NMatrix gem
197
- def nmatrix?
198
- @nmatrix ||= Object.const_defined?(:NMatrix) && !gsl? && !narray?
199
- end
200
-
201
- # @return [Boolean] whether to use the standard library
202
- def stdlib?
203
- @matrix ||= Object.const_defined?(:Matrix)
204
- end
205
- end