tf-idf-similarity 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/README.md +41 -29
- data/lib/tf-idf-similarity.rb +12 -1
- data/lib/tf-idf-similarity/document.rb +35 -28
- data/lib/tf-idf-similarity/extras/document.rb +2 -125
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
- data/lib/tf-idf-similarity/term_count_model.rb +78 -0
- data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
- data/lib/tf-idf-similarity/token.rb +34 -12
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/document_spec.rb +136 -0
- data/spec/extras/tf_idf_model_spec.rb +269 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/term_count_model_spec.rb +108 -0
- data/spec/tf_idf_model_spec.rb +174 -0
- data/spec/token_spec.rb +34 -0
- data/td-idf-similarity.gemspec +3 -3
- metadata +91 -63
- data/lib/tf-idf-similarity/collection.rb +0 -205
- data/lib/tf-idf-similarity/extras/collection.rb +0 -110
data/spec/token_spec.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe TfIdfSimilarity::Token do
|
5
|
+
describe '#valid?' do
|
6
|
+
it 'should return false if all of its characters are numbers, punctuation or whitespace characters' do
|
7
|
+
TfIdfSimilarity::Token.new('1 2 3 ! @ #').valid?.should == false
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should return true if not all of its characters are numbers, punctuation or whitespace characters' do
|
11
|
+
TfIdfSimilarity::Token.new('1 2 3 ! @ # a').valid?.should == true
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe '#lowercase_filter' do
|
16
|
+
it 'should lowercase the token' do
|
17
|
+
TfIdfSimilarity::Token.new('HÉTÉROGÉNÉITÉ').lowercase_filter.should == 'hétérogénéité'
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe '#classic_filter' do
|
22
|
+
it 'should remove all periods' do
|
23
|
+
TfIdfSimilarity::Token.new('X.Y.Z.').classic_filter.should == 'XYZ'
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should remove ending possessives' do
|
27
|
+
TfIdfSimilarity::Token.new("foo's").classic_filter.should == 'foo'
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should not remove infix possessives' do
|
31
|
+
TfIdfSimilarity::Token.new("foo's bar").classic_filter.should == "foo's bar"
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/td-idf-similarity.gemspec
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require "tf-idf-similarity/version"
|
2
|
+
require File.expand_path('../lib/tf-idf-similarity/version', __FILE__)
|
4
3
|
|
5
4
|
Gem::Specification.new do |s|
|
6
5
|
s.name = "tf-idf-similarity"
|
@@ -16,7 +15,8 @@ Gem::Specification.new do |s|
|
|
16
15
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
16
|
s.require_paths = ["lib"]
|
18
17
|
|
19
|
-
s.add_runtime_dependency('unicode_utils')
|
18
|
+
s.add_runtime_dependency('unicode_utils') unless RUBY_VERSION < '1.9'
|
20
19
|
s.add_development_dependency('rspec', '~> 2.10')
|
21
20
|
s.add_development_dependency('rake')
|
21
|
+
s.add_development_dependency('coveralls')
|
22
22
|
end
|
metadata
CHANGED
@@ -1,71 +1,76 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Open North
|
9
14
|
autorequire:
|
10
15
|
bindir: bin
|
11
16
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
none: false
|
18
|
-
requirements:
|
19
|
-
- - ! '>='
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
22
|
-
type: :runtime
|
23
|
-
prerelease: false
|
24
|
-
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ! '>='
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '0'
|
30
|
-
- !ruby/object:Gem::Dependency
|
17
|
+
|
18
|
+
date: 2013-06-03 00:00:00 -04:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
31
22
|
name: rspec
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
|
-
requirements:
|
35
|
-
- - ~>
|
36
|
-
- !ruby/object:Gem::Version
|
37
|
-
version: '2.10'
|
38
|
-
type: :development
|
39
23
|
prerelease: false
|
40
|
-
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
41
25
|
none: false
|
42
|
-
requirements:
|
26
|
+
requirements:
|
43
27
|
- - ~>
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
|
46
|
-
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 2
|
32
|
+
- 10
|
33
|
+
version: "2.10"
|
34
|
+
type: :development
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
47
37
|
name: rake
|
48
|
-
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
49
40
|
none: false
|
50
|
-
requirements:
|
51
|
-
- -
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
54
48
|
type: :development
|
49
|
+
version_requirements: *id002
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: coveralls
|
55
52
|
prerelease: false
|
56
|
-
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
54
|
none: false
|
58
|
-
requirements:
|
59
|
-
- -
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
hash: 3
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
type: :development
|
63
|
+
version_requirements: *id003
|
62
64
|
description:
|
63
|
-
email:
|
65
|
+
email:
|
64
66
|
- info@opennorth.ca
|
65
67
|
executables: []
|
68
|
+
|
66
69
|
extensions: []
|
70
|
+
|
67
71
|
extra_rdoc_files: []
|
68
|
-
|
72
|
+
|
73
|
+
files:
|
69
74
|
- .gitignore
|
70
75
|
- .travis.yml
|
71
76
|
- .yardopts
|
@@ -75,36 +80,59 @@ files:
|
|
75
80
|
- Rakefile
|
76
81
|
- USAGE
|
77
82
|
- lib/tf-idf-similarity.rb
|
78
|
-
- lib/tf-idf-similarity/collection.rb
|
79
83
|
- lib/tf-idf-similarity/document.rb
|
80
|
-
- lib/tf-idf-similarity/extras/collection.rb
|
81
84
|
- lib/tf-idf-similarity/extras/document.rb
|
85
|
+
- lib/tf-idf-similarity/extras/tf_idf_model.rb
|
86
|
+
- lib/tf-idf-similarity/matrix_methods.rb
|
87
|
+
- lib/tf-idf-similarity/term_count_model.rb
|
88
|
+
- lib/tf-idf-similarity/tf_idf_model.rb
|
82
89
|
- lib/tf-idf-similarity/token.rb
|
83
90
|
- lib/tf-idf-similarity/version.rb
|
91
|
+
- spec/document_spec.rb
|
92
|
+
- spec/extras/tf_idf_model_spec.rb
|
93
|
+
- spec/spec_helper.rb
|
94
|
+
- spec/term_count_model_spec.rb
|
95
|
+
- spec/tf_idf_model_spec.rb
|
96
|
+
- spec/token_spec.rb
|
84
97
|
- td-idf-similarity.gemspec
|
98
|
+
has_rdoc: true
|
85
99
|
homepage: http://github.com/opennorth/tf-idf-similarity
|
86
100
|
licenses: []
|
101
|
+
|
87
102
|
post_install_message:
|
88
103
|
rdoc_options: []
|
89
|
-
|
104
|
+
|
105
|
+
require_paths:
|
90
106
|
- lib
|
91
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
108
|
none: false
|
93
|
-
requirements:
|
94
|
-
- -
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
|
97
|
-
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
hash: 3
|
113
|
+
segments:
|
114
|
+
- 0
|
115
|
+
version: "0"
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
117
|
none: false
|
99
|
-
requirements:
|
100
|
-
- -
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
|
118
|
+
requirements:
|
119
|
+
- - ">="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
version: "0"
|
103
125
|
requirements: []
|
126
|
+
|
104
127
|
rubyforge_project:
|
105
|
-
rubygems_version: 1.
|
128
|
+
rubygems_version: 1.6.2
|
106
129
|
signing_key:
|
107
130
|
specification_version: 3
|
108
131
|
summary: Calculates the similarity between texts using tf*idf
|
109
|
-
test_files:
|
110
|
-
|
132
|
+
test_files:
|
133
|
+
- spec/document_spec.rb
|
134
|
+
- spec/extras/tf_idf_model_spec.rb
|
135
|
+
- spec/spec_helper.rb
|
136
|
+
- spec/term_count_model_spec.rb
|
137
|
+
- spec/tf_idf_model_spec.rb
|
138
|
+
- spec/token_spec.rb
|
@@ -1,205 +0,0 @@
|
|
1
|
-
# @todo Do speed comparison between these gsl and narray, to load fastest first.
|
2
|
-
begin
|
3
|
-
require 'gsl'
|
4
|
-
rescue LoadError
|
5
|
-
begin
|
6
|
-
require 'narray'
|
7
|
-
rescue LoadError
|
8
|
-
require 'matrix'
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
class TfIdfSimilarity::Collection
|
13
|
-
class CollectionError < StandardError; end
|
14
|
-
|
15
|
-
# The documents in the collection.
|
16
|
-
attr_reader :documents
|
17
|
-
# The number of times each term appears in all documents.
|
18
|
-
attr_reader :term_counts
|
19
|
-
# The number of documents each term appears in.
|
20
|
-
attr_reader :document_counts
|
21
|
-
|
22
|
-
def initialize
|
23
|
-
@documents = []
|
24
|
-
@term_counts = Hash.new 0
|
25
|
-
@document_counts = Hash.new 0
|
26
|
-
end
|
27
|
-
|
28
|
-
def <<(document)
|
29
|
-
document.term_counts.each do |term,count|
|
30
|
-
@term_counts[term] += count
|
31
|
-
@document_counts[term] += 1
|
32
|
-
end
|
33
|
-
@documents << document
|
34
|
-
end
|
35
|
-
|
36
|
-
# @return [Array<String>] the set of the collection's terms with no duplicates
|
37
|
-
def terms
|
38
|
-
term_counts.keys
|
39
|
-
end
|
40
|
-
|
41
|
-
# @param [Hash] opts optional arguments
|
42
|
-
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
43
|
-
#
|
44
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
45
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
46
|
-
# @see http://en.wikipedia.org/wiki/Vector_space_model
|
47
|
-
# @see http://en.wikipedia.org/wiki/Document-term_matrix
|
48
|
-
# @see http://en.wikipedia.org/wiki/Cosine_similarity
|
49
|
-
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
50
|
-
def similarity_matrix(opts = {})
|
51
|
-
if documents.empty?
|
52
|
-
raise CollectionError, "No documents in collection"
|
53
|
-
end
|
54
|
-
|
55
|
-
# Calculate tf*idf.
|
56
|
-
if stdlib?
|
57
|
-
idf = []
|
58
|
-
matrix = Matrix.build(terms.size, documents.size) do |i,j|
|
59
|
-
idf[i] ||= inverse_document_frequency(terms[i], opts)
|
60
|
-
idf[i] * term_frequency(documents[j], terms[i], opts)
|
61
|
-
end
|
62
|
-
else
|
63
|
-
matrix = initialize_matrix
|
64
|
-
terms.each_with_index do |term,i|
|
65
|
-
idf = inverse_document_frequency(term, opts)
|
66
|
-
documents.each_with_index do |document,j|
|
67
|
-
value = idf * term_frequency(document, term, opts)
|
68
|
-
# NArray puts the dimensions in a different order.
|
69
|
-
# @see http://narray.rubyforge.org/SPEC.en
|
70
|
-
if narray?
|
71
|
-
matrix[j, i] = value
|
72
|
-
else
|
73
|
-
matrix[i, j] = value
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
# Columns are normalized to unit vectors, so we can calculate the cosine
|
80
|
-
# similarity of all document vectors. BM25 doesn't normalize columns, but
|
81
|
-
# BM25 wasn't written with this use case in mind.
|
82
|
-
matrix = normalize matrix
|
83
|
-
|
84
|
-
if nmatrix?
|
85
|
-
matrix.transpose.dot matrix
|
86
|
-
else
|
87
|
-
matrix.transpose * matrix
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
# @param [Document] document a document
|
92
|
-
# @param [String] term a term
|
93
|
-
# @param [Hash] opts optional arguments
|
94
|
-
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
95
|
-
# @return [Float] the term's frequency in the document
|
96
|
-
def term_frequency_inverse_document_frequency(document, term, opts = {})
|
97
|
-
inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
|
98
|
-
end
|
99
|
-
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
100
|
-
|
101
|
-
# @param [String] term a term
|
102
|
-
# @param [Hash] opts optional arguments
|
103
|
-
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
104
|
-
# @return [Float] the term's inverse document frequency
|
105
|
-
def inverse_document_frequency(term, opts = {})
|
106
|
-
if opts[:function] == :bm25
|
107
|
-
Math.log (documents.size - document_counts[term] + 0.5) / (document_counts[term] + 0.5)
|
108
|
-
else
|
109
|
-
1 + Math.log(documents.size / (document_counts[term].to_f + 1))
|
110
|
-
end
|
111
|
-
end
|
112
|
-
alias_method :idf, :inverse_document_frequency
|
113
|
-
|
114
|
-
# @param [Document] document a document
|
115
|
-
# @param [String] term a term
|
116
|
-
# @param [Hash] opts optional arguments
|
117
|
-
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
118
|
-
# @return [Float] the term's frequency in the document
|
119
|
-
#
|
120
|
-
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
121
|
-
def term_frequency(document, term, opts = {})
|
122
|
-
if opts[:function] == :bm25
|
123
|
-
(document.term_counts[term].to_i * 2.2) / (document.term_counts[term].to_i + 0.3 + 0.9 * document.size / average_document_size)
|
124
|
-
else
|
125
|
-
document.term_frequency term
|
126
|
-
end
|
127
|
-
end
|
128
|
-
alias_method :tf, :term_frequency
|
129
|
-
|
130
|
-
# @return [Float] the average document size, in terms
|
131
|
-
def average_document_size
|
132
|
-
if documents.empty?
|
133
|
-
raise CollectionError, "No documents in collection"
|
134
|
-
end
|
135
|
-
|
136
|
-
@average_document_size ||= documents.map(&:size).reduce(:+) / documents.size.to_f
|
137
|
-
end
|
138
|
-
|
139
|
-
# Resets the average document size.
|
140
|
-
#
|
141
|
-
# If you have already made a similarity matrix and are adding more documents,
|
142
|
-
# call this method before creating a new similarity matrix.
|
143
|
-
def reset_average_document_size!
|
144
|
-
@average_document_size = nil
|
145
|
-
end
|
146
|
-
|
147
|
-
# @param [Document] matrix a term-document matrix
|
148
|
-
# @return [GSL::Matrix,NMatrix,Matrix] a matrix in which all document vectors are unit vectors
|
149
|
-
#
|
150
|
-
# @note Lucene normalizes document length differently.
|
151
|
-
def normalize(matrix)
|
152
|
-
if gsl?
|
153
|
-
matrix.each_col(&:normalize!)
|
154
|
-
elsif narray?
|
155
|
-
# @see https://github.com/masa16/narray/issues/21
|
156
|
-
NMatrix.refer(matrix / NMath.sqrt((matrix ** 2).sum(1).reshape(documents.size, 1)))
|
157
|
-
elsif nmatrix?
|
158
|
-
# @see https://github.com/SciRuby/nmatrix/issues/38
|
159
|
-
(0...matrix.shape[1]).each do |j|
|
160
|
-
# @see https://github.com/SciRuby/nmatrix/pull/46
|
161
|
-
column = matrix.column(j)
|
162
|
-
norm = Math.sqrt(column.transpose.dot(column)[0, 0])
|
163
|
-
(0...m.shape[0]).each do |i|
|
164
|
-
m[i, j] /= norm
|
165
|
-
end
|
166
|
-
end
|
167
|
-
matrix.cast :yale, :float64
|
168
|
-
else
|
169
|
-
Matrix.columns matrix.column_vectors.map(&:normalize)
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
private
|
174
|
-
|
175
|
-
# @return a matrix
|
176
|
-
def initialize_matrix
|
177
|
-
if gsl?
|
178
|
-
GSL::Matrix.alloc terms.size, documents.size
|
179
|
-
elsif narray?
|
180
|
-
NArray.float documents.size, terms.size
|
181
|
-
elsif nmatrix?
|
182
|
-
NMatrix.new(:list, [terms.size, documents.size], :float64)
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
|
-
# @return [Boolean] whether to use the GSL gem
|
187
|
-
def gsl?
|
188
|
-
@gsl ||= Object.const_defined?(:GSL)
|
189
|
-
end
|
190
|
-
|
191
|
-
# @return [Boolean] whether to use the NArray gem
|
192
|
-
def narray?
|
193
|
-
@narray ||= Object.const_defined?(:NArray) && !gsl?
|
194
|
-
end
|
195
|
-
|
196
|
-
# @return [Boolean] whether to use the NMatrix gem
|
197
|
-
def nmatrix?
|
198
|
-
@nmatrix ||= Object.const_defined?(:NMatrix) && !gsl? && !narray?
|
199
|
-
end
|
200
|
-
|
201
|
-
# @return [Boolean] whether to use the standard library
|
202
|
-
def stdlib?
|
203
|
-
@matrix ||= Object.const_defined?(:Matrix)
|
204
|
-
end
|
205
|
-
end
|