tfidf 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +8 -0
- data/examples/demo_tf.rb +8 -2
- data/lib/tfidf.rb +50 -26
- data/test/test_TFIDF.rb +16 -3
- metadata +10 -4
data/Rakefile
ADDED
data/examples/demo_tf.rb
CHANGED
@@ -127,10 +127,10 @@ puts "Terms in the corpus"
|
|
127
127
|
tfidf.terms.each {|k,v| puts "Term ID: #{k} => term: #{v}"}
|
128
128
|
|
129
129
|
puts "Document-Term Matrix, sparse List of lists(LIL)"
|
130
|
-
tfidf.
|
130
|
+
tfidf.dtm.each {|e| puts e}
|
131
131
|
|
132
132
|
puts "Term-Document Matrix, sparse LIL"
|
133
|
-
tfidf.
|
133
|
+
tfidf.tdm.each {|e| puts e}
|
134
134
|
|
135
135
|
puts "Term Frequency of word: video in document"
|
136
136
|
puts tfidf.tf
|
@@ -140,3 +140,9 @@ puts tfidf.idf("octocat")
|
|
140
140
|
|
141
141
|
puts "TF-IDF of word : octocats in document 7e38fa195cee92d2e7d834095d6938a89b5fdd58"
|
142
142
|
puts tfidf.tfidf("octocat","7e38fa195cee92d2e7d834095d6938a89b5fdd58")
|
143
|
+
|
144
|
+
puts "DTM in dense matrix"
|
145
|
+
puts tfidf.dense
|
146
|
+
|
147
|
+
|
148
|
+
|
data/lib/tfidf.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'fast_stemmer'
|
3
3
|
require 'digest'
|
4
|
+
require 'active_support'
|
5
|
+
require 'gsl'
|
4
6
|
|
5
7
|
|
6
8
|
class TFIDF
|
@@ -9,7 +11,7 @@ class TFIDF
|
|
9
11
|
@@split_pattern = /[\W]/
|
10
12
|
|
11
13
|
#Hash function used for generating id for documents as well as terms
|
12
|
-
def hash_func(obj)
|
14
|
+
def self.hash_func(obj)
|
13
15
|
return Digest::SHA1.hexdigest obj
|
14
16
|
end
|
15
17
|
|
@@ -26,9 +28,8 @@ class TFIDF
|
|
26
28
|
def initialize(corpus)
|
27
29
|
@cardinality = 0
|
28
30
|
@docs = {}
|
29
|
-
@
|
30
|
-
@
|
31
|
-
@sparse_matrix_doc_idx = {}
|
31
|
+
@tdm = {}
|
32
|
+
@dtm = {}
|
32
33
|
@idf = {}
|
33
34
|
|
34
35
|
#not in use
|
@@ -41,33 +42,34 @@ class TFIDF
|
|
41
42
|
else
|
42
43
|
@cardinality = corpus.length
|
43
44
|
end
|
44
|
-
memo = corpus.reduce({:terms =>
|
45
|
-
doc_id = hash_func doc
|
45
|
+
memo = corpus.reduce({:terms => SortedSet.new, :docs => ActiveSupport::OrderedHash.new({}), :dtm => {}, :tdm => {}}) do |memo, doc|
|
46
|
+
doc_id = TFIDF.hash_func doc
|
46
47
|
memo[:docs][doc_id] = doc
|
47
48
|
tf_single_doc = TFIDF.tf_single(doc)
|
48
|
-
memo[:
|
49
|
+
memo[:dtm][doc_id] = tf_single_doc
|
49
50
|
tf_single_doc.each do |keyvalue|
|
50
51
|
term, freq = keyvalue
|
51
|
-
term_id = hash_func term
|
52
|
-
|
53
|
-
if !x.has_key?(term_id)
|
54
|
-
x[term_id] = term
|
55
|
-
end}.call memo[:terms]
|
52
|
+
# term_id = TFIDF.hash_func term
|
53
|
+
memo[:terms].add term
|
56
54
|
lambda {|x|
|
57
55
|
if x[term] != nil
|
58
56
|
x[term][doc_id] = freq
|
59
57
|
else
|
60
58
|
x[term] = {doc_id => freq}
|
61
59
|
end
|
62
|
-
}.call memo[:
|
60
|
+
}.call memo[:tdm]
|
63
61
|
end
|
64
62
|
memo
|
65
63
|
end
|
66
64
|
@docs = memo[:docs]
|
67
|
-
@terms =
|
68
|
-
|
69
|
-
|
70
|
-
|
65
|
+
@terms = ActiveSupport::OrderedHash.new({})
|
66
|
+
memo[:terms].each do |term|
|
67
|
+
term_id = TFIDF.hash_func term
|
68
|
+
@terms[term_id] = term
|
69
|
+
end
|
70
|
+
@tdm = memo[:tdm]
|
71
|
+
@dtm = memo[:dtm]
|
72
|
+
@tdm.each {|k, v|
|
71
73
|
@idf[k] = TFIDF.idf(v.size, @cardinality)}
|
72
74
|
end
|
73
75
|
|
@@ -112,14 +114,14 @@ class TFIDF
|
|
112
114
|
return @terms
|
113
115
|
end
|
114
116
|
|
115
|
-
#
|
116
|
-
def
|
117
|
-
return @
|
117
|
+
#Document Term Matrix, in sparse List of lists(LIL)
|
118
|
+
def dtm
|
119
|
+
return @dtm
|
118
120
|
end
|
119
121
|
|
120
|
-
#
|
121
|
-
def
|
122
|
-
return @
|
122
|
+
#Term Document Matrix, in sparse List of lists(LIL)
|
123
|
+
def tdm
|
124
|
+
return @tdm
|
123
125
|
end
|
124
126
|
|
125
127
|
#=Arguments
|
@@ -143,11 +145,11 @@ class TFIDF
|
|
143
145
|
# Everything
|
144
146
|
def tf(term=nil, doc=nil)
|
145
147
|
if term == nil || doc == nil
|
146
|
-
return @
|
148
|
+
return @dtm
|
147
149
|
elsif term == nil
|
148
|
-
return @
|
150
|
+
return @dtm[doc]
|
149
151
|
else
|
150
|
-
return lambda {|x| (x == nil)?0:x}.call(@
|
152
|
+
return lambda {|x| (x == nil)?0:x}.call(@dtm[doc][term])
|
151
153
|
end
|
152
154
|
end
|
153
155
|
|
@@ -167,6 +169,28 @@ class TFIDF
|
|
167
169
|
def tfidf(term, doc)
|
168
170
|
return tf(term,doc) * idf(term)
|
169
171
|
end
|
172
|
+
|
173
|
+
#Access, or calculate if not present, a dense DTM in GSL::Matrix
|
174
|
+
#Each row corresponds to a document, each column a term
|
175
|
+
#Use TFIDF#terms.values and TFIDF#docs.values to find column/row index of a specific document/term
|
176
|
+
def dense()
|
177
|
+
return @dense_matrix unless @dense_matrix.nil?
|
178
|
+
dense_matrix = GSL::Matrix.alloc(@docs.size, @terms.size)
|
179
|
+
(0...@docs.size).each do |i|
|
180
|
+
doc_id = @docs.keys[i]
|
181
|
+
@dtm[doc_id].each do |term,freq|
|
182
|
+
idx = @terms.values.index term
|
183
|
+
dense_matrix.set([i,idx], freq)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
@dense_matrix = dense_matrix
|
187
|
+
return @dense_matrix
|
188
|
+
end
|
189
|
+
|
190
|
+
#TODO: Merge 2 TFIDF objects
|
191
|
+
def self.merge()
|
192
|
+
|
193
|
+
end
|
170
194
|
|
171
195
|
#Simply the formula for tf*idf
|
172
196
|
def self.idf(x,cardinality)
|
data/test/test_TFIDF.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'test/unit'
|
3
3
|
require 'tfidf'
|
4
|
-
|
4
|
+
|
5
5
|
require 'ruby-debug'
|
6
6
|
Debugger.start(:post_mortem => true)
|
7
7
|
Debugger.settings[:autoeval] = true
|
8
|
-
|
8
|
+
|
9
9
|
|
10
10
|
class TFIDFTest < Test::Unit::TestCase
|
11
11
|
def setup
|
@@ -16,7 +16,7 @@ class TFIDFTest < Test::Unit::TestCase
|
|
16
16
|
def test_arbitrary_text
|
17
17
|
#Just an arbitrary test on a single text, the number isn't definitive
|
18
18
|
tfidf = TFIDF.new @@text1
|
19
|
-
|
19
|
+
assert tfidf.terms.size > 50
|
20
20
|
end
|
21
21
|
|
22
22
|
|
@@ -38,6 +38,19 @@ class TFIDFTest < Test::Unit::TestCase
|
|
38
38
|
assert TFIDF.idf(1,2) == 0
|
39
39
|
assert TFIDF.idf(0,2) == 1
|
40
40
|
end
|
41
|
+
|
42
|
+
def test_terms_should_be_sorted
|
43
|
+
it = @tfidf.terms.values.each
|
44
|
+
begin
|
45
|
+
while true
|
46
|
+
e1 = it.next
|
47
|
+
e2 = it.peek
|
48
|
+
assert e2 > e1
|
49
|
+
end
|
50
|
+
rescue StopIteration => stop_it
|
51
|
+
#do nothing
|
52
|
+
end
|
53
|
+
end
|
41
54
|
|
42
55
|
def teardown
|
43
56
|
#I don't do nothing
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tfidf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-03-17 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: fast-stemmer
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,12 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
description: Calculate TF-IDF out of a text, resulting in a hash with term as key,
|
26
31
|
frequency as value. Sorry for taking the convenient name for myself! See examples/demo_tf.rb
|
27
32
|
for usage
|
@@ -30,6 +35,7 @@ executables: []
|
|
30
35
|
extensions: []
|
31
36
|
extra_rdoc_files: []
|
32
37
|
files:
|
38
|
+
- Rakefile
|
33
39
|
- lib/tfidf.rb
|
34
40
|
- examples/demo_tf.rb
|
35
41
|
- test/test_TFIDF.rb
|
@@ -53,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
59
|
version: '0'
|
54
60
|
requirements: []
|
55
61
|
rubyforge_project:
|
56
|
-
rubygems_version: 1.8.
|
62
|
+
rubygems_version: 1.8.19
|
57
63
|
signing_key:
|
58
64
|
specification_version: 3
|
59
65
|
summary: A W.I.P implementation of TF-IDF
|