tfidf 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +8 -0
- data/examples/demo_tf.rb +8 -2
- data/lib/tfidf.rb +50 -26
- data/test/test_TFIDF.rb +16 -3
- metadata +10 -4
data/Rakefile
ADDED
data/examples/demo_tf.rb
CHANGED
@@ -127,10 +127,10 @@ puts "Terms in the corpus"
|
|
127
127
|
tfidf.terms.each {|k,v| puts "Term ID: #{k} => term: #{v}"}
|
128
128
|
|
129
129
|
puts "Document-Term Matrix, sparse List of lists(LIL)"
|
130
|
-
tfidf.
|
130
|
+
tfidf.dtm.each {|e| puts e}
|
131
131
|
|
132
132
|
puts "Term-Document Matrix, sparse LIL"
|
133
|
-
tfidf.
|
133
|
+
tfidf.tdm.each {|e| puts e}
|
134
134
|
|
135
135
|
puts "Term Frequency of word: video in document"
|
136
136
|
puts tfidf.tf
|
@@ -140,3 +140,9 @@ puts tfidf.idf("octocat")
|
|
140
140
|
|
141
141
|
puts "TF-IDF of word : octocats in document 7e38fa195cee92d2e7d834095d6938a89b5fdd58"
|
142
142
|
puts tfidf.tfidf("octocat","7e38fa195cee92d2e7d834095d6938a89b5fdd58")
|
143
|
+
|
144
|
+
puts "DTM in dense matrix"
|
145
|
+
puts tfidf.dense
|
146
|
+
|
147
|
+
|
148
|
+
|
data/lib/tfidf.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'fast_stemmer'
|
3
3
|
require 'digest'
|
4
|
+
require 'active_support'
|
5
|
+
require 'gsl'
|
4
6
|
|
5
7
|
|
6
8
|
class TFIDF
|
@@ -9,7 +11,7 @@ class TFIDF
|
|
9
11
|
@@split_pattern = /[\W]/
|
10
12
|
|
11
13
|
#Hash function used for generating id for documents as well as terms
|
12
|
-
def hash_func(obj)
|
14
|
+
def self.hash_func(obj)
|
13
15
|
return Digest::SHA1.hexdigest obj
|
14
16
|
end
|
15
17
|
|
@@ -26,9 +28,8 @@ class TFIDF
|
|
26
28
|
def initialize(corpus)
|
27
29
|
@cardinality = 0
|
28
30
|
@docs = {}
|
29
|
-
@
|
30
|
-
@
|
31
|
-
@sparse_matrix_doc_idx = {}
|
31
|
+
@tdm = {}
|
32
|
+
@dtm = {}
|
32
33
|
@idf = {}
|
33
34
|
|
34
35
|
#not in use
|
@@ -41,33 +42,34 @@ class TFIDF
|
|
41
42
|
else
|
42
43
|
@cardinality = corpus.length
|
43
44
|
end
|
44
|
-
memo = corpus.reduce({:terms =>
|
45
|
-
doc_id = hash_func doc
|
45
|
+
memo = corpus.reduce({:terms => SortedSet.new, :docs => ActiveSupport::OrderedHash.new({}), :dtm => {}, :tdm => {}}) do |memo, doc|
|
46
|
+
doc_id = TFIDF.hash_func doc
|
46
47
|
memo[:docs][doc_id] = doc
|
47
48
|
tf_single_doc = TFIDF.tf_single(doc)
|
48
|
-
memo[:
|
49
|
+
memo[:dtm][doc_id] = tf_single_doc
|
49
50
|
tf_single_doc.each do |keyvalue|
|
50
51
|
term, freq = keyvalue
|
51
|
-
term_id = hash_func term
|
52
|
-
|
53
|
-
if !x.has_key?(term_id)
|
54
|
-
x[term_id] = term
|
55
|
-
end}.call memo[:terms]
|
52
|
+
# term_id = TFIDF.hash_func term
|
53
|
+
memo[:terms].add term
|
56
54
|
lambda {|x|
|
57
55
|
if x[term] != nil
|
58
56
|
x[term][doc_id] = freq
|
59
57
|
else
|
60
58
|
x[term] = {doc_id => freq}
|
61
59
|
end
|
62
|
-
}.call memo[:
|
60
|
+
}.call memo[:tdm]
|
63
61
|
end
|
64
62
|
memo
|
65
63
|
end
|
66
64
|
@docs = memo[:docs]
|
67
|
-
@terms =
|
68
|
-
|
69
|
-
|
70
|
-
|
65
|
+
@terms = ActiveSupport::OrderedHash.new({})
|
66
|
+
memo[:terms].each do |term|
|
67
|
+
term_id = TFIDF.hash_func term
|
68
|
+
@terms[term_id] = term
|
69
|
+
end
|
70
|
+
@tdm = memo[:tdm]
|
71
|
+
@dtm = memo[:dtm]
|
72
|
+
@tdm.each {|k, v|
|
71
73
|
@idf[k] = TFIDF.idf(v.size, @cardinality)}
|
72
74
|
end
|
73
75
|
|
@@ -112,14 +114,14 @@ class TFIDF
|
|
112
114
|
return @terms
|
113
115
|
end
|
114
116
|
|
115
|
-
#
|
116
|
-
def
|
117
|
-
return @
|
117
|
+
#Document Term Matrix, in sparse List of lists(LIL)
|
118
|
+
def dtm
|
119
|
+
return @dtm
|
118
120
|
end
|
119
121
|
|
120
|
-
#
|
121
|
-
def
|
122
|
-
return @
|
122
|
+
#Term Document Matrix, in sparse List of lists(LIL)
|
123
|
+
def tdm
|
124
|
+
return @tdm
|
123
125
|
end
|
124
126
|
|
125
127
|
#=Arguments
|
@@ -143,11 +145,11 @@ class TFIDF
|
|
143
145
|
# Everything
|
144
146
|
def tf(term=nil, doc=nil)
|
145
147
|
if term == nil || doc == nil
|
146
|
-
return @
|
148
|
+
return @dtm
|
147
149
|
elsif term == nil
|
148
|
-
return @
|
150
|
+
return @dtm[doc]
|
149
151
|
else
|
150
|
-
return lambda {|x| (x == nil)?0:x}.call(@
|
152
|
+
return lambda {|x| (x == nil)?0:x}.call(@dtm[doc][term])
|
151
153
|
end
|
152
154
|
end
|
153
155
|
|
@@ -167,6 +169,28 @@ class TFIDF
|
|
167
169
|
def tfidf(term, doc)
|
168
170
|
return tf(term,doc) * idf(term)
|
169
171
|
end
|
172
|
+
|
173
|
+
#Access, or calculate if not present, a dense DTM in GSL::Matrix
|
174
|
+
#Each row corresponds to a document, each column a term
|
175
|
+
#Use TFIDF#terms.values and TFIDF#docs.values to find the column/row index of a specific term/document
|
176
|
+
def dense()
|
177
|
+
return @dense_matrix unless @dense_matrix.nil?
|
178
|
+
dense_matrix = GSL::Matrix.alloc(@docs.size, @terms.size)
|
179
|
+
(0...@docs.size).each do |i|
|
180
|
+
doc_id = @docs.keys[i]
|
181
|
+
@dtm[doc_id].each do |term,freq|
|
182
|
+
idx = @terms.values.index term
|
183
|
+
dense_matrix.set([i,idx], freq)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
@dense_matrix = dense_matrix
|
187
|
+
return @dense_matrix
|
188
|
+
end
|
189
|
+
|
190
|
+
#TODO: Merge 2 TFIDF objects
|
191
|
+
def self.merge()
|
192
|
+
|
193
|
+
end
|
170
194
|
|
171
195
|
#Simply the formula for idf
|
172
196
|
def self.idf(x,cardinality)
|
data/test/test_TFIDF.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'test/unit'
|
3
3
|
require 'tfidf'
|
4
|
-
|
4
|
+
|
5
5
|
require 'ruby-debug'
|
6
6
|
Debugger.start(:post_mortem => true)
|
7
7
|
Debugger.settings[:autoeval] = true
|
8
|
-
|
8
|
+
|
9
9
|
|
10
10
|
class TFIDFTest < Test::Unit::TestCase
|
11
11
|
def setup
|
@@ -16,7 +16,7 @@ class TFIDFTest < Test::Unit::TestCase
|
|
16
16
|
def test_arbitrary_text
|
17
17
|
#Just an arbitrary test on a single text, the number isn't definitive
|
18
18
|
tfidf = TFIDF.new @@text1
|
19
|
-
|
19
|
+
assert tfidf.terms.size > 50
|
20
20
|
end
|
21
21
|
|
22
22
|
|
@@ -38,6 +38,19 @@ class TFIDFTest < Test::Unit::TestCase
|
|
38
38
|
assert TFIDF.idf(1,2) == 0
|
39
39
|
assert TFIDF.idf(0,2) == 1
|
40
40
|
end
|
41
|
+
|
42
|
+
def test_terms_should_be_sorted
|
43
|
+
it = @tfidf.terms.values.each
|
44
|
+
begin
|
45
|
+
while true
|
46
|
+
e1 = it.next
|
47
|
+
e2 = it.peek
|
48
|
+
assert e2 > e1
|
49
|
+
end
|
50
|
+
rescue StopIteration => stop_it
|
51
|
+
#do nothing
|
52
|
+
end
|
53
|
+
end
|
41
54
|
|
42
55
|
def teardown
|
43
56
|
#Nothing to tear down
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tfidf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-03-17 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: fast-stemmer
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,12 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
description: Calculate TF-IDF out of a text, resulting in a hash with term as key,
|
26
31
|
frequency as value. Sorry for taking the convenient name for myself! See examples/demo_tf.rb
|
27
32
|
for usage
|
@@ -30,6 +35,7 @@ executables: []
|
|
30
35
|
extensions: []
|
31
36
|
extra_rdoc_files: []
|
32
37
|
files:
|
38
|
+
- Rakefile
|
33
39
|
- lib/tfidf.rb
|
34
40
|
- examples/demo_tf.rb
|
35
41
|
- test/test_TFIDF.rb
|
@@ -53,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
59
|
version: '0'
|
54
60
|
requirements: []
|
55
61
|
rubyforge_project:
|
56
|
-
rubygems_version: 1.8.
|
62
|
+
rubygems_version: 1.8.19
|
57
63
|
signing_key:
|
58
64
|
specification_version: 3
|
59
65
|
summary: A W.I.P implementation of TF-IDF
|