tfidf 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/demo_tf.rb +6 -4
- data/lib/tfidf.rb +32 -1
- metadata +2 -1
data/examples/demo_tf.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'tfidf'
|
3
3
|
require 'pp'
|
4
4
|
require 'ruby-debug'
|
5
|
+
require 'mongo_mapper'
|
5
6
|
Debugger.settings[:autoeval] = true
|
6
7
|
Debugger.start
|
7
8
|
Debugger.post_mortem
|
@@ -119,7 +120,7 @@ Robison, a veteran deep-sea ecologist, said that whatever does live in the trenc
|
|
119
120
|
Much of the deepest ocean is unreachable via state-owned submersibles, which at this point can dive no more than 21,000 feet (6,500 m). Only Japan's Shinkai 6500 has reached such depths. The United States is refurbishing Alvin, its deepest-diving craft, to be able to reach 21,000 feet within th_ ]
|
120
121
|
|
121
122
|
tfidf = TFIDF.new corpus
|
122
|
-
|
123
|
+
=begin
|
123
124
|
puts "Documents in the corpus:"
|
124
125
|
tfidf.docs.each {|k,v| puts "Document ID: #{k} => term: #{v}"}
|
125
126
|
|
@@ -143,6 +144,7 @@ puts tfidf.tfidf("octocat","7e38fa195cee92d2e7d834095d6938a89b5fdd58")
|
|
143
144
|
|
144
145
|
puts "DTM in dense matrix"
|
145
146
|
puts tfidf.dense
|
146
|
-
|
147
|
-
|
148
|
-
|
147
|
+
=end
|
148
|
+
tfidf_mongo = Object.to_mongo tfidf
|
149
|
+
debugger
|
150
|
+
puts tfidf_mongo.class
|
data/lib/tfidf.rb
CHANGED
@@ -14,7 +14,7 @@ class TFIDF
|
|
14
14
|
def self.hash_func(obj)
|
15
15
|
return Digest::SHA1.hexdigest obj
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
#=Arguments
|
19
19
|
# corpus: an array of strings, one string per document
|
20
20
|
#=Returns
|
@@ -73,6 +73,19 @@ class TFIDF
|
|
73
73
|
@idf[k] = TFIDF.idf(v.size, @cardinality)}
|
74
74
|
end
|
75
75
|
|
76
|
+
#TODO: really make this work as intended
|
77
|
+
def self.from_tf(tf_in_hash)
|
78
|
+
@dtm = OrderedHash.new
|
79
|
+
@docs = OrderedHash.new
|
80
|
+
@terms = SortedSet.new
|
81
|
+
tf_in_hash.map {|e|
|
82
|
+
id = TFIDF.hash_func Time.now
|
83
|
+
@dtm[id] = e
|
84
|
+
@docs[id] = "Place holder"
|
85
|
+
e.keys.map {|t| @terms.add t}
|
86
|
+
}
|
87
|
+
end
|
88
|
+
|
76
89
|
#Build a TF vector out of a single document(String)
|
77
90
|
#=Argument:
|
78
91
|
# String valued document
|
@@ -170,6 +183,24 @@ class TFIDF
|
|
170
183
|
return tf(term,doc) * idf(term)
|
171
184
|
end
|
172
185
|
|
186
|
+
def self.sparse_to_dense(array_of_ordered_hashes)
|
187
|
+
terms = SortedSet.new
|
188
|
+
array_of_ordered_hashes.each {|ordered_hash|
|
189
|
+
ordered_hash.keys.each {|k| terms.add(k)}}
|
190
|
+
terms = terms.to_a
|
191
|
+
n = array_of_ordered_hashes.length
|
192
|
+
d = terms.length
|
193
|
+
dense_matrix = GSL::Matrix.alloc(n,d)
|
194
|
+
(0...n).each do |i|
|
195
|
+
array_of_ordered_hashes[i].each do |term, freq|
|
196
|
+
idx = terms.index term
|
197
|
+
dense_matrix.set([i,idx], freq)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
return dense_matrix
|
201
|
+
end
|
202
|
+
|
203
|
+
|
173
204
|
#Access, or calculate if not present, a dense DTM in GSL::Matrix
|
174
205
|
#Each row corresponds to a document, each column a term
|
175
206
|
#Use TFIDF#terms.values and TFIDF#docs.values to find column/row index of a specific document/term
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tfidf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -64,3 +64,4 @@ signing_key:
|
|
64
64
|
specification_version: 3
|
65
65
|
summary: A W.I.P implementation of TF-IDF
|
66
66
|
test_files: []
|
67
|
+
has_rdoc:
|