tfidf 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/examples/demo_tf.rb +6 -4
  2. data/lib/tfidf.rb +32 -1
  3. metadata +2 -1
@@ -2,6 +2,7 @@
2
2
  require 'tfidf'
3
3
  require 'pp'
4
4
  require 'ruby-debug'
5
+ require 'mongo_mapper'
5
6
  Debugger.settings[:autoeval] = true
6
7
  Debugger.start
7
8
  Debugger.post_mortem
@@ -119,7 +120,7 @@ Robison, a veteran deep-sea ecologist, said that whatever does live in the trenc
119
120
  Much of the deepest ocean is unreachable via state-owned submersibles, which at this point can dive no more than 21,000 feet (6,500 m). Only Japan's Shinkai 6500 has reached such depths. The United States is refurbishing Alvin, its deepest-diving craft, to be able to reach 21,000 feet within th_ ]
120
121
 
121
122
  tfidf = TFIDF.new corpus
122
-
123
+ =begin
123
124
  puts "Documents in the corpus:"
124
125
  tfidf.docs.each {|k,v| puts "Document ID: #{k} => term: #{v}"}
125
126
 
@@ -143,6 +144,7 @@ puts tfidf.tfidf("octocat","7e38fa195cee92d2e7d834095d6938a89b5fdd58")
143
144
 
144
145
  puts "DTM in dense matrix"
145
146
  puts tfidf.dense
146
-
147
-
148
-
147
+ =end
148
+ tfidf_mongo = Object.to_mongo tfidf
149
+ debugger
150
+ puts tfidf_mongo.class
@@ -14,7 +14,7 @@ class TFIDF
14
14
  def self.hash_func(obj)
15
15
  return Digest::SHA1.hexdigest obj
16
16
  end
17
-
17
+
18
18
  #=Arguments
19
19
  # corpus: an array of strings, one string per document
20
20
  #=Returns
@@ -73,6 +73,19 @@ class TFIDF
73
73
  @idf[k] = TFIDF.idf(v.size, @cardinality)}
74
74
  end
75
75
 
76
+ #TODO: really make this work as intended
77
+ def self.from_tf(tf_in_hash)
78
+ @dtm = OrderedHash.new
79
+ @docs = OrderedHash.new
80
+ @terms = SortedSet.new
81
+ tf_in_hash.map {|e|
82
+ id = TFIDF.hash_func Time.now
83
+ @dtm[id] = e
84
+ @docs[id] = "Place holder"
85
+ e.keys.map {|t| @terms.add t}
86
+ }
87
+ end
88
+
76
89
  #Build a TF vector out of a single document(String)
77
90
  #=Argument:
78
91
  # String valued document
@@ -170,6 +183,24 @@ class TFIDF
170
183
  return tf(term,doc) * idf(term)
171
184
  end
172
185
 
186
+ def self.sparse_to_dense(array_of_ordered_hashes)
187
+ terms = SortedSet.new
188
+ array_of_ordered_hashes.each {|ordered_hash|
189
+ ordered_hash.keys.each {|k| terms.add(k)}}
190
+ terms = terms.to_a
191
+ n = array_of_ordered_hashes.length
192
+ d = terms.length
193
+ dense_matrix = GSL::Matrix.alloc(n,d)
194
+ (0...n).each do |i|
195
+ array_of_ordered_hashes[i].each do |term, freq|
196
+ idx = terms.index term
197
+ dense_matrix.set([i,idx], freq)
198
+ end
199
+ end
200
+ return dense_matrix
201
+ end
202
+
203
+
173
204
  #Access, or calculate if not present, a dense DTM in GSL::Matrix
174
205
  #Each row corresponds to a document, each column a term
175
206
  #Use TFIDF#terms.values and TFIDF#docs.values to find the column/row index of a specific term/document
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tfidf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -64,3 +64,4 @@ signing_key:
64
64
  specification_version: 3
65
65
  summary: A W.I.P implementation of TF-IDF
66
66
  test_files: []
67
+ has_rdoc: