tfidf 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
+ Rake::TestTask.new do |t|
4
+ t.libs << 'test'
5
+ end
6
+
7
+ desc "Run tests"
8
+ task :default => :test
@@ -127,10 +127,10 @@ puts "Terms in the corpus"
127
127
  tfidf.terms.each {|k,v| puts "Term ID: #{k} => term: #{v}"}
128
128
 
129
129
  puts "Document-Term Matrix, sparse List of lists(LIL)"
130
- tfidf.sparse_matrix_doc_idx.each {|e| puts e}
130
+ tfidf.dtm.each {|e| puts e}
131
131
 
132
132
  puts "Term-Document Matrix, sparse LIL"
133
- tfidf.sparse_matrix_term_idx.each {|e| puts e}
133
+ tfidf.tdm.each {|e| puts e}
134
134
 
135
135
  puts "Term Frequency of word: video in document"
136
136
  puts tfidf.tf
@@ -140,3 +140,9 @@ puts tfidf.idf("octocat")
140
140
 
141
141
  puts "TF-IDF of word : octocats in document 7e38fa195cee92d2e7d834095d6938a89b5fdd58"
142
142
  puts tfidf.tfidf("octocat","7e38fa195cee92d2e7d834095d6938a89b5fdd58")
143
+
144
+ puts "DTM in dense matrix"
145
+ puts tfidf.dense
146
+
147
+
148
+
@@ -1,6 +1,8 @@
1
1
  require 'set'
2
2
  require 'fast_stemmer'
3
3
  require 'digest'
4
+ require 'active_support'
5
+ require 'gsl'
4
6
 
5
7
 
6
8
  class TFIDF
@@ -9,7 +11,7 @@ class TFIDF
9
11
  @@split_pattern = /[\W]/
10
12
 
11
13
  #Hash function used for generating id for documents as well as terms
12
- def hash_func(obj)
14
+ def self.hash_func(obj)
13
15
  return Digest::SHA1.hexdigest obj
14
16
  end
15
17
 
@@ -26,9 +28,8 @@ class TFIDF
26
28
  def initialize(corpus)
27
29
  @cardinality = 0
28
30
  @docs = {}
29
- @terms = {}
30
- @sparse_matrix_term_idx = {}
31
- @sparse_matrix_doc_idx = {}
31
+ @tdm = {}
32
+ @dtm = {}
32
33
  @idf = {}
33
34
 
34
35
  #not in use
@@ -41,33 +42,34 @@ class TFIDF
41
42
  else
42
43
  @cardinality = corpus.length
43
44
  end
44
- memo = corpus.reduce({:terms => {}, :docs => {}, :sparse_matrix_doc_idx => {}, :sparse_matrix_term_idx => {}}) do |memo, doc|
45
- doc_id = hash_func doc
45
+ memo = corpus.reduce({:terms => SortedSet.new, :docs => ActiveSupport::OrderedHash.new({}), :dtm => {}, :tdm => {}}) do |memo, doc|
46
+ doc_id = TFIDF.hash_func doc
46
47
  memo[:docs][doc_id] = doc
47
48
  tf_single_doc = TFIDF.tf_single(doc)
48
- memo[:sparse_matrix_doc_idx][doc_id] = tf_single_doc
49
+ memo[:dtm][doc_id] = tf_single_doc
49
50
  tf_single_doc.each do |keyvalue|
50
51
  term, freq = keyvalue
51
- term_id = hash_func term
52
- lambda {|x|
53
- if !x.has_key?(term_id)
54
- x[term_id] = term
55
- end}.call memo[:terms]
52
+ # term_id = TFIDF.hash_func term
53
+ memo[:terms].add term
56
54
  lambda {|x|
57
55
  if x[term] != nil
58
56
  x[term][doc_id] = freq
59
57
  else
60
58
  x[term] = {doc_id => freq}
61
59
  end
62
- }.call memo[:sparse_matrix_term_idx]
60
+ }.call memo[:tdm]
63
61
  end
64
62
  memo
65
63
  end
66
64
  @docs = memo[:docs]
67
- @terms = memo[:terms]
68
- @sparse_matrix_term_idx = memo[:sparse_matrix_term_idx]
69
- @sparse_matrix_doc_idx = memo[:sparse_matrix_doc_idx]
70
- @sparse_matrix_term_idx.each {|k, v|
65
+ @terms = ActiveSupport::OrderedHash.new({})
66
+ memo[:terms].each do |term|
67
+ term_id = TFIDF.hash_func term
68
+ @terms[term_id] = term
69
+ end
70
+ @tdm = memo[:tdm]
71
+ @dtm = memo[:dtm]
72
+ @tdm.each {|k, v|
71
73
  @idf[k] = TFIDF.idf(v.size, @cardinality)}
72
74
  end
73
75
 
@@ -112,14 +114,14 @@ class TFIDF
112
114
  return @terms
113
115
  end
114
116
 
115
- #Aka DTM, in sparse List of lists(LIL)
116
- def sparse_matrix_doc_idx
117
- return @sparse_matrix_doc_idx
117
+ #Document Term Matrix, in sparse List of lists(LIL)
118
+ def dtm
119
+ return @dtm
118
120
  end
119
121
 
120
- #Aka TDM, in sparse List of lists(LIL)
121
- def sparse_matrix_term_idx
122
- return @sparse_matrix_term_idx
122
+ #Term Document Matrix, in sparse List of lists(LIL)
123
+ def tdm
124
+ return @tdm
123
125
  end
124
126
 
125
127
  #=Arguments
@@ -143,11 +145,11 @@ class TFIDF
143
145
  # Everything
144
146
  def tf(term=nil, doc=nil)
145
147
  if term == nil || doc == nil
146
- return @sparse_matrix_doc_idx
148
+ return @dtm
147
149
  elsif term == nil
148
- return @sparse_matrix_doc_idx[doc]
150
+ return @dtm[doc]
149
151
  else
150
- return lambda {|x| (x == nil)?0:x}.call(@sparse_matrix_doc_idx[doc][term])
152
+ return lambda {|x| (x == nil)?0:x}.call(@dtm[doc][term])
151
153
  end
152
154
  end
153
155
 
@@ -167,6 +169,28 @@ class TFIDF
167
169
  def tfidf(term, doc)
168
170
  return tf(term,doc) * idf(term)
169
171
  end
172
+
173
+ #Access, or calculate if not present, a dense DTM in GSL::Matrix
174
+ #Each row corresponds to a document, each column a term
175
+ #Use TFIDF#terms.values and TFIDF#docs.values to find the column/row index of a specific term/document
176
+ def dense()
177
+ return @dense_matrix unless @dense_matrix.nil?
178
+ dense_matrix = GSL::Matrix.alloc(@docs.size, @terms.size)
179
+ (0...@docs.size).each do |i|
180
+ doc_id = @docs.keys[i]
181
+ @dtm[doc_id].each do |term,freq|
182
+ idx = @terms.values.index term
183
+ dense_matrix.set([i,idx], freq)
184
+ end
185
+ end
186
+ @dense_matrix = dense_matrix
187
+ return @dense_matrix
188
+ end
189
+
190
+ #TODO: Merge 2 TFIDF objects
191
+ def self.merge()
192
+
193
+ end
170
194
 
171
195
  #Simply the formula for idf
172
196
  def self.idf(x,cardinality)
@@ -1,11 +1,11 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  require 'test/unit'
3
3
  require 'tfidf'
4
- =begin
4
+
5
5
  require 'ruby-debug'
6
6
  Debugger.start(:post_mortem => true)
7
7
  Debugger.settings[:autoeval] = true
8
- =end
8
+
9
9
 
10
10
  class TFIDFTest < Test::Unit::TestCase
11
11
  def setup
@@ -16,7 +16,7 @@ class TFIDFTest < Test::Unit::TestCase
16
16
  def test_arbitrary_text
17
17
  #Just an arbitrary test on a single text, the number isn't definitive
18
18
  tfidf = TFIDF.new @@text1
19
- pp tfidf.terms.size > 50
19
+ assert tfidf.terms.size > 50
20
20
  end
21
21
 
22
22
 
@@ -38,6 +38,19 @@ class TFIDFTest < Test::Unit::TestCase
38
38
  assert TFIDF.idf(1,2) == 0
39
39
  assert TFIDF.idf(0,2) == 1
40
40
  end
41
+
42
+ def test_terms_should_be_sorted
43
+ it = @tfidf.terms.values.each
44
+ begin
45
+ while true
46
+ e1 = it.next
47
+ e2 = it.peek
48
+ assert e2 > e1
49
+ end
50
+ rescue StopIteration => stop_it
51
+ #do nothing
52
+ end
53
+ end
41
54
 
42
55
  def teardown
43
56
  #Nothing to tear down
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tfidf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: fast-stemmer
16
- requirement: &16818040 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,12 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *16818040
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
25
30
  description: Calculate TF-IDF out of a text, resulting in a hash with term as key,
26
31
  frequency as value. Sorry for taking the convenient name for myself! See examples/demo_tf.rb
27
32
  for usage
@@ -30,6 +35,7 @@ executables: []
30
35
  extensions: []
31
36
  extra_rdoc_files: []
32
37
  files:
38
+ - Rakefile
33
39
  - lib/tfidf.rb
34
40
  - examples/demo_tf.rb
35
41
  - test/test_TFIDF.rb
@@ -53,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
53
59
  version: '0'
54
60
  requirements: []
55
61
  rubyforge_project:
56
- rubygems_version: 1.8.10
62
+ rubygems_version: 1.8.19
57
63
  signing_key:
58
64
  specification_version: 3
59
65
  summary: A W.I.P implementation of TF-IDF