tfidf 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
+ Rake::TestTask.new do |t|
4
+ t.libs << 'test'
5
+ end
6
+
7
+ desc "Run tests"
8
+ task :default => :test
@@ -127,10 +127,10 @@ puts "Terms in the corpus"
127
127
  tfidf.terms.each {|k,v| puts "Term ID: #{k} => term: #{v}"}
128
128
 
129
129
  puts "Document-Term Matrix, sparse List of lists(LIL)"
130
- tfidf.sparse_matrix_doc_idx.each {|e| puts e}
130
+ tfidf.dtm.each {|e| puts e}
131
131
 
132
132
  puts "Term-Document Matrix, sparse LIL"
133
- tfidf.sparse_matrix_term_idx.each {|e| puts e}
133
+ tfidf.tdm.each {|e| puts e}
134
134
 
135
135
  puts "Term Frequency of word: video in document"
136
136
  puts tfidf.tf
@@ -140,3 +140,9 @@ puts tfidf.idf("octocat")
140
140
 
141
141
  puts "TF-IDF of word : octocats in document 7e38fa195cee92d2e7d834095d6938a89b5fdd58"
142
142
  puts tfidf.tfidf("octocat","7e38fa195cee92d2e7d834095d6938a89b5fdd58")
143
+
144
+ puts "DTM in dense matrix"
145
+ puts tfidf.dense
146
+
147
+
148
+
@@ -1,6 +1,8 @@
1
1
  require 'set'
2
2
  require 'fast_stemmer'
3
3
  require 'digest'
4
+ require 'active_support'
5
+ require 'gsl'
4
6
 
5
7
 
6
8
  class TFIDF
@@ -9,7 +11,7 @@ class TFIDF
9
11
  @@split_pattern = /[\W]/
10
12
 
11
13
  #Hash function used for generating id for documents as well as terms
12
- def hash_func(obj)
14
+ def self.hash_func(obj)
13
15
  return Digest::SHA1.hexdigest obj
14
16
  end
15
17
 
@@ -26,9 +28,8 @@ class TFIDF
26
28
  def initialize(corpus)
27
29
  @cardinality = 0
28
30
  @docs = {}
29
- @terms = {}
30
- @sparse_matrix_term_idx = {}
31
- @sparse_matrix_doc_idx = {}
31
+ @tdm = {}
32
+ @dtm = {}
32
33
  @idf = {}
33
34
 
34
35
  #not in use
@@ -41,33 +42,34 @@ class TFIDF
41
42
  else
42
43
  @cardinality = corpus.length
43
44
  end
44
- memo = corpus.reduce({:terms => {}, :docs => {}, :sparse_matrix_doc_idx => {}, :sparse_matrix_term_idx => {}}) do |memo, doc|
45
- doc_id = hash_func doc
45
+ memo = corpus.reduce({:terms => SortedSet.new, :docs => ActiveSupport::OrderedHash.new({}), :dtm => {}, :tdm => {}}) do |memo, doc|
46
+ doc_id = TFIDF.hash_func doc
46
47
  memo[:docs][doc_id] = doc
47
48
  tf_single_doc = TFIDF.tf_single(doc)
48
- memo[:sparse_matrix_doc_idx][doc_id] = tf_single_doc
49
+ memo[:dtm][doc_id] = tf_single_doc
49
50
  tf_single_doc.each do |keyvalue|
50
51
  term, freq = keyvalue
51
- term_id = hash_func term
52
- lambda {|x|
53
- if !x.has_key?(term_id)
54
- x[term_id] = term
55
- end}.call memo[:terms]
52
+ # term_id = TFIDF.hash_func term
53
+ memo[:terms].add term
56
54
  lambda {|x|
57
55
  if x[term] != nil
58
56
  x[term][doc_id] = freq
59
57
  else
60
58
  x[term] = {doc_id => freq}
61
59
  end
62
- }.call memo[:sparse_matrix_term_idx]
60
+ }.call memo[:tdm]
63
61
  end
64
62
  memo
65
63
  end
66
64
  @docs = memo[:docs]
67
- @terms = memo[:terms]
68
- @sparse_matrix_term_idx = memo[:sparse_matrix_term_idx]
69
- @sparse_matrix_doc_idx = memo[:sparse_matrix_doc_idx]
70
- @sparse_matrix_term_idx.each {|k, v|
65
+ @terms = ActiveSupport::OrderedHash.new({})
66
+ memo[:terms].each do |term|
67
+ term_id = TFIDF.hash_func term
68
+ @terms[term_id] = term
69
+ end
70
+ @tdm = memo[:tdm]
71
+ @dtm = memo[:dtm]
72
+ @tdm.each {|k, v|
71
73
  @idf[k] = TFIDF.idf(v.size, @cardinality)}
72
74
  end
73
75
 
@@ -112,14 +114,14 @@ class TFIDF
112
114
  return @terms
113
115
  end
114
116
 
115
- #Aka DTM, in sparse List of lists(LIL)
116
- def sparse_matrix_doc_idx
117
- return @sparse_matrix_doc_idx
117
+ #Document Term Matrix, in sparse List of lists(LIL)
118
+ def dtm
119
+ return @dtm
118
120
  end
119
121
 
120
- #Aka TDM, in sparse List of lists(LIL)
121
- def sparse_matrix_term_idx
122
- return @sparse_matrix_term_idx
122
+ #Term Document Matrix, in sparse List of lists(LIL)
123
+ def tdm
124
+ return @tdm
123
125
  end
124
126
 
125
127
  #=Arguments
@@ -143,11 +145,11 @@ class TFIDF
143
145
  # Everything
144
146
  def tf(term=nil, doc=nil)
145
147
  if term == nil || doc == nil
146
- return @sparse_matrix_doc_idx
148
+ return @dtm
147
149
  elsif term == nil
148
- return @sparse_matrix_doc_idx[doc]
150
+ return @dtm[doc]
149
151
  else
150
- return lambda {|x| (x == nil)?0:x}.call(@sparse_matrix_doc_idx[doc][term])
152
+ return lambda {|x| (x == nil)?0:x}.call(@dtm[doc][term])
151
153
  end
152
154
  end
153
155
 
@@ -167,6 +169,28 @@ class TFIDF
167
169
  def tfidf(term, doc)
168
170
  return tf(term,doc) * idf(term)
169
171
  end
172
+
173
+ #Access, or calculate if not present, a dense DTM in GSL::Matrix
174
+ #Each row corresponds to a document, each column a term
175
+ #Use TFIDF#terms.values and TFIDF#docs.values to find column/row index of a specific document/term
176
+ def dense()
177
+ return @dense_matrix unless @dense_matrix.nil?
178
+ dense_matrix = GSL::Matrix.alloc(@docs.size, @terms.size)
179
+ (0...@docs.size).each do |i|
180
+ doc_id = @docs.keys[i]
181
+ @dtm[doc_id].each do |term,freq|
182
+ idx = @terms.values.index term
183
+ dense_matrix.set([i,idx], freq)
184
+ end
185
+ end
186
+ @dense_matrix = dense_matrix
187
+ return @dense_matrix
188
+ end
189
+
190
+ #TODO: Merge 2 TFIDF objects
191
+ def self.merge()
192
+
193
+ end
170
194
 
171
195
  #Simply the formula for tf*idf
172
196
  def self.idf(x,cardinality)
@@ -1,11 +1,11 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  require 'test/unit'
3
3
  require 'tfidf'
4
- =begin
4
+
5
5
  require 'ruby-debug'
6
6
  Debugger.start(:post_mortem => true)
7
7
  Debugger.settings[:autoeval] = true
8
- =end
8
+
9
9
 
10
10
  class TFIDFTest < Test::Unit::TestCase
11
11
  def setup
@@ -16,7 +16,7 @@ class TFIDFTest < Test::Unit::TestCase
16
16
  def test_arbitrary_text
17
17
  #Just an arbitrary test on a single text, the number isn't definitive
18
18
  tfidf = TFIDF.new @@text1
19
- pp tfidf.terms.size > 50
19
+ assert tfidf.terms.size > 50
20
20
  end
21
21
 
22
22
 
@@ -38,6 +38,19 @@ class TFIDFTest < Test::Unit::TestCase
38
38
  assert TFIDF.idf(1,2) == 0
39
39
  assert TFIDF.idf(0,2) == 1
40
40
  end
41
+
42
+ def test_terms_should_be_sorted
43
+ it = @tfidf.terms.values.each
44
+ begin
45
+ while true
46
+ e1 = it.next
47
+ e2 = it.peek
48
+ assert e2 > e1
49
+ end
50
+ rescue StopIteration => stop_it
51
+ #do nothing
52
+ end
53
+ end
41
54
 
42
55
  def teardown
43
56
  #I don't do nothing
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tfidf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: fast-stemmer
16
- requirement: &16818040 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,12 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *16818040
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
25
30
  description: Calculate TF-IDF out of a text, resulting in a hash with term as key,
26
31
  frequency as value. Sorry for taking the convenient name for myself! See examples/demo_tf.rb
27
32
  for usage
@@ -30,6 +35,7 @@ executables: []
30
35
  extensions: []
31
36
  extra_rdoc_files: []
32
37
  files:
38
+ - Rakefile
33
39
  - lib/tfidf.rb
34
40
  - examples/demo_tf.rb
35
41
  - test/test_TFIDF.rb
@@ -53,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
53
59
  version: '0'
54
60
  requirements: []
55
61
  rubyforge_project:
56
- rubygems_version: 1.8.10
62
+ rubygems_version: 1.8.19
57
63
  signing_key:
58
64
  specification_version: 3
59
65
  summary: A W.I.P implementation of TF-IDF