sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,75 +0,0 @@
1
- require 'rubygems'
2
- require 'log4r'
3
-
4
module SClust
  # A collection of documents together with the document frequency of every
  # term seen in them.
  #
  # Documents added with #+ are expected to respond to +each_term+ (yielding
  # each term) and, for #drop_terms, to +terms+ (an object responding to
  # +delete+).
  class DocumentCollection

    # terms - a hash where the keys are the terms in the documents and the
    # values stored are the number of documents containing the term.
    attr_reader :terms

    # A list of documents
    attr_reader :doclist

    # Log4r::Logger for this document collection.
    attr_reader :logger

    # Create an empty collection: no documents and no terms.
    def initialize
      @logger  = Log4r::Logger.new("SClust::DocumentCollection")
      # Unknown terms read as 0 without being stored in the hash.
      @terms   = Hash.new(0)
      @doclist = []
    end

    # Add a document to the collection and adjust the @terms attribute to
    # store any new terms in the document. The document is also added to the
    # @doclist attribute.
    #
    # The count of a term is incremented once per term yielded by
    # d.each_term. NOTE(review): this is a true document frequency only if
    # each_term yields every distinct term once per document -- confirm
    # against the document class.
    #
    # Unlike a conventional +, this mutates the receiver and returns self so
    # that additions can be chained.
    def +(d)
      d.each_term do |term|
        # Float increments keep the later frequency divisions in floating point.
        @terms[term] += 1.0
      end

      @doclist << d

      @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

      self
    end

    # Remove terms that occur in too few or too many documents.
    #
    # min_frequency - lower bound, as a fraction of the number of documents.
    # max_frequency - upper bound, as a fraction of the number of documents.
    #
    # Terms whose document count is strictly below the lower bound or strictly
    # above the upper bound are deleted from @terms and from the +terms+ of
    # every document in @doclist. Returns @doclist.
    def drop_terms(min_frequency=0.10, max_frequency=0.80)
      min_docs = @doclist.length * min_frequency
      max_docs = @doclist.length * max_frequency

      @logger.info("Analyzing #{@terms.length} terms for removal.")
      @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

      # Collect first and delete afterwards so that @terms is never modified
      # while it is being iterated.
      remove_list = []

      @terms.each do |term, frequency|
        if frequency < min_docs || frequency > max_docs
          @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
          remove_list << term
        end
      end

      remove_list.each { |term| @terms.delete(term) }

      @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

      @doclist.each do |doc|
        remove_list.each do |term|
          doc.terms.delete(term)
        end
      end
    end

    # Inverse document frequency of a term: the natural logarithm of the
    # number of documents in the collection divided by the number of
    # documents containing the term.
    #
    # Raises ZeroDivisionError for a term that is not in the collection,
    # because its stored count is the Integer default 0.
    def inverse_document_frequency(term)
      Math.log(@doclist.length / @terms[term])
    end

    alias idf inverse_document_frequency

    # Yield every term known to the collection.
    def each_term(&c)
      @terms.each_key { |k| yield k }
    end
  end
end