sclust 1.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,75 +0,0 @@
1
- require 'rubygems'
2
- require 'log4r'
3
-
4
module SClust
  # A collection of documents together with document-frequency statistics
  # for every term that appears in them.
  #
  # Documents added to the collection are expected to respond to
  # +each_term+ (yielding each of the document's terms) and to +terms+
  # (returning an object that supports +delete(term)+).
  class DocumentCollection
    # A hash where the keys are the terms in the documents and the values
    # are the number of documents containing the term (stored as Floats).
    attr_reader :terms

    # The list of documents, in the order they were added.
    attr_reader :doclist

    # Log4r::Logger for this document collection.
    attr_reader :logger

    # Creates an empty collection: no documents and no terms.
    def initialize
      @logger = Log4r::Logger.new("SClust::DocumentCollection")
      # Default of 0 lets the counters be incremented without an existence check.
      @terms = Hash.new(0)
      @doclist = []
    end

    # Adds the document +d+ to the collection, incrementing the counter in
    # @terms once for every term yielded by +d.each_term+.
    #
    # Unlike a conventional +, this mutates the receiver; it returns +self+
    # so that additions can be chained.
    def +(d)
      # Counts are kept as Floats so later ratio computations use float division.
      d.each_term { |term| @terms[term] += 1.0 }

      @doclist << d

      @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

      self
    end

    # Removes every term whose document count lies outside the given band
    # from @terms and from the +terms+ of each document in the collection.
    #
    # min_frequency:: lower bound, as a fraction of the number of documents.
    # max_frequency:: upper bound, as a fraction of the number of documents.
    #
    # Terms whose count is exactly on a boundary are kept.
    # Returns the document list.
    def drop_terms(min_frequency=0.10, max_frequency=0.80)
      min_docs = @doclist.length * min_frequency
      max_docs = @doclist.length * max_frequency

      @logger.info("Analyzing #{@terms.length} terms for removal.")
      @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

      # Decide what to remove first and delete afterwards, so that @terms is
      # never modified while it is being iterated.
      remove_list = @terms.select do |_term, frequency|
        frequency < min_docs || frequency > max_docs
      end.keys

      remove_list.each do |term|
        @logger.info("Removing term #{term} occuring in #{@terms[term]} documents out of #{@doclist.length}")
        @terms.delete(term)
      end

      @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

      @doclist.each do |doc|
        remove_list.each do |term|
          doc.terms.delete(term)
        end
      end
    end

    # Returns the inverse document frequency of +term+: the natural logarithm
    # of (number of documents / number of documents containing the term).
    #
    # A term that is not present in the collection yields 0.0 instead of
    # raising ZeroDivisionError.
    def inverse_document_frequency(term)
      containing = @terms[term]
      return 0.0 if containing.zero?

      # to_f guards against integer division should a count ever be an Integer.
      Math.log(@doclist.length / containing.to_f)
    end

    alias idf inverse_document_frequency

    # Yields every term known to the collection.
    def each_term(&block)
      @terms.each_key(&block)
    end
  end
end