classifier 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +341 -0
- data/README +59 -6
- data/Rakefile +16 -4
- data/bin/bayes.rb +8 -2
- data/doc/classes/Classifier.html +15 -10
- data/doc/classes/Classifier/Bayes.html +68 -38
- data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
- data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
- data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
- data/doc/classes/Classifier/ContentNode.html +252 -0
- data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
- data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
- data/doc/classes/Classifier/LSI.html +449 -0
- data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
- data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
- data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
- data/doc/classes/Classifier/WordList.html +202 -0
- data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
- data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
- data/doc/classes/GSL.html +111 -0
- data/doc/classes/GSL/Vector.html +156 -0
- data/doc/classes/GSL/Vector.src/M000005.html +18 -0
- data/doc/classes/GSL/Vector.src/M000006.html +19 -0
- data/doc/classes/Object.html +139 -0
- data/doc/classes/Object.src/M000001.html +16 -0
- data/doc/classes/String.html +95 -9
- data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
- data/doc/classes/String.src/M000003.html +18 -0
- data/doc/classes/String.src/M000004.html +18 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +102 -12
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
- data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
- data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
- data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
- data/doc/files/lib/classifier/lsi_rb.html +125 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
- data/doc/files/lib/classifier_rb.html +3 -1
- data/doc/fr_class_index.html +6 -2
- data/doc/fr_file_index.html +5 -2
- data/doc/fr_method_index.html +34 -11
- data/lib/classifier.rb +3 -1
- data/lib/classifier/bayes.rb +34 -9
- data/lib/classifier/extensions/vector_serialize.rb +14 -0
- data/lib/classifier/extensions/word_hash.rb +125 -0
- data/lib/classifier/extensions/word_list.rb +31 -0
- data/lib/classifier/lsi.rb +248 -0
- data/lib/classifier/lsi/content_node.rb +67 -0
- data/lib/classifier/string_extensions.rb +10 -5
- data/test/bayes/bayesian_test.rb +2 -2
- data/test/lsi/lsi_test.rb +88 -0
- data/test/string_extensions/word_hash_test.rb +7 -5
- metadata +79 -24
- data/doc/classes/Classifier/Stemmable.html +0 -243
- data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
- data/doc/classes/Classifier/WordHash.html +0 -178
- data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
- data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
- data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,119 +0,0 @@
|
|
1
|
-
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
-
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
-
# License:: LGPL
|
4
|
-
|
5
|
-
module Classifier
|
6
|
-
|
7
|
-
# This module is mixed into String to provide convenience
|
8
|
-
# methods for the Classifier package.
|
9
|
-
module WordHash
|
10
|
-
|
11
|
-
# Removes common punctuation symbols, returning a new string. E.g.,
|
12
|
-
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
13
|
-
# => "Hello  greetings   with  braces         "
|
14
|
-
def without_punctuation
|
15
|
-
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
16
|
-
end
|
17
|
-
|
18
|
-
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
19
|
-
# interned, and mapped to its frequency in the document.
|
20
|
-
def word_hash
|
21
|
-
d = Hash.new
|
22
|
-
corpus = without_punctuation
|
23
|
-
(corpus.split + gsub(/[\w+]/,"").split).each do |word|
|
24
|
-
item = word.downcase
|
25
|
-
key = item.stem.intern
|
26
|
-
if !(word =~ /[\w+]/) || word.length > 2
|
27
|
-
d[key] ||= 0
|
28
|
-
d[key] += 1
|
29
|
-
end unless CORPUS_SKIP_WORDS[item]
|
30
|
-
end
|
31
|
-
return d
|
32
|
-
end
|
33
|
-
|
34
|
-
private
|
35
|
-
CORPUS_SKIP_WORDS = {
|
36
|
-
"a" => 1,
|
37
|
-
"again" => 1,
|
38
|
-
"all" => 1,
|
39
|
-
"along" => 1,
|
40
|
-
"are" => 1,
|
41
|
-
"also" => 1,
|
42
|
-
"an" => 1,
|
43
|
-
"and" => 1,
|
44
|
-
"as" => 1,
|
45
|
-
"at" => 1,
|
46
|
-
"but" => 1,
|
47
|
-
"by" => 1,
|
48
|
-
"came" => 1,
|
49
|
-
"can" => 1,
|
50
|
-
"cant" => 1,
|
51
|
-
"couldnt" => 1,
|
52
|
-
"did" => 1,
|
53
|
-
"didn" => 1,
|
54
|
-
"didnt" => 1,
|
55
|
-
"do" => 1,
|
56
|
-
"doesnt" => 1,
|
57
|
-
"dont" => 1,
|
58
|
-
"ever" => 1,
|
59
|
-
"first" => 1,
|
60
|
-
"from" => 1,
|
61
|
-
"have" => 1,
|
62
|
-
"her" => 1,
|
63
|
-
"here" => 1,
|
64
|
-
"him" => 1,
|
65
|
-
"how" => 1,
|
66
|
-
"i" => 1,
|
67
|
-
"if" => 1,
|
68
|
-
"in" => 1,
|
69
|
-
"into" => 1,
|
70
|
-
"is" => 1,
|
71
|
-
"isnt" => 1,
|
72
|
-
"it" => 1,
|
73
|
-
"itll" => 1,
|
74
|
-
"just" => 1,
|
75
|
-
"last" => 1,
|
76
|
-
"least" => 1,
|
77
|
-
"like" => 1,
|
78
|
-
"most" => 1,
|
79
|
-
"my" => 1,
|
80
|
-
"new" => 1,
|
81
|
-
"no" => 1,
|
82
|
-
"not" => 1,
|
83
|
-
"now" => 1,
|
84
|
-
"of" => 1,
|
85
|
-
"on" => 1,
|
86
|
-
"or" => 1,
|
87
|
-
"should" => 1,
|
88
|
-
"sinc" => 1,
|
89
|
-
"so" => 1,
|
90
|
-
"some" => 1,
|
91
|
-
"th" => 1,
|
92
|
-
"than" => 1,
|
93
|
-
"this" => 1,
|
94
|
-
"that" => 1,
|
95
|
-
"the" => 1,
|
96
|
-
"their" => 1,
|
97
|
-
"then" => 1,
|
98
|
-
"those" => 1,
|
99
|
-
"to" => 1,
|
100
|
-
"told" => 1,
|
101
|
-
"too" => 1,
|
102
|
-
"true" => 1,
|
103
|
-
"try" => 1,
|
104
|
-
"until" => 1,
|
105
|
-
"url" => 1,
|
106
|
-
"us" => 1,
|
107
|
-
"were" => 1,
|
108
|
-
"when" => 1,
|
109
|
-
"whether" => 1,
|
110
|
-
"while" => 1,
|
111
|
-
"with" => 1,
|
112
|
-
"within" => 1,
|
113
|
-
"yes" => 1,
|
114
|
-
"you" => 1,
|
115
|
-
"youll" => 1,
|
116
|
-
}
|
117
|
-
end
|
118
|
-
|
119
|
-
end
|