groupie 0.1.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/groupie.gemspec +1 -1
- data/lib/groupie.rb +28 -7
- data/spec/groupie_spec.rb +55 -0
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.2.2
|
data/groupie.gemspec
CHANGED
data/lib/groupie.rb
CHANGED
@@ -12,39 +12,60 @@ class Groupie
|
|
12
12
|
@groups[group] ||= Group.new(group)
|
13
13
|
end
|
14
14
|
|
15
|
-
def classify(entry)
|
15
|
+
def classify(entry, strategy=:sum)
|
16
16
|
results = {}
|
17
17
|
total_count = @groups.inject(0) do |sum, name_group|
|
18
18
|
group = name_group.last
|
19
|
-
|
19
|
+
count = group.count(entry)
|
20
|
+
if strategy==:sum
|
21
|
+
sum += count
|
22
|
+
elsif strategy==:sqrt
|
23
|
+
sum += Math::sqrt(count)
|
24
|
+
elsif strategy==:log
|
25
|
+
sum += Math::log10(count) if count > 0
|
26
|
+
else
|
27
|
+
raise "Invalid strategy: #{strategy}"
|
28
|
+
end
|
29
|
+
next sum
|
20
30
|
end
|
21
31
|
return results if 0 == total_count
|
22
32
|
|
23
33
|
@groups.each do |name, group|
|
24
34
|
count = group.count(entry)
|
35
|
+
if strategy==:sum
|
36
|
+
# keep count
|
37
|
+
elsif strategy==:sqrt
|
38
|
+
count = Math::sqrt(count)
|
39
|
+
elsif strategy==:log
|
40
|
+
count = Math::log10(count) if count > 0
|
41
|
+
else
|
42
|
+
raise "Invalid strategy: #{strategy}"
|
43
|
+
end
|
25
44
|
results[name] = count > 0 ? count.to_f / total_count : 0.0
|
26
45
|
end
|
27
46
|
return results
|
28
47
|
end
|
29
48
|
|
30
49
|
# Classify a text by taking the average of all word classifications.
|
31
|
-
def classify_text(words)
|
50
|
+
def classify_text(words, strategy=:sum)
|
51
|
+
hits = 0
|
32
52
|
group_score_sums = words.inject({}) do |results, word|
|
33
|
-
word_results = classify(word)
|
53
|
+
word_results = classify(word, strategy)
|
54
|
+
next results if word_results.empty?
|
55
|
+
hits += 1
|
34
56
|
results.merge(word_results) do |key, old, new|
|
35
57
|
old + new
|
36
58
|
end
|
37
59
|
end
|
38
60
|
|
39
|
-
words_count = words.size.to_f
|
40
61
|
averages={}
|
41
62
|
group_score_sums.each do |group, sum|
|
42
|
-
averages[group] = sum / words_count
|
63
|
+
averages[group] = hits > 0 ? sum / hits : 0
|
43
64
|
end
|
44
65
|
|
45
66
|
averages
|
46
67
|
end
|
47
|
-
|
68
|
+
|
48
69
|
def self.version
|
49
70
|
File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
|
50
71
|
end
|
data/spec/groupie_spec.rb
CHANGED
@@ -45,7 +45,37 @@ describe Groupie do
|
|
45
45
|
c2 = g.classify('user')
|
46
46
|
c2[:ham].should > c2[:spam]
|
47
47
|
end
|
48
|
+
|
49
|
+
describe "strategies" do
|
50
|
+
describe "sum" do
|
51
|
+
it "should weigh words for the sum of their occurances" do
|
52
|
+
g = Groupie.new
|
53
|
+
g[:spam].add %w[word] * 9
|
54
|
+
g[:ham].add %w[word]
|
55
|
+
g.classify('word', :sum).should == {:spam=>0.9, :ham=>0.1}
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "sqrt" do
|
60
|
+
it "should weigh words for the square root of the sum of ocurances" do
|
61
|
+
g = Groupie.new
|
62
|
+
g[:spam].add %w[word] * 9
|
63
|
+
g[:ham].add %w[word]
|
64
|
+
g.classify('word', :sqrt).should == {:spam=>0.75, :ham=>0.25}
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe "log" do
|
69
|
+
it "should weigh words for log10 of their sum of occurances" do
|
70
|
+
g = Groupie.new
|
71
|
+
g[:spam].add %w[word] * 1000
|
72
|
+
g[:ham].add %w[word] * 10
|
73
|
+
g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
48
77
|
end
|
78
|
+
|
49
79
|
context "classify_text" do
|
50
80
|
it 'should tokenized html emails' do
|
51
81
|
g = Groupie.new
|
@@ -71,5 +101,30 @@ describe Groupie do
|
|
71
101
|
result2 = g.classify_text "Grow flowers to give to your mom".tokenize
|
72
102
|
result2[:ham].should == result2[:spam]
|
73
103
|
end
|
104
|
+
|
105
|
+
it "should skip unknown tokens" do
|
106
|
+
g = Groupie.new
|
107
|
+
g[:spam].add %w[buy viagra now]
|
108
|
+
g[:ham].add %w[buy flowers now]
|
109
|
+
g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
|
110
|
+
end
|
111
|
+
|
112
|
+
it "should support the sqrt strategy" do
|
113
|
+
g = Groupie.new
|
114
|
+
g[:spam].add %w[one] * 9
|
115
|
+
g[:ham].add %w[one]
|
116
|
+
g[:spam].add %w[two] * 9
|
117
|
+
g[:ham].add %w[two]
|
118
|
+
g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
|
119
|
+
end
|
120
|
+
|
121
|
+
it "should support the log strategy" do
|
122
|
+
g = Groupie.new
|
123
|
+
g[:spam].add %w[one] * 100
|
124
|
+
g[:ham].add %w[one]
|
125
|
+
g[:spam].add %w[two]
|
126
|
+
g[:ham].add %w[two] * 100
|
127
|
+
g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
|
128
|
+
end
|
74
129
|
end
|
75
130
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: groupie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 1
|
9
|
-
- 1
|
10
|
-
version: 0.1.1
|
8
|
+
- 2
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Wes Oldenbeuving
|