groupie 0.1.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.2
data/groupie.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{groupie}
8
- s.version = "0.1.1"
8
+ s.version = "0.2.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Wes Oldenbeuving"]
data/lib/groupie.rb CHANGED
@@ -12,39 +12,60 @@ class Groupie
12
12
  @groups[group] ||= Group.new(group)
13
13
  end
14
14
 
15
- def classify(entry)
15
+ def classify(entry, strategy=:sum)
16
16
  results = {}
17
17
  total_count = @groups.inject(0) do |sum, name_group|
18
18
  group = name_group.last
19
- sum + group.count(entry)
19
+ count = group.count(entry)
20
+ if strategy==:sum
21
+ sum += count
22
+ elsif strategy==:sqrt
23
+ sum += Math::sqrt(count)
24
+ elsif strategy==:log
25
+ sum += Math::log10(count) if count > 0
26
+ else
27
+ raise "Invalid strategy: #{strategy}"
28
+ end
29
+ next sum
20
30
  end
21
31
  return results if 0 == total_count
22
32
 
23
33
  @groups.each do |name, group|
24
34
  count = group.count(entry)
35
+ if strategy==:sum
36
+ # keep count
37
+ elsif strategy==:sqrt
38
+ count = Math::sqrt(count)
39
+ elsif strategy==:log
40
+ count = Math::log10(count) if count > 0
41
+ else
42
+ raise "Invalid strategy: #{strategy}"
43
+ end
25
44
  results[name] = count > 0 ? count.to_f / total_count : 0.0
26
45
  end
27
46
  return results
28
47
  end
29
48
 
30
49
  # Classify a text by taking the average of its word classifications, skipping words for which classify returns no result.
31
- def classify_text(words)
50
+ def classify_text(words, strategy=:sum)
51
+ hits = 0
32
52
  group_score_sums = words.inject({}) do |results, word|
33
- word_results = classify(word)
53
+ word_results = classify(word, strategy)
54
+ next results if word_results.empty?
55
+ hits += 1
34
56
  results.merge(word_results) do |key, old, new|
35
57
  old + new
36
58
  end
37
59
  end
38
60
 
39
- words_count = words.size.to_f
40
61
  averages={}
41
62
  group_score_sums.each do |group, sum|
42
- averages[group] = sum / words_count
63
+ averages[group] = hits > 0 ? sum / hits : 0
43
64
  end
44
65
 
45
66
  averages
46
67
  end
47
-
68
+
48
69
  def self.version
49
70
  File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
50
71
  end
data/spec/groupie_spec.rb CHANGED
@@ -45,7 +45,37 @@ describe Groupie do
45
45
  c2 = g.classify('user')
46
46
  c2[:ham].should > c2[:spam]
47
47
  end
48
+
49
+ describe "strategies" do
50
+ describe "sum" do
51
+ it "should weigh words for the sum of their occurances" do
52
+ g = Groupie.new
53
+ g[:spam].add %w[word] * 9
54
+ g[:ham].add %w[word]
55
+ g.classify('word', :sum).should == {:spam=>0.9, :ham=>0.1}
56
+ end
57
+ end
58
+
59
+ describe "sqrt" do
60
+ it "should weigh words for the square root of the sum of ocurances" do
61
+ g = Groupie.new
62
+ g[:spam].add %w[word] * 9
63
+ g[:ham].add %w[word]
64
+ g.classify('word', :sqrt).should == {:spam=>0.75, :ham=>0.25}
65
+ end
66
+ end
67
+
68
+ describe "log" do
69
+ it "should weigh words for log10 of their sum of occurances" do
70
+ g = Groupie.new
71
+ g[:spam].add %w[word] * 1000
72
+ g[:ham].add %w[word] * 10
73
+ g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
74
+ end
75
+ end
76
+ end
48
77
  end
78
+
49
79
  context "classify_text" do
50
80
  it 'should tokenized html emails' do
51
81
  g = Groupie.new
@@ -71,5 +101,30 @@ describe Groupie do
71
101
  result2 = g.classify_text "Grow flowers to give to your mom".tokenize
72
102
  result2[:ham].should == result2[:spam]
73
103
  end
104
+
105
+ it "should skip unknown tokens" do
106
+ g = Groupie.new
107
+ g[:spam].add %w[buy viagra now]
108
+ g[:ham].add %w[buy flowers now]
109
+ g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
110
+ end
111
+
112
+ it "should support the sqrt strategy" do
113
+ g = Groupie.new
114
+ g[:spam].add %w[one] * 9
115
+ g[:ham].add %w[one]
116
+ g[:spam].add %w[two] * 9
117
+ g[:ham].add %w[two]
118
+ g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
119
+ end
120
+
121
+ it "should support the log strategy" do
122
+ g = Groupie.new
123
+ g[:spam].add %w[one] * 100
124
+ g[:ham].add %w[one]
125
+ g[:spam].add %w[two]
126
+ g[:ham].add %w[two] * 100
127
+ g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
128
+ end
74
129
  end
75
130
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: groupie
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
8
+ - 2
9
+ - 2
10
+ version: 0.2.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Wes Oldenbeuving