groupie 0.1.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.2
data/groupie.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{groupie}
8
- s.version = "0.1.1"
8
+ s.version = "0.2.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Wes Oldenbeuving"]
data/lib/groupie.rb CHANGED
@@ -12,39 +12,60 @@ class Groupie
12
12
  @groups[group] ||= Group.new(group)
13
13
  end
14
14
 
15
- def classify(entry)
15
+ def classify(entry, strategy=:sum)
16
16
  results = {}
17
17
  total_count = @groups.inject(0) do |sum, name_group|
18
18
  group = name_group.last
19
- sum + group.count(entry)
19
+ count = group.count(entry)
20
+ if strategy==:sum
21
+ sum += count
22
+ elsif strategy==:sqrt
23
+ sum += Math::sqrt(count)
24
+ elsif strategy==:log
25
+ sum += Math::log10(count) if count > 0
26
+ else
27
+ raise "Invalid strategy: #{strategy}"
28
+ end
29
+ next sum
20
30
  end
21
31
  return results if 0 == total_count
22
32
 
23
33
  @groups.each do |name, group|
24
34
  count = group.count(entry)
35
+ if strategy==:sum
36
+ # keep count
37
+ elsif strategy==:sqrt
38
+ count = Math::sqrt(count)
39
+ elsif strategy==:log
40
+ count = Math::log10(count) if count > 0
41
+ else
42
+ raise "Invalid strategy: #{strategy}"
43
+ end
25
44
  results[name] = count > 0 ? count.to_f / total_count : 0.0
26
45
  end
27
46
  return results
28
47
  end
29
48
 
30
49
  # Classify a text by taking the average of all word classifications.
31
- def classify_text(words)
50
+ def classify_text(words, strategy=:sum)
51
+ hits = 0
32
52
  group_score_sums = words.inject({}) do |results, word|
33
- word_results = classify(word)
53
+ word_results = classify(word, strategy)
54
+ next results if word_results.empty?
55
+ hits += 1
34
56
  results.merge(word_results) do |key, old, new|
35
57
  old + new
36
58
  end
37
59
  end
38
60
 
39
- words_count = words.size.to_f
40
61
  averages={}
41
62
  group_score_sums.each do |group, sum|
42
- averages[group] = sum / words_count
63
+ averages[group] = hits > 0 ? sum / hits : 0
43
64
  end
44
65
 
45
66
  averages
46
67
  end
47
-
68
+
48
69
  def self.version
49
70
  File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
50
71
  end
data/spec/groupie_spec.rb CHANGED
@@ -45,7 +45,37 @@ describe Groupie do
45
45
  c2 = g.classify('user')
46
46
  c2[:ham].should > c2[:spam]
47
47
  end
48
+
49
+ describe "strategies" do
50
+ describe "sum" do
51
+ it "should weigh words for the sum of their occurances" do
52
+ g = Groupie.new
53
+ g[:spam].add %w[word] * 9
54
+ g[:ham].add %w[word]
55
+ g.classify('word', :sum).should == {:spam=>0.9, :ham=>0.1}
56
+ end
57
+ end
58
+
59
+ describe "sqrt" do
60
+ it "should weigh words for the square root of the sum of ocurances" do
61
+ g = Groupie.new
62
+ g[:spam].add %w[word] * 9
63
+ g[:ham].add %w[word]
64
+ g.classify('word', :sqrt).should == {:spam=>0.75, :ham=>0.25}
65
+ end
66
+ end
67
+
68
+ describe "log" do
69
+ it "should weigh words for log10 of their sum of occurances" do
70
+ g = Groupie.new
71
+ g[:spam].add %w[word] * 1000
72
+ g[:ham].add %w[word] * 10
73
+ g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
74
+ end
75
+ end
76
+ end
48
77
  end
78
+
49
79
  context "classify_text" do
50
80
  it 'should tokenized html emails' do
51
81
  g = Groupie.new
@@ -71,5 +101,30 @@ describe Groupie do
71
101
  result2 = g.classify_text "Grow flowers to give to your mom".tokenize
72
102
  result2[:ham].should == result2[:spam]
73
103
  end
104
+
105
+ it "should skip unknown tokens" do
106
+ g = Groupie.new
107
+ g[:spam].add %w[buy viagra now]
108
+ g[:ham].add %w[buy flowers now]
109
+ g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
110
+ end
111
+
112
+ it "should support the sqrt strategy" do
113
+ g = Groupie.new
114
+ g[:spam].add %w[one] * 9
115
+ g[:ham].add %w[one]
116
+ g[:spam].add %w[two] * 9
117
+ g[:ham].add %w[two]
118
+ g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
119
+ end
120
+
121
+ it "should support the log strategy" do
122
+ g = Groupie.new
123
+ g[:spam].add %w[one] * 100
124
+ g[:ham].add %w[one]
125
+ g[:spam].add %w[two]
126
+ g[:ham].add %w[two] * 100
127
+ g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
128
+ end
74
129
  end
75
130
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: groupie
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
8
+ - 2
9
+ - 2
10
+ version: 0.2.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Wes Oldenbeuving