groupie 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ *.gemspec
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.3.0
@@ -12,16 +12,26 @@ class Groupie
12
12
  @groups[group] ||= Group.new(group)
13
13
  end
14
14
 
15
# Returns the words that count as "unique": those whose combined count across
# all groups is at or below the count found three quarters of the way through
# the sorted list of per-word totals.
#
# The result is memoized in @unique_words. Nothing in this method resets the
# memo, so words added to a group after the first call are not reflected.
#
# Returns an Array of words; empty when there are no groups or no words.
def unique_words
  @unique_words ||= begin
    # Sum every word's count over all groups.
    total_count = Hash.new(0)
    @groups.each_value do |group|
      group.word_counts.each { |word, count| total_count[word] += count }
    end

    if total_count.empty?
      []
    else
      frequencies = total_count.values.sort
      # Three-quarter position, never below index 1 (as before), but clamped
      # to the last valid index so a single distinct word no longer yields a
      # nil cutoff and a failing comparison.
      cutoff_index = [[frequencies.size * 3 / 4 - 1, 1].max, frequencies.size - 1].min
      cutoff_frequency = frequencies[cutoff_index]
      total_count.select { |_word, count| count <= cutoff_frequency }.map(&:first)
    end
  end
end
23
+
15
24
  def classify(entry, strategy=:sum)
16
25
  results = {}
17
26
  total_count = @groups.inject(0) do |sum, name_group|
18
27
  group = name_group.last
19
28
  count = group.count(entry)
20
- if strategy==:sum
29
+ case strategy
30
+ when :sum
21
31
  sum += count
22
- elsif strategy==:sqrt
32
+ when :sqrt, :unique
23
33
  sum += Math::sqrt(count)
24
- elsif strategy==:log
34
+ when :log
25
35
  sum += Math::log10(count) if count > 0
26
36
  else
27
37
  raise "Invalid strategy: #{strategy}"
@@ -32,11 +42,12 @@ class Groupie
32
42
 
33
43
  @groups.each do |name, group|
34
44
  count = group.count(entry)
35
- if strategy==:sum
45
+ case strategy
46
+ when :sum
36
47
  # keep count
37
- elsif strategy==:sqrt
48
+ when :sqrt, :unique
38
49
  count = Math::sqrt(count)
39
- elsif strategy==:log
50
+ when :log
40
51
  count = Math::log10(count) if count > 0
41
52
  else
42
53
  raise "Invalid strategy: #{strategy}"
@@ -49,6 +60,9 @@ class Groupie
49
60
  # Classify a text by taking the average of all word classifications.
50
61
  def classify_text(words, strategy=:sum)
51
62
  hits = 0
63
+ if strategy==:unique
64
+ words = words & unique_words
65
+ end
52
66
  group_score_sums = words.inject({}) do |results, word|
53
67
  word_results = classify(word, strategy)
54
68
  next results if word_results.empty?
@@ -1,10 +1,11 @@
1
1
  class Groupie
2
2
  class Group
3
+ attr_reader :word_counts
3
4
  def initialize(name)
4
5
  @name = name
5
6
  @word_counts = {}
6
7
  end
7
-
8
+
8
9
  def words
9
10
  @word_counts.keys
10
11
  end
@@ -16,6 +17,7 @@ class Groupie
16
17
  end
17
18
  nil
18
19
  end
20
+ alias << add
19
21
 
20
22
  # Return the count for a specific +word+.
21
23
  def count(word)
@@ -4,11 +4,7 @@ Groupie is a simple way to group texts and classify new texts as being a likely
4
4
 
5
5
  The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
6
6
 
7
- == Goals
8
-
9
- Groupie is a 'fun' project that has the following goals, in descending order of importance:
10
- * Have fun playing with code
11
- * Play with Bayesian-like (spam) filtering
7
+ Started and forgotten in 2009 as a short-lived experiment, Groupie got new features in 2010 when I started using it on an RSS reader project that classified news items into "Interesting" and "Not interesting" categories.
12
8
 
13
9
  == Current functionality
14
10
 
@@ -19,6 +15,7 @@ Current functionality includes:
19
15
  * Add texts (as an Array of Strings) to any number of groups.
20
16
  * Classify a single word to check the likelihood it belongs to each group.
21
17
  * Do classification for complete (tokenized) texts.
18
+ * Pick a classification strategy to weigh repeated words differently (weigh by the sum, square root or log10 of a word's count in each group).
22
19
 
23
20
  == License
24
21
 
@@ -2,6 +2,31 @@ require File.join(File.dirname(__FILE__), %w[.. spec_helper])
2
2
  require 'yaml'
3
3
 
4
4
  describe Groupie::Group do
5
+ describe "add" do
6
+ before(:each) do
7
+ @group = Groupie::Group.new("test")
8
+ end
9
+
10
+ it "should accept a single string" do
11
+ @group.add "bla"
12
+ @group.words.should == %w[bla]
13
+ end
14
+
15
+ it "should accept an Array of strings" do
16
+ @group.add ["bla", "bla2"]
17
+ @group.words.should == %w[bla bla2]
18
+ end
19
+
20
+ it "should accept multiple strings" do
21
+ @group.add "bla", "bla2"
22
+ @group.words.should == %w[bla bla2]
23
+ end
24
+
25
+ it "should be aliased as <<" do
26
+ @group.method(:add).should == @group.method(:<<)
27
+ end
28
+ end
29
+
5
30
  it "can be serialized and loaded through YAML" do
6
31
  group = Groupie::Group.new 'group'
7
32
  group.add %w[buy flowers]
@@ -73,6 +73,32 @@ describe Groupie do
73
73
  g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
74
74
  end
75
75
  end
76
+
77
+ describe "unique" do
78
+ it "should should behave as sqrt strategy" do
79
+ g = Groupie.new
80
+ g[:spam].add %w[buy viagra now]
81
+ g[:ham].add %w[buy flowers now]
82
+ g.classify('buy', :unique).should == g.classify('buy', :sqrt)
83
+ g.classify('flowers', :unique).should == g.classify('flowers', :sqrt)
84
+ end
85
+ end
86
+ end
87
+ end
88
+
89
+ describe "unique_words" do
90
+ it "should exclude all words in the 4th quintile of all groups" do
91
+ g = Groupie.new
92
+ g[:spam].add %w[one two two three three three four four four four]
93
+ g[:ham].add %w[apple banana pear orange three]
94
+ g.unique_words.sort.should == %w[one two apple banana pear orange].sort
95
+ end
96
+
97
+ it "should work on an empty word set" do
98
+ g = Groupie.new
99
+ g[:spam].add []
100
+ g[:ham].add []
101
+ g.unique_words.should == []
76
102
  end
77
103
  end
78
104
 
@@ -108,7 +134,7 @@ describe Groupie do
108
134
  g[:ham].add %w[buy flowers now]
109
135
  g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
110
136
  end
111
-
137
+
112
138
  it "should support the sqrt strategy" do
113
139
  g = Groupie.new
114
140
  g[:spam].add %w[one] * 9
@@ -117,7 +143,7 @@ describe Groupie do
117
143
  g[:ham].add %w[two]
118
144
  g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
119
145
  end
120
-
146
+
121
147
  it "should support the log strategy" do
122
148
  g = Groupie.new
123
149
  g[:spam].add %w[one] * 100
@@ -126,5 +152,12 @@ describe Groupie do
126
152
  g[:ham].add %w[two] * 100
127
153
  g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
128
154
  end
155
+
156
+ it "should only rate unique words for the unique strategy" do
157
+ g = Groupie.new
158
+ g[:spam].add %w[one two two three three three four four four four]
159
+ g[:ham].add %w[apple banana pear]
160
+ g.classify_text(%w[one two three apple banana], :unique).should == {:spam=>0.5, :ham=>0.5}
161
+ end
129
162
  end
130
163
  end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 2
10
- version: 0.2.2
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Wes Oldenbeuving
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-25 00:00:00 +02:00
18
+ date: 2010-07-29 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -42,10 +42,10 @@ extra_rdoc_files:
42
42
  - LICENSE
43
43
  files:
44
44
  - .document
45
+ - .gitignore
45
46
  - LICENSE
46
47
  - Rakefile
47
48
  - VERSION
48
- - groupie.gemspec
49
49
  - lib/groupie.rb
50
50
  - lib/groupie/core_ext/string.rb
51
51
  - lib/groupie/group.rb
@@ -1,63 +0,0 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
- # -*- encoding: utf-8 -*-
5
-
6
- Gem::Specification.new do |s|
7
- s.name = %q{groupie}
8
- s.version = "0.2.2"
9
-
10
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Wes Oldenbeuving"]
12
- s.date = %q{2010-07-25}
13
- s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
14
- s.email = %q{narnach@gmail.com}
15
- s.extra_rdoc_files = [
16
- "LICENSE"
17
- ]
18
- s.files = [
19
- ".document",
20
- "LICENSE",
21
- "Rakefile",
22
- "VERSION",
23
- "groupie.gemspec",
24
- "lib/groupie.rb",
25
- "lib/groupie/core_ext/string.rb",
26
- "lib/groupie/group.rb",
27
- "readme.rdoc",
28
- "spec/fixtures/ham/email_ham1.txt",
29
- "spec/fixtures/ham/spam.la-44116217.txt",
30
- "spec/fixtures/spam/email_spam1.txt",
31
- "spec/fixtures/spam/email_spam2.txt",
32
- "spec/fixtures/spam/spam.la-44118014.txt",
33
- "spec/groupie/core_ext/string_spec.rb",
34
- "spec/groupie/group_spec.rb",
35
- "spec/groupie_spec.rb",
36
- "spec/spec_helper.rb"
37
- ]
38
- s.homepage = %q{http://github.com/Narnach/groupie}
39
- s.rdoc_options = ["--charset=UTF-8"]
40
- s.require_paths = ["lib"]
41
- s.rubygems_version = %q{1.3.7}
42
- s.summary = %q{Group and classify text}
43
- s.test_files = [
44
- "spec/groupie/core_ext/string_spec.rb",
45
- "spec/groupie/group_spec.rb",
46
- "spec/groupie_spec.rb",
47
- "spec/spec_helper.rb"
48
- ]
49
-
50
- if s.respond_to? :specification_version then
51
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
52
- s.specification_version = 3
53
-
54
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
55
- s.add_development_dependency(%q<testy>, [">= 0"])
56
- else
57
- s.add_dependency(%q<testy>, [">= 0"])
58
- end
59
- else
60
- s.add_dependency(%q<testy>, [">= 0"])
61
- end
62
- end
63
-