groupie 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ *.gemspec
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.3.0
@@ -12,16 +12,26 @@ class Groupie
12
12
  @groups[group] ||= Group.new(group)
13
13
  end
14
14
 
15
def unique_words
  # Returns the words whose combined count over all groups is at or below
  # the frequency found roughly three quarters of the way through the
  # sorted list of per-word totals, i.e. the most common words are left out.
  #
  # The result is memoized; nothing in this method resets the cache, so
  # words added to a group after the first call are not reflected.
  @unique_words ||= begin
    # Seed the fold with an empty Hash so that having no groups at all
    # yields an empty result instead of calling methods on nil.
    total_count = @groups.values.map { |group| group.word_counts }.inject({}) do |total, counts|
      total.merge(counts) { |_word, current, added| current + added }
    end
    frequencies = total_count.values.sort

    if frequencies.empty?
      []
    else
      # Keep the original lower bound of 1, but never index past the end:
      # with a single distinct word the old index of 1 returned nil and the
      # comparison below raised.
      cutoff_index = [[frequencies.size * 3 / 4 - 1, 1].max, frequencies.size - 1].min
      cutoff = frequencies[cutoff_index]
      total_count.keys.select { |word| total_count[word] <= cutoff }
    end
  end
end
23
+
15
24
  def classify(entry, strategy=:sum)
16
25
  results = {}
17
26
  total_count = @groups.inject(0) do |sum, name_group|
18
27
  group = name_group.last
19
28
  count = group.count(entry)
20
- if strategy==:sum
29
+ case strategy
30
+ when :sum
21
31
  sum += count
22
- elsif strategy==:sqrt
32
+ when :sqrt, :unique
23
33
  sum += Math::sqrt(count)
24
- elsif strategy==:log
34
+ when :log
25
35
  sum += Math::log10(count) if count > 0
26
36
  else
27
37
  raise "Invalid strategy: #{strategy}"
@@ -32,11 +42,12 @@ class Groupie
32
42
 
33
43
  @groups.each do |name, group|
34
44
  count = group.count(entry)
35
- if strategy==:sum
45
+ case strategy
46
+ when :sum
36
47
  # keep count
37
- elsif strategy==:sqrt
48
+ when :sqrt, :unique
38
49
  count = Math::sqrt(count)
39
- elsif strategy==:log
50
+ when :log
40
51
  count = Math::log10(count) if count > 0
41
52
  else
42
53
  raise "Invalid strategy: #{strategy}"
@@ -49,6 +60,9 @@ class Groupie
49
60
  # Classify a text by taking the average of all word classifications.
50
61
  def classify_text(words, strategy=:sum)
51
62
  hits = 0
63
+ if strategy==:unique
64
+ words = words & unique_words
65
+ end
52
66
  group_score_sums = words.inject({}) do |results, word|
53
67
  word_results = classify(word, strategy)
54
68
  next results if word_results.empty?
@@ -1,10 +1,11 @@
1
1
  class Groupie
2
2
  class Group
3
+ attr_reader :word_counts
3
4
  def initialize(name)
4
5
  @name = name
5
6
  @word_counts = {}
6
7
  end
7
-
8
+
8
9
  def words
9
10
  @word_counts.keys
10
11
  end
@@ -16,6 +17,7 @@ class Groupie
16
17
  end
17
18
  nil
18
19
  end
20
+ alias << add
19
21
 
20
22
  # Return the count for a specific +word+.
21
23
  def count(word)
@@ -4,11 +4,7 @@ Groupie is a simple way to group texts and classify new texts as being a likely
4
4
 
5
5
  The eventual goal is to have Groupie work as a sort of Bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
6
6
 
7
- == Goals
8
-
9
- Groupie is a 'fun' project that has the following goals, in descending order of importance:
10
- * Have fun playing with code
11
- * Play with Bayesian-like (spam) filtering
7
+ Started in 2009 as a short-lived experiment and then forgotten, Groupie got new features in 2010 when I started using it in an RSS reader project that classified news items into "Interesting" and "Not interesting" categories.
12
8
 
13
9
  == Current functionality
14
10
 
@@ -19,6 +15,7 @@ Current funcionality includes:
19
15
  * Add texts (as an Array of Strings) to any number of groups.
20
16
  * Classify a single word to check the likelihood it belongs to each group.
21
17
  * Do classification for complete (tokenized) texts.
18
+ * Pick a classification strategy to weigh repeated words differently (weigh by the sum, square root or log10 of a word's count in each group)
22
19
 
23
20
  == License
24
21
 
@@ -2,6 +2,31 @@ require File.join(File.dirname(__FILE__), %w[.. spec_helper])
2
2
  require 'yaml'
3
3
 
4
4
  describe Groupie::Group do
5
+ describe "add" do
6
+ before(:each) do
7
+ @group = Groupie::Group.new("test")
8
+ end
9
+
10
+ it "should accept a single string" do
11
+ @group.add "bla"
12
+ @group.words.should == %w[bla]
13
+ end
14
+
15
+ it "should accept an Array of strings" do
16
+ @group.add ["bla", "bla2"]
17
+ @group.words.should == %w[bla bla2]
18
+ end
19
+
20
+ it "should accept multiple strings" do
21
+ @group.add "bla", "bla2"
22
+ @group.words.should == %w[bla bla2]
23
+ end
24
+
25
+ it "should be aliased as <<" do
26
+ @group.method(:add).should == @group.method(:<<)
27
+ end
28
+ end
29
+
5
30
  it "can be serialized and loaded through YAML" do
6
31
  group = Groupie::Group.new 'group'
7
32
  group.add %w[buy flowers]
@@ -73,6 +73,32 @@ describe Groupie do
73
73
  g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
74
74
  end
75
75
  end
76
+
77
+ describe "unique" do
78
+ it "should should behave as sqrt strategy" do
79
+ g = Groupie.new
80
+ g[:spam].add %w[buy viagra now]
81
+ g[:ham].add %w[buy flowers now]
82
+ g.classify('buy', :unique).should == g.classify('buy', :sqrt)
83
+ g.classify('flowers', :unique).should == g.classify('flowers', :sqrt)
84
+ end
85
+ end
86
+ end
87
+ end
88
+
89
+ describe "unique_words" do
90
+ it "should exclude all words in the 4th quintile of all groups" do
91
+ g = Groupie.new
92
+ g[:spam].add %w[one two two three three three four four four four]
93
+ g[:ham].add %w[apple banana pear orange three]
94
+ g.unique_words.sort.should == %w[one two apple banana pear orange].sort
95
+ end
96
+
97
+ it "should work on an empty word set" do
98
+ g = Groupie.new
99
+ g[:spam].add []
100
+ g[:ham].add []
101
+ g.unique_words.should == []
76
102
  end
77
103
  end
78
104
 
@@ -108,7 +134,7 @@ describe Groupie do
108
134
  g[:ham].add %w[buy flowers now]
109
135
  g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
110
136
  end
111
-
137
+
112
138
  it "should support the sqrt strategy" do
113
139
  g = Groupie.new
114
140
  g[:spam].add %w[one] * 9
@@ -117,7 +143,7 @@ describe Groupie do
117
143
  g[:ham].add %w[two]
118
144
  g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
119
145
  end
120
-
146
+
121
147
  it "should support the log strategy" do
122
148
  g = Groupie.new
123
149
  g[:spam].add %w[one] * 100
@@ -126,5 +152,12 @@ describe Groupie do
126
152
  g[:ham].add %w[two] * 100
127
153
  g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
128
154
  end
155
+
156
+ it "should only rate unique words for the unique strategy" do
157
+ g = Groupie.new
158
+ g[:spam].add %w[one two two three three three four four four four]
159
+ g[:ham].add %w[apple banana pear]
160
+ g.classify_text(%w[one two three apple banana], :unique).should == {:spam=>0.5, :ham=>0.5}
161
+ end
129
162
  end
130
163
  end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 2
10
- version: 0.2.2
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Wes Oldenbeuving
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-25 00:00:00 +02:00
18
+ date: 2010-07-29 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -42,10 +42,10 @@ extra_rdoc_files:
42
42
  - LICENSE
43
43
  files:
44
44
  - .document
45
+ - .gitignore
45
46
  - LICENSE
46
47
  - Rakefile
47
48
  - VERSION
48
- - groupie.gemspec
49
49
  - lib/groupie.rb
50
50
  - lib/groupie/core_ext/string.rb
51
51
  - lib/groupie/group.rb
@@ -1,63 +0,0 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
- # -*- encoding: utf-8 -*-
5
-
6
- Gem::Specification.new do |s|
7
- s.name = %q{groupie}
8
- s.version = "0.2.2"
9
-
10
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Wes Oldenbeuving"]
12
- s.date = %q{2010-07-25}
13
- s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
14
- s.email = %q{narnach@gmail.com}
15
- s.extra_rdoc_files = [
16
- "LICENSE"
17
- ]
18
- s.files = [
19
- ".document",
20
- "LICENSE",
21
- "Rakefile",
22
- "VERSION",
23
- "groupie.gemspec",
24
- "lib/groupie.rb",
25
- "lib/groupie/core_ext/string.rb",
26
- "lib/groupie/group.rb",
27
- "readme.rdoc",
28
- "spec/fixtures/ham/email_ham1.txt",
29
- "spec/fixtures/ham/spam.la-44116217.txt",
30
- "spec/fixtures/spam/email_spam1.txt",
31
- "spec/fixtures/spam/email_spam2.txt",
32
- "spec/fixtures/spam/spam.la-44118014.txt",
33
- "spec/groupie/core_ext/string_spec.rb",
34
- "spec/groupie/group_spec.rb",
35
- "spec/groupie_spec.rb",
36
- "spec/spec_helper.rb"
37
- ]
38
- s.homepage = %q{http://github.com/Narnach/groupie}
39
- s.rdoc_options = ["--charset=UTF-8"]
40
- s.require_paths = ["lib"]
41
- s.rubygems_version = %q{1.3.7}
42
- s.summary = %q{Group and classify text}
43
- s.test_files = [
44
- "spec/groupie/core_ext/string_spec.rb",
45
- "spec/groupie/group_spec.rb",
46
- "spec/groupie_spec.rb",
47
- "spec/spec_helper.rb"
48
- ]
49
-
50
- if s.respond_to? :specification_version then
51
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
52
- s.specification_version = 3
53
-
54
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
55
- s.add_development_dependency(%q<testy>, [">= 0"])
56
- else
57
- s.add_dependency(%q<testy>, [">= 0"])
58
- end
59
- else
60
- s.add_dependency(%q<testy>, [">= 0"])
61
- end
62
- end
63
-