groupie 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/VERSION +1 -1
- data/lib/groupie.rb +20 -6
- data/lib/groupie/group.rb +3 -1
- data/readme.rdoc +2 -5
- data/spec/groupie/group_spec.rb +25 -0
- data/spec/groupie_spec.rb +35 -2
- metadata +5 -5
- data/groupie.gemspec +0 -63
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.gemspec
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.3.0
|
data/lib/groupie.rb
CHANGED
@@ -12,16 +12,26 @@ class Groupie
|
|
12
12
|
@groups[group] ||= Group.new(group)
|
13
13
|
end
|
14
14
|
|
15
|
+
def unique_words
|
16
|
+
@unique_words ||= (
|
17
|
+
total_count = @groups.values.map {|group| group.word_counts}.inject{|total, counts| total.merge(counts){|key,o,n| o+n}}
|
18
|
+
median_index = [total_count.values.size * 3 / 4 - 1, 1].max
|
19
|
+
median_frequency = total_count.values.sort[median_index]
|
20
|
+
total_count.select{|word, count| count <= median_frequency}.map(&:first)
|
21
|
+
)
|
22
|
+
end
|
23
|
+
|
15
24
|
def classify(entry, strategy=:sum)
|
16
25
|
results = {}
|
17
26
|
total_count = @groups.inject(0) do |sum, name_group|
|
18
27
|
group = name_group.last
|
19
28
|
count = group.count(entry)
|
20
|
-
|
29
|
+
case strategy
|
30
|
+
when :sum
|
21
31
|
sum += count
|
22
|
-
|
32
|
+
when :sqrt, :unique
|
23
33
|
sum += Math::sqrt(count)
|
24
|
-
|
34
|
+
when :log
|
25
35
|
sum += Math::log10(count) if count > 0
|
26
36
|
else
|
27
37
|
raise "Invalid strategy: #{strategy}"
|
@@ -32,11 +42,12 @@ class Groupie
|
|
32
42
|
|
33
43
|
@groups.each do |name, group|
|
34
44
|
count = group.count(entry)
|
35
|
-
|
45
|
+
case strategy
|
46
|
+
when :sum
|
36
47
|
# keep count
|
37
|
-
|
48
|
+
when :sqrt, :unique
|
38
49
|
count = Math::sqrt(count)
|
39
|
-
|
50
|
+
when :log
|
40
51
|
count = Math::log10(count) if count > 0
|
41
52
|
else
|
42
53
|
raise "Invalid strategy: #{strategy}"
|
@@ -49,6 +60,9 @@ class Groupie
|
|
49
60
|
# Classify a text by taking the average of all word classifications.
|
50
61
|
def classify_text(words, strategy=:sum)
|
51
62
|
hits = 0
|
63
|
+
if strategy==:unique
|
64
|
+
words = words & unique_words
|
65
|
+
end
|
52
66
|
group_score_sums = words.inject({}) do |results, word|
|
53
67
|
word_results = classify(word, strategy)
|
54
68
|
next results if word_results.empty?
|
data/lib/groupie/group.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
class Groupie
|
2
2
|
class Group
|
3
|
+
attr_reader :word_counts
|
3
4
|
def initialize(name)
|
4
5
|
@name = name
|
5
6
|
@word_counts = {}
|
6
7
|
end
|
7
|
-
|
8
|
+
|
8
9
|
def words
|
9
10
|
@word_counts.keys
|
10
11
|
end
|
@@ -16,6 +17,7 @@ class Groupie
|
|
16
17
|
end
|
17
18
|
nil
|
18
19
|
end
|
20
|
+
alias << add
|
19
21
|
|
20
22
|
# Return the count for a specific +word+.
|
21
23
|
def count(word)
|
data/readme.rdoc
CHANGED
@@ -4,11 +4,7 @@ Groupie is a simple way to group texts and classify new texts as being a likely
|
|
4
4
|
|
5
5
|
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
Groupie is a 'fun' project that has the following goals, in descending order of importance:
|
10
|
-
* Have fun playing with code
|
11
|
-
* Play with Bayesian-like (spam) filtering
|
7
|
+
Started and forgotten in 2009 as a short-lived experiment, in 2010 Groupie got new features when I started using it on a RSS reader project that classified news items into "Interesting" and "Not interesting" categories.
|
12
8
|
|
13
9
|
== Current functionality
|
14
10
|
|
@@ -19,6 +15,7 @@ Current funcionality includes:
|
|
19
15
|
* Add texts (as an Array of Strings) to any number of groups.
|
20
16
|
* Classify a single word to check the likelihood it belongs to each group.
|
21
17
|
* Do classification for complete (tokenized) texts.
|
18
|
+
* Pick classification strategy to weigh repeat words differently (weigh by sum, square root or log10 of words in group)
|
22
19
|
|
23
20
|
== License
|
24
21
|
|
data/spec/groupie/group_spec.rb
CHANGED
@@ -2,6 +2,31 @@ require File.join(File.dirname(__FILE__), %w[.. spec_helper])
|
|
2
2
|
require 'yaml'
|
3
3
|
|
4
4
|
describe Groupie::Group do
|
5
|
+
describe "add" do
|
6
|
+
before(:each) do
|
7
|
+
@group = Groupie::Group.new("test")
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should accept a single string" do
|
11
|
+
@group.add "bla"
|
12
|
+
@group.words.should == %w[bla]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should accept an Array of strings" do
|
16
|
+
@group.add ["bla", "bla2"]
|
17
|
+
@group.words.should == %w[bla bla2]
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should accept multiple strings" do
|
21
|
+
@group.add "bla", "bla2"
|
22
|
+
@group.words.should == %w[bla bla2]
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should be aliased as <<" do
|
26
|
+
@group.method(:add).should == @group.method(:<<)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
5
30
|
it "can be serialized and loaded through YAML" do
|
6
31
|
group = Groupie::Group.new 'group'
|
7
32
|
group.add %w[buy flowers]
|
data/spec/groupie_spec.rb
CHANGED
@@ -73,6 +73,32 @@ describe Groupie do
|
|
73
73
|
g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
describe "unique" do
|
78
|
+
it "should should behave as sqrt strategy" do
|
79
|
+
g = Groupie.new
|
80
|
+
g[:spam].add %w[buy viagra now]
|
81
|
+
g[:ham].add %w[buy flowers now]
|
82
|
+
g.classify('buy', :unique).should == g.classify('buy', :sqrt)
|
83
|
+
g.classify('flowers', :unique).should == g.classify('flowers', :sqrt)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe "unique_words" do
|
90
|
+
it "should exclude all words in the 4th quintile of all groups" do
|
91
|
+
g = Groupie.new
|
92
|
+
g[:spam].add %w[one two two three three three four four four four]
|
93
|
+
g[:ham].add %w[apple banana pear orange three]
|
94
|
+
g.unique_words.sort.should == %w[one two apple banana pear orange].sort
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should work on an empty word set" do
|
98
|
+
g = Groupie.new
|
99
|
+
g[:spam].add []
|
100
|
+
g[:ham].add []
|
101
|
+
g.unique_words.should == []
|
76
102
|
end
|
77
103
|
end
|
78
104
|
|
@@ -108,7 +134,7 @@ describe Groupie do
|
|
108
134
|
g[:ham].add %w[buy flowers now]
|
109
135
|
g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
|
110
136
|
end
|
111
|
-
|
137
|
+
|
112
138
|
it "should support the sqrt strategy" do
|
113
139
|
g = Groupie.new
|
114
140
|
g[:spam].add %w[one] * 9
|
@@ -117,7 +143,7 @@ describe Groupie do
|
|
117
143
|
g[:ham].add %w[two]
|
118
144
|
g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
|
119
145
|
end
|
120
|
-
|
146
|
+
|
121
147
|
it "should support the log strategy" do
|
122
148
|
g = Groupie.new
|
123
149
|
g[:spam].add %w[one] * 100
|
@@ -126,5 +152,12 @@ describe Groupie do
|
|
126
152
|
g[:ham].add %w[two] * 100
|
127
153
|
g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
|
128
154
|
end
|
155
|
+
|
156
|
+
it "should only rate unique words for the unique strategy" do
|
157
|
+
g = Groupie.new
|
158
|
+
g[:spam].add %w[one two two three three three four four four four]
|
159
|
+
g[:ham].add %w[apple banana pear]
|
160
|
+
g.classify_text(%w[one two three apple banana], :unique).should == {:spam=>0.5, :ham=>0.5}
|
161
|
+
end
|
129
162
|
end
|
130
163
|
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 2
|
10
|
-
version: 0.2.2
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Wes Oldenbeuving
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-07-25 00:00:00 +02:00
|
18
|
+
date: 2010-07-29 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -42,10 +42,10 @@ extra_rdoc_files:
|
|
42
42
|
- LICENSE
|
43
43
|
files:
|
44
44
|
- .document
|
45
|
+
- .gitignore
|
45
46
|
- LICENSE
|
46
47
|
- Rakefile
|
47
48
|
- VERSION
|
48
|
-
- groupie.gemspec
|
49
49
|
- lib/groupie.rb
|
50
50
|
- lib/groupie/core_ext/string.rb
|
51
51
|
- lib/groupie/group.rb
|
data/groupie.gemspec
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
-
# -*- encoding: utf-8 -*-
|
5
|
-
|
6
|
-
Gem::Specification.new do |s|
|
7
|
-
s.name = %q{groupie}
|
8
|
-
s.version = "0.2.2"
|
9
|
-
|
10
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["Wes Oldenbeuving"]
|
12
|
-
s.date = %q{2010-07-25}
|
13
|
-
s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
|
14
|
-
s.email = %q{narnach@gmail.com}
|
15
|
-
s.extra_rdoc_files = [
|
16
|
-
"LICENSE"
|
17
|
-
]
|
18
|
-
s.files = [
|
19
|
-
".document",
|
20
|
-
"LICENSE",
|
21
|
-
"Rakefile",
|
22
|
-
"VERSION",
|
23
|
-
"groupie.gemspec",
|
24
|
-
"lib/groupie.rb",
|
25
|
-
"lib/groupie/core_ext/string.rb",
|
26
|
-
"lib/groupie/group.rb",
|
27
|
-
"readme.rdoc",
|
28
|
-
"spec/fixtures/ham/email_ham1.txt",
|
29
|
-
"spec/fixtures/ham/spam.la-44116217.txt",
|
30
|
-
"spec/fixtures/spam/email_spam1.txt",
|
31
|
-
"spec/fixtures/spam/email_spam2.txt",
|
32
|
-
"spec/fixtures/spam/spam.la-44118014.txt",
|
33
|
-
"spec/groupie/core_ext/string_spec.rb",
|
34
|
-
"spec/groupie/group_spec.rb",
|
35
|
-
"spec/groupie_spec.rb",
|
36
|
-
"spec/spec_helper.rb"
|
37
|
-
]
|
38
|
-
s.homepage = %q{http://github.com/Narnach/groupie}
|
39
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
40
|
-
s.require_paths = ["lib"]
|
41
|
-
s.rubygems_version = %q{1.3.7}
|
42
|
-
s.summary = %q{Group and classify text}
|
43
|
-
s.test_files = [
|
44
|
-
"spec/groupie/core_ext/string_spec.rb",
|
45
|
-
"spec/groupie/group_spec.rb",
|
46
|
-
"spec/groupie_spec.rb",
|
47
|
-
"spec/spec_helper.rb"
|
48
|
-
]
|
49
|
-
|
50
|
-
if s.respond_to? :specification_version then
|
51
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
52
|
-
s.specification_version = 3
|
53
|
-
|
54
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
|
-
s.add_development_dependency(%q<testy>, [">= 0"])
|
56
|
-
else
|
57
|
-
s.add_dependency(%q<testy>, [">= 0"])
|
58
|
-
end
|
59
|
-
else
|
60
|
-
s.add_dependency(%q<testy>, [">= 0"])
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|