groupie 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/VERSION +1 -1
- data/lib/groupie.rb +20 -6
- data/lib/groupie/group.rb +3 -1
- data/readme.rdoc +2 -5
- data/spec/groupie/group_spec.rb +25 -0
- data/spec/groupie_spec.rb +35 -2
- metadata +5 -5
- data/groupie.gemspec +0 -63
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.gemspec
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.3.0
|
data/lib/groupie.rb
CHANGED
@@ -12,16 +12,26 @@ class Groupie
|
|
12
12
|
@groups[group] ||= Group.new(group)
|
13
13
|
end
|
14
14
|
|
15
|
+
def unique_words
|
16
|
+
@unique_words ||= (
|
17
|
+
total_count = @groups.values.map {|group| group.word_counts}.inject{|total, counts| total.merge(counts){|key,o,n| o+n}}
|
18
|
+
median_index = [total_count.values.size * 3 / 4 - 1, 1].max
|
19
|
+
median_frequency = total_count.values.sort[median_index]
|
20
|
+
total_count.select{|word, count| count <= median_frequency}.map(&:first)
|
21
|
+
)
|
22
|
+
end
|
23
|
+
|
15
24
|
def classify(entry, strategy=:sum)
|
16
25
|
results = {}
|
17
26
|
total_count = @groups.inject(0) do |sum, name_group|
|
18
27
|
group = name_group.last
|
19
28
|
count = group.count(entry)
|
20
|
-
|
29
|
+
case strategy
|
30
|
+
when :sum
|
21
31
|
sum += count
|
22
|
-
|
32
|
+
when :sqrt, :unique
|
23
33
|
sum += Math::sqrt(count)
|
24
|
-
|
34
|
+
when :log
|
25
35
|
sum += Math::log10(count) if count > 0
|
26
36
|
else
|
27
37
|
raise "Invalid strategy: #{strategy}"
|
@@ -32,11 +42,12 @@ class Groupie
|
|
32
42
|
|
33
43
|
@groups.each do |name, group|
|
34
44
|
count = group.count(entry)
|
35
|
-
|
45
|
+
case strategy
|
46
|
+
when :sum
|
36
47
|
# keep count
|
37
|
-
|
48
|
+
when :sqrt, :unique
|
38
49
|
count = Math::sqrt(count)
|
39
|
-
|
50
|
+
when :log
|
40
51
|
count = Math::log10(count) if count > 0
|
41
52
|
else
|
42
53
|
raise "Invalid strategy: #{strategy}"
|
@@ -49,6 +60,9 @@ class Groupie
|
|
49
60
|
# Classify a text by taking the average of all word classifications.
|
50
61
|
def classify_text(words, strategy=:sum)
|
51
62
|
hits = 0
|
63
|
+
if strategy==:unique
|
64
|
+
words = words & unique_words
|
65
|
+
end
|
52
66
|
group_score_sums = words.inject({}) do |results, word|
|
53
67
|
word_results = classify(word, strategy)
|
54
68
|
next results if word_results.empty?
|
data/lib/groupie/group.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
class Groupie
|
2
2
|
class Group
|
3
|
+
attr_reader :word_counts
|
3
4
|
def initialize(name)
|
4
5
|
@name = name
|
5
6
|
@word_counts = {}
|
6
7
|
end
|
7
|
-
|
8
|
+
|
8
9
|
def words
|
9
10
|
@word_counts.keys
|
10
11
|
end
|
@@ -16,6 +17,7 @@ class Groupie
|
|
16
17
|
end
|
17
18
|
nil
|
18
19
|
end
|
20
|
+
alias << add
|
19
21
|
|
20
22
|
# Return the count for a specific +word+.
|
21
23
|
def count(word)
|
data/readme.rdoc
CHANGED
@@ -4,11 +4,7 @@ Groupie is a simple way to group texts and classify new texts as being a likely
|
|
4
4
|
|
5
5
|
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
Groupie is a 'fun' project that has the following goals, in descending order of importance:
|
10
|
-
* Have fun playing with code
|
11
|
-
* Play with Bayesian-like (spam) filtering
|
7
|
+
Started and forgotten in 2009 as a short-lived experiment, in 2010 Groupie got new features when I started using it on an RSS reader project that classified news items into "Interesting" and "Not interesting" categories.
|
12
8
|
|
13
9
|
== Current functionality
|
14
10
|
|
@@ -19,6 +15,7 @@ Current funcionality includes:
|
|
19
15
|
* Add texts (as an Array of Strings) to any number of groups.
|
20
16
|
* Classify a single word to check the likelihood it belongs to each group.
|
21
17
|
* Do classification for complete (tokenized) texts.
|
18
|
+
* Pick classification strategy to weigh repeat words differently (weigh by sum, square root or log10 of words in group)
|
22
19
|
|
23
20
|
== License
|
24
21
|
|
data/spec/groupie/group_spec.rb
CHANGED
@@ -2,6 +2,31 @@ require File.join(File.dirname(__FILE__), %w[.. spec_helper])
|
|
2
2
|
require 'yaml'
|
3
3
|
|
4
4
|
describe Groupie::Group do
|
5
|
+
describe "add" do
|
6
|
+
before(:each) do
|
7
|
+
@group = Groupie::Group.new("test")
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should accept a single string" do
|
11
|
+
@group.add "bla"
|
12
|
+
@group.words.should == %w[bla]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should accept an Array of strings" do
|
16
|
+
@group.add ["bla", "bla2"]
|
17
|
+
@group.words.should == %w[bla bla2]
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should accept multiple strings" do
|
21
|
+
@group.add "bla", "bla2"
|
22
|
+
@group.words.should == %w[bla bla2]
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should be aliased as <<" do
|
26
|
+
@group.method(:add).should == @group.method(:<<)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
5
30
|
it "can be serialized and loaded through YAML" do
|
6
31
|
group = Groupie::Group.new 'group'
|
7
32
|
group.add %w[buy flowers]
|
data/spec/groupie_spec.rb
CHANGED
@@ -73,6 +73,32 @@ describe Groupie do
|
|
73
73
|
g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
describe "unique" do
|
78
|
+
it "should should behave as sqrt strategy" do
|
79
|
+
g = Groupie.new
|
80
|
+
g[:spam].add %w[buy viagra now]
|
81
|
+
g[:ham].add %w[buy flowers now]
|
82
|
+
g.classify('buy', :unique).should == g.classify('buy', :sqrt)
|
83
|
+
g.classify('flowers', :unique).should == g.classify('flowers', :sqrt)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe "unique_words" do
|
90
|
+
it "should exclude all words in the 4th quintile of all groups" do
|
91
|
+
g = Groupie.new
|
92
|
+
g[:spam].add %w[one two two three three three four four four four]
|
93
|
+
g[:ham].add %w[apple banana pear orange three]
|
94
|
+
g.unique_words.sort.should == %w[one two apple banana pear orange].sort
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should work on an empty word set" do
|
98
|
+
g = Groupie.new
|
99
|
+
g[:spam].add []
|
100
|
+
g[:ham].add []
|
101
|
+
g.unique_words.should == []
|
76
102
|
end
|
77
103
|
end
|
78
104
|
|
@@ -108,7 +134,7 @@ describe Groupie do
|
|
108
134
|
g[:ham].add %w[buy flowers now]
|
109
135
|
g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
|
110
136
|
end
|
111
|
-
|
137
|
+
|
112
138
|
it "should support the sqrt strategy" do
|
113
139
|
g = Groupie.new
|
114
140
|
g[:spam].add %w[one] * 9
|
@@ -117,7 +143,7 @@ describe Groupie do
|
|
117
143
|
g[:ham].add %w[two]
|
118
144
|
g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
|
119
145
|
end
|
120
|
-
|
146
|
+
|
121
147
|
it "should support the log strategy" do
|
122
148
|
g = Groupie.new
|
123
149
|
g[:spam].add %w[one] * 100
|
@@ -126,5 +152,12 @@ describe Groupie do
|
|
126
152
|
g[:ham].add %w[two] * 100
|
127
153
|
g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
|
128
154
|
end
|
155
|
+
|
156
|
+
it "should only rate unique words for the unique strategy" do
|
157
|
+
g = Groupie.new
|
158
|
+
g[:spam].add %w[one two two three three three four four four four]
|
159
|
+
g[:ham].add %w[apple banana pear]
|
160
|
+
g.classify_text(%w[one two three apple banana], :unique).should == {:spam=>0.5, :ham=>0.5}
|
161
|
+
end
|
129
162
|
end
|
130
163
|
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 2
|
10
|
-
version: 0.2.2
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Wes Oldenbeuving
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-07-25 00:00:00 +02:00
|
18
|
+
date: 2010-07-29 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -42,10 +42,10 @@ extra_rdoc_files:
|
|
42
42
|
- LICENSE
|
43
43
|
files:
|
44
44
|
- .document
|
45
|
+
- .gitignore
|
45
46
|
- LICENSE
|
46
47
|
- Rakefile
|
47
48
|
- VERSION
|
48
|
-
- groupie.gemspec
|
49
49
|
- lib/groupie.rb
|
50
50
|
- lib/groupie/core_ext/string.rb
|
51
51
|
- lib/groupie/group.rb
|
data/groupie.gemspec
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
-
# -*- encoding: utf-8 -*-
|
5
|
-
|
6
|
-
Gem::Specification.new do |s|
|
7
|
-
s.name = %q{groupie}
|
8
|
-
s.version = "0.2.2"
|
9
|
-
|
10
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["Wes Oldenbeuving"]
|
12
|
-
s.date = %q{2010-07-25}
|
13
|
-
s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
|
14
|
-
s.email = %q{narnach@gmail.com}
|
15
|
-
s.extra_rdoc_files = [
|
16
|
-
"LICENSE"
|
17
|
-
]
|
18
|
-
s.files = [
|
19
|
-
".document",
|
20
|
-
"LICENSE",
|
21
|
-
"Rakefile",
|
22
|
-
"VERSION",
|
23
|
-
"groupie.gemspec",
|
24
|
-
"lib/groupie.rb",
|
25
|
-
"lib/groupie/core_ext/string.rb",
|
26
|
-
"lib/groupie/group.rb",
|
27
|
-
"readme.rdoc",
|
28
|
-
"spec/fixtures/ham/email_ham1.txt",
|
29
|
-
"spec/fixtures/ham/spam.la-44116217.txt",
|
30
|
-
"spec/fixtures/spam/email_spam1.txt",
|
31
|
-
"spec/fixtures/spam/email_spam2.txt",
|
32
|
-
"spec/fixtures/spam/spam.la-44118014.txt",
|
33
|
-
"spec/groupie/core_ext/string_spec.rb",
|
34
|
-
"spec/groupie/group_spec.rb",
|
35
|
-
"spec/groupie_spec.rb",
|
36
|
-
"spec/spec_helper.rb"
|
37
|
-
]
|
38
|
-
s.homepage = %q{http://github.com/Narnach/groupie}
|
39
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
40
|
-
s.require_paths = ["lib"]
|
41
|
-
s.rubygems_version = %q{1.3.7}
|
42
|
-
s.summary = %q{Group and classify text}
|
43
|
-
s.test_files = [
|
44
|
-
"spec/groupie/core_ext/string_spec.rb",
|
45
|
-
"spec/groupie/group_spec.rb",
|
46
|
-
"spec/groupie_spec.rb",
|
47
|
-
"spec/spec_helper.rb"
|
48
|
-
]
|
49
|
-
|
50
|
-
if s.respond_to? :specification_version then
|
51
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
52
|
-
s.specification_version = 3
|
53
|
-
|
54
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
|
-
s.add_development_dependency(%q<testy>, [">= 0"])
|
56
|
-
else
|
57
|
-
s.add_dependency(%q<testy>, [">= 0"])
|
58
|
-
end
|
59
|
-
else
|
60
|
-
s.add_dependency(%q<testy>, [">= 0"])
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|