groupie 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/dependabot.yml +21 -0
- data/.github/workflows/gem.yml +16 -0
- data/.github/workflows/rspec.yml +21 -0
- data/.github/workflows/rubocop.yml +26 -0
- data/.gitignore +11 -1
- data/.rspec +3 -0
- data/.rubocop.yml +38 -0
- data/CHANGELOG.md +86 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +67 -0
- data/LICENSE.txt +21 -0
- data/README.md +121 -0
- data/Rakefile +7 -48
- data/bin/console +15 -0
- data/bin/rubocop +2 -0
- data/bin/setup +9 -0
- data/groupie.gemspec +36 -0
- data/lib/groupie/core_ext/string.rb +6 -6
- data/lib/groupie/group.rb +8 -2
- data/lib/groupie/version.rb +10 -0
- data/lib/groupie.rb +105 -58
- metadata +53 -85
- data/.document +0 -5
- data/LICENSE +0 -20
- data/VERSION +0 -1
- data/readme.rdoc +0 -24
- data/spec/fixtures/ham/email_ham1.txt +0 -13
- data/spec/fixtures/ham/spam.la-44116217.txt +0 -79
- data/spec/fixtures/spam/email_spam1.txt +0 -5
- data/spec/fixtures/spam/email_spam2.txt +0 -7
- data/spec/fixtures/spam/spam.la-44118014.txt +0 -73
- data/spec/groupie/core_ext/string_spec.rb +0 -37
- data/spec/groupie/group_spec.rb +0 -37
- data/spec/groupie_spec.rb +0 -163
- data/spec/spec_helper.rb +0 -1
data/.document
DELETED
data/LICENSE
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
Copyright (c) 2009 Wes Oldenbeuving
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
-
a copy of this software and associated documentation files (the
|
5
|
-
"Software"), to deal in the Software without restriction, including
|
6
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
-
permit persons to whom the Software is furnished to do so, subject to
|
9
|
-
the following conditions:
|
10
|
-
|
11
|
-
The above copyright notice and this permission notice shall be
|
12
|
-
included in all copies or substantial portions of the Software.
|
13
|
-
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.3.0
|
data/readme.rdoc
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
= Groupie
|
2
|
-
|
3
|
-
Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of bayesian spam filters.
|
4
|
-
|
5
|
-
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
6
|
-
|
7
|
-
Started and forgotten in 2009 as a short-lived experiment, in 2010 Groupie got new features when I started using it on a RSS reader project that classified news items into "Interesting" and "Not interesting" categories.
|
8
|
-
|
9
|
-
== Current functionality
|
10
|
-
|
11
|
-
Current funcionality includes:
|
12
|
-
* Tokenize an input text to prepare it for grouping.
|
13
|
-
* Strip XML and HTML tag.
|
14
|
-
* Keep certain infix characters, such as period and comma.
|
15
|
-
* Add texts (as an Array of Strings) to any number of groups.
|
16
|
-
* Classify a single word to check the likelihood it belongs to each group.
|
17
|
-
* Do classification for complete (tokenized) texts.
|
18
|
-
* Pick classification strategy to weigh repeat words differently (weigh by sum, square root or log10 of words in group)
|
19
|
-
|
20
|
-
== License
|
21
|
-
|
22
|
-
As always, the code is licensed under the MIT license.
|
23
|
-
|
24
|
-
Wes Oldenbeuving
|
data/spec/fixtures/ham/email_ham1.txt
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
Re: [ubuntu-art] [Breathe] Network Manager-icons
|
2
|
-
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
3
|
-
> Hello,
|
4
|
-
>
|
5
|
-
> I think the notify-osd icons have a completely different style, which
|
6
|
-
> is looking great within the notification bubbles, but i doubt it'd
|
7
|
-
> look great to have the notify-osd wifi icons in the panel. I think the
|
8
|
-
> drawing of the notification- wifi icons should be done afterwards, and
|
9
|
-
> if they should be based on those of the icon set, they could be made
|
10
|
-
> smoother, and possibly desaturated for some of them, to avoid drawing
|
11
|
-
> too much attention from the user when popping up.
|
12
|
-
>
|
13
|
-
> Cordially, SD.
|
data/spec/fixtures/ham/spam.la-44116217.txt
DELETED
@@ -1,79 +0,0 @@
|
|
1
|
-
From **HIDDEN**@lists.ubuntu.com Sun May 31 10:22:46 2009
|
2
|
-
Return-Path: <**HIDDEN**@lists.ubuntu.com>
|
3
|
-
X-Original-To: **HIDDEN**@spam.la
|
4
|
-
Delivered-To: **HIDDEN**@speedo.dreamhost.com
|
5
|
-
Received: from chlorine.canonical.com (chlorine.canonical.com [91.189.94.204])
|
6
|
-
by speedo.dreamhost.com (Postfix) with ESMTP id 18A4F145730
|
7
|
-
for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:22:47 -0700 (PDT)
|
8
|
-
Received: from localhost ([127.0.0.1] helo=chlorine.canonical.com)
|
9
|
-
by chlorine.canonical.com with esmtp (Exim 4.60)
|
10
|
-
(envelope-from <**HIDDEN**@lists.ubuntu.com>)
|
11
|
-
id 1MAoKV-0000Uv-RB; Sun, 31 May 2009 17:56:15 +0100
|
12
|
-
Received: from smtp104.mail.ukl.yahoo.com ([77.238.184.36])
|
13
|
-
by chlorine.canonical.com with smtp (Exim 4.60)
|
14
|
-
(envelope-from <**HIDDEN**@yahoo.de>) id 1MAoKO-0000T5-0J
|
15
|
-
for **HIDDEN**@lists.ubuntu.com; Sun, 31 May 2009 17:56:08 +0100
|
16
|
-
Received: (qmail 95693 invoked from network); 31 May 2009 16:56:07 -0000
|
17
|
-
Received: from unknown (HELO ?192.168.1.33?) **HIDDEN**@88.5.92.30 with plain)
|
18
|
-
by smtp104.mail.ukl.yahoo.com with SMTP; 31 May 2009 16:56:07 -0000
|
19
|
-
X-Yahoo-SMTP: omQsrMiswBC_IZdIQhRgQAA3Gn6tTTc-
|
20
|
-
X-YMail-OSG: e1ihGm4VM1ljqtG.6IPGOps5aG8IYZJEPQLptGPSxphH174zk4rRTWYQJmj9MMc2nJwZjNEqUnYAjErWKypElvLWu0n.v8baMMlcOOELQK2IZfFaV5Ij3HUpUDWRbd0n6PCV5iFLHlyruq5CSGsiZvfME6HpngIO0RuAcin3rePXdzWpmPnTlZwuC3qjSE9N8wC4pdBdwfmYHy4EKSKFRCXUNzdy9DPgfwqrjiCTP_tqaWmpeUOqA2Os13l0j5d6acIpgo9DcW8P_1ENNVGjJ2Lk4XbZ0oc51M_BJ2n6DHMxoazT
|
21
|
-
X-Yahoo-Newman-Property: ymail-3
|
22
|
-
From: Oliver Scholtz 1 <**HIDDEN**@yahoo.de>
|
23
|
-
To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
|
24
|
-
In-Reply-To: <**HIDDEN**@mail.gmail.com>
|
25
|
-
References: <**HIDDEN**@yahoo.com>
|
26
|
-
<**HIDDEN**@jws141-laptop> <**HIDDEN**@yahoo.com>
|
27
|
-
<**HIDDEN**@isabel-desktop>
|
28
|
-
<**HIDDEN**@web95411.mail.in2.yahoo.com>
|
29
|
-
<**HIDDEN**@dani-desktop> <**HIDDEN**@yahoo.com>
|
30
|
-
<**HIDDEN**@mail.gmail.com>
|
31
|
-
<**HIDDEN**@mail.gmail.com>
|
32
|
-
Date: Sun, 31 May 2009 18:56:05 +0200
|
33
|
-
Message-Id: <**HIDDEN**@oliver-ubuntu>
|
34
|
-
Mime-Version: 1.0
|
35
|
-
X-Mailer: Evolution 2.26.1
|
36
|
-
Subject: Re: [ubuntu-art] [Breathe] Network Manager-icons
|
37
|
-
X-BeenThere: **HIDDEN**@lists.ubuntu.com
|
38
|
-
X-Mailman-Version: 2.1.8
|
39
|
-
Precedence: list
|
40
|
-
Reply-To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
|
41
|
-
List-Id: Discussion on Ubuntu artwork <ubuntu-art.lists.ubuntu.com>
|
42
|
-
List-Unsubscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
|
43
|
-
<**HIDDEN**@lists.ubuntu.com?subject=unsubscribe>
|
44
|
-
List-Archive: <https://lists.ubuntu.com/archives/ubuntu-art>
|
45
|
-
List-Post: <**HIDDEN**@lists.ubuntu.com>
|
46
|
-
List-Help: <**HIDDEN**@lists.ubuntu.com?subject=help>
|
47
|
-
List-Subscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
|
48
|
-
<**HIDDEN**@lists.ubuntu.com?subject=subscribe>
|
49
|
-
Content-Type: text/plain; charset="us-ascii"
|
50
|
-
Content-Transfer-Encoding: 7bit
|
51
|
-
Sender: **HIDDEN**@lists.ubuntu.com
|
52
|
-
Errors-To: **HIDDEN**@lists.ubuntu.com
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
57
|
-
> Hello,
|
58
|
-
>
|
59
|
-
> I think the notify-osd icons have a completely different style, which
|
60
|
-
> is looking great within the notification bubbles, but i doubt it'd
|
61
|
-
> look great to have the notify-osd wifi icons in the panel. I think the
|
62
|
-
> drawing of the notification- wifi icons should be done afterwards, and
|
63
|
-
> if they should be based on those of the icon set, they could be made
|
64
|
-
> smoother, and possibly desaturated for some of them, to avoid drawing
|
65
|
-
> too much attention from the user when popping up.
|
66
|
-
>
|
67
|
-
> Cordially, SD.
|
68
|
-
|
69
|
-
+1
|
70
|
-
---
|
71
|
-
And Mac ... much better! Maybe 22 and 16 without pale. ;)
|
72
|
-
|
73
|
-
Oliver
|
74
|
-
|
75
|
-
|
76
|
-
--
|
77
|
-
ubuntu-art mailing list
|
78
|
-
**HIDDEN**@lists.ubuntu.com
|
79
|
-
https://lists.ubuntu.com/mailman/listinfo/ubuntu-art
|
data/spec/fixtures/spam/email_spam2.txt
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
Re: Your subscribe #976589
|
2
|
-
Tell a friend · Download latest version See this email as a webpage
|
3
|
-
Hello!
|
4
|
-
Shipped Privately And Discreetly To Your Door!
|
5
|
-
We want to put a great big grin on your face in 2009. You'll be to rejoice all year.
|
6
|
-
Unsubscribe · Lost Password · Account Settings · Help · Terms of Service · Privacy
|
7
|
-
Ottho Heldringstraat 2, 31719 AZ Amsterdam, The Netherlands
|
data/spec/fixtures/spam/spam.la-44118014.txt
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
From **HIDDEN**@manpoints.net Sun May 31 10:34:01 2009
|
2
|
-
Return-Path: <**HIDDEN**@manpoints.net>
|
3
|
-
X-Original-To: **HIDDEN**@spam.la
|
4
|
-
Delivered-To: **HIDDEN**@speedo.dreamhost.com
|
5
|
-
Received: from 201-40-49-243.bsace702.dsl.brasiltelecom.net.br (201-40-49-243.bsace702.dsl.brasiltelecom.net.br [201.40.49.243])
|
6
|
-
by speedo.dreamhost.com (Postfix) with ESMTP id 4BDC714572F
|
7
|
-
for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:33:56 -0700 (PDT)
|
8
|
-
Message-Id: <**HIDDEN**@201-40-49-243.bsace702.dsl.brasiltelecom.net.br>
|
9
|
-
From: "Leskovar L. Golda" <**HIDDEN**@manpoints.net>
|
10
|
-
To: **HIDDEN**@spam.la
|
11
|
-
Subject: My official mail blocked
|
12
|
-
Content-Type: text/html; charset="iso-8859-1"
|
13
|
-
Content-Transfer-Encoding: 7bit
|
14
|
-
MIME-Version: 1.0
|
15
|
-
Date: Sun, 31 May 2009 10:33:56 -0700 (PDT)
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
20
|
-
<html>
|
21
|
-
<head>
|
22
|
-
<title></title>
|
23
|
-
<meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
|
24
|
-
</head>
|
25
|
-
<body bgcolor="#FFFFFF" topmargin="0" leftmargin="0" marginwidth="0" marginheight="0">
|
26
|
-
|
27
|
-
|
28
|
-
<table width="646" cellspacing="0" border="0" align="center" cellpadding="4">
|
29
|
-
<tr>
|
30
|
-
<td align="center"><font face="Arial" size="1" color="#000000">If you cannot see
|
31
|
-
the pictures and links below, please <a href="http://www.qicweman.cn/?abo=0C8C72B3E1C8648676158B">
|
32
|
-
click here</a> to view them.<br></font></td>
|
33
|
-
</tr>
|
34
|
-
|
35
|
-
</table>
|
36
|
-
|
37
|
-
<table width="742" cellspacing="0" border="0" align="center" cellpadding="0">
|
38
|
-
|
39
|
-
<tr valign="top">
|
40
|
-
<td width="475" style="border-left:1px solid #371E96;">
|
41
|
-
<a href="http://www.qicweman.cn/?ex=0C8C72B3E1C8648676158B">
|
42
|
-
<img alt="click to see the full version" src="http://www.qicweman.cn/d.jpg" style="border-width: 0px" /></a></td>
|
43
|
-
|
44
|
-
</tr>
|
45
|
-
|
46
|
-
<tr>
|
47
|
-
<td><br>
|
48
|
-
|
49
|
-
<div style="padding:10px;">
|
50
|
-
|
51
|
-
<span style="font-size:10px;color:#666666;font-family:arial;">You may also
|
52
|
-
respond to this email and subscribe to <i>Hizqru</i> by calling
|
53
|
-
1-085-417-9085, Monday-Friday, 8 a.m.-6 p.m. ET. Outside the U.S. and in
|
54
|
-
Canada, please call 1-254-403-7409.<br><br>
|
55
|
-
|
56
|
-
To opt out from receiving any future marketing-related emails from
|
57
|
-
Dqpjnjp, please <a style="color:#666666;" href="http://www.qicweman.cn/?jr=0C8C72B3E1C8648676158B&email=**HIDDEN**@spam.la">
|
58
|
-
click here</a>.<br />
|
59
|
-
Please be assured that we respect the privacy of our subscribers. To view
|
60
|
-
our privacy policy, please <a style="color:#666666;" href="http://www.qicweman.cn/?sj=0C8C72B3E1C8648676158B">
|
61
|
-
click here</a>.<br><br>
|
62
|
-
|
63
|
-
© 2009 Jdqofypy, Inc., 14 Poze Iylqod, 08th Floor, New York, NY 74172.<br></span>
|
64
|
-
|
65
|
-
</div>
|
66
|
-
</td>
|
67
|
-
</tr>
|
68
|
-
</table>
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
</body>
|
73
|
-
</html>
|
data/spec/groupie/core_ext/string_spec.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), %w[.. .. spec_helper])
|
2
|
-
|
3
|
-
describe String do
|
4
|
-
context "tokenize" do
|
5
|
-
it 'should split words' do
|
6
|
-
"hello world".tokenize.should == %w[hello world]
|
7
|
-
end
|
8
|
-
|
9
|
-
it 'should downcase words' do
|
10
|
-
"Hello World".tokenize.should == %w[hello world]
|
11
|
-
end
|
12
|
-
|
13
|
-
it 'should strip special characters' do
|
14
|
-
"blah, bla!".tokenize.should == %w[blah bla]
|
15
|
-
end
|
16
|
-
|
17
|
-
it 'should prserve infix hyphens and underscores' do
|
18
|
-
"hyphen-ated under_score".tokenize.should == %w[hyphen-ated under_score]
|
19
|
-
end
|
20
|
-
|
21
|
-
it 'should sanitize html tags' do
|
22
|
-
'<a href="http://example.org">example</a>'.tokenize.should == %w[example]
|
23
|
-
end
|
24
|
-
|
25
|
-
it 'should preserve infix periods' do
|
26
|
-
'example.org rocks. read it...'.tokenize.should == %w[example.org rocks read it]
|
27
|
-
end
|
28
|
-
|
29
|
-
it "should preserve infix commas" do
|
30
|
-
'$1,000,000.00 or $1.000.000,00'.tokenize.should == %w[1,000,000.00 or 1.000.000,00]
|
31
|
-
end
|
32
|
-
|
33
|
-
it "should strip quotes around tokens" do
|
34
|
-
'"first last"'.tokenize.should == %w[first last]
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
data/spec/groupie/group_spec.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), %w[.. spec_helper])
|
2
|
-
require 'yaml'
|
3
|
-
|
4
|
-
describe Groupie::Group do
|
5
|
-
describe "add" do
|
6
|
-
before(:each) do
|
7
|
-
@group = Groupie::Group.new("test")
|
8
|
-
end
|
9
|
-
|
10
|
-
it "should accept a single string" do
|
11
|
-
@group.add "bla"
|
12
|
-
@group.words.should == %w[bla]
|
13
|
-
end
|
14
|
-
|
15
|
-
it "should accept an Array of strings" do
|
16
|
-
@group.add ["bla", "bla2"]
|
17
|
-
@group.words.should == %w[bla bla2]
|
18
|
-
end
|
19
|
-
|
20
|
-
it "should accept multiple strings" do
|
21
|
-
@group.add "bla", "bla2"
|
22
|
-
@group.words.should == %w[bla bla2]
|
23
|
-
end
|
24
|
-
|
25
|
-
it "should be aliased as <<" do
|
26
|
-
@group.method(:add).should == @group.method(:<<)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
it "can be serialized and loaded through YAML" do
|
31
|
-
group = Groupie::Group.new 'group'
|
32
|
-
group.add %w[buy flowers]
|
33
|
-
loaded_group = YAML.load(group.to_yaml)
|
34
|
-
loaded_group.add %w[buy candy]
|
35
|
-
loaded_group.count("candy").should == 1
|
36
|
-
end
|
37
|
-
end
|
data/spec/groupie_spec.rb
DELETED
@@ -1,163 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
-
|
3
|
-
describe Groupie do
|
4
|
-
describe "classify" do
|
5
|
-
it 'should work when 100% certaint' do
|
6
|
-
g = Groupie.new
|
7
|
-
g[:spam].add %w[viagra]
|
8
|
-
g[:ham].add %w[flowers]
|
9
|
-
g.classify('viagra').should == {:spam => 1.0, :ham => 0.0}
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'should work when split 50/50 between two groups' do
|
13
|
-
g = Groupie.new
|
14
|
-
g[:spam].add %w[buy viagra now]
|
15
|
-
g[:ham].add %w[buy flowers for your mom]
|
16
|
-
g.classify('buy').should == {:spam => 0.5, :ham => 0.5}
|
17
|
-
end
|
18
|
-
|
19
|
-
it 'should work when weighed more towards one group' do
|
20
|
-
g = Groupie.new
|
21
|
-
g[:spam].add %w[buy viagra now]
|
22
|
-
g[:spam].add %w[buy cialis now]
|
23
|
-
g[:ham].add %w[buy flowers for your mom]
|
24
|
-
g.classify('buy').should == {:spam => 2 / 3.0, :ham => 1 / 3.0}
|
25
|
-
end
|
26
|
-
|
27
|
-
it 'should work with more than two groups' do
|
28
|
-
g = Groupie.new
|
29
|
-
g[:weight].add 'pound'
|
30
|
-
g[:currency].add 'pound'
|
31
|
-
g[:phone_key].add 'pound'
|
32
|
-
g.classify('pound').should == {:weight => 1/3.0, :currency => 1/3.0, :phone_key => 1/3.0}
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'should tokenize and classify emails' do
|
36
|
-
email = File.read(File.join(File.dirname(__FILE__), %w[fixtures spam email_spam1.txt]))
|
37
|
-
email2 = File.read(File.join(File.dirname(__FILE__), %w[fixtures spam email_spam2.txt]))
|
38
|
-
email3 = File.read(File.join(File.dirname(__FILE__), %w[fixtures ham email_ham1.txt]))
|
39
|
-
g = Groupie.new
|
40
|
-
g[:spam].add email.tokenize
|
41
|
-
g[:spam].add email2.tokenize
|
42
|
-
g[:ham].add email3.tokenize
|
43
|
-
c = g.classify('discreetly')
|
44
|
-
c[:spam].should > c[:ham]
|
45
|
-
c2 = g.classify('user')
|
46
|
-
c2[:ham].should > c2[:spam]
|
47
|
-
end
|
48
|
-
|
49
|
-
describe "strategies" do
|
50
|
-
describe "sum" do
|
51
|
-
it "should weigh words for the sum of their occurances" do
|
52
|
-
g = Groupie.new
|
53
|
-
g[:spam].add %w[word] * 9
|
54
|
-
g[:ham].add %w[word]
|
55
|
-
g.classify('word', :sum).should == {:spam=>0.9, :ham=>0.1}
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
describe "sqrt" do
|
60
|
-
it "should weigh words for the square root of the sum of ocurances" do
|
61
|
-
g = Groupie.new
|
62
|
-
g[:spam].add %w[word] * 9
|
63
|
-
g[:ham].add %w[word]
|
64
|
-
g.classify('word', :sqrt).should == {:spam=>0.75, :ham=>0.25}
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
describe "log" do
|
69
|
-
it "should weigh words for log10 of their sum of occurances" do
|
70
|
-
g = Groupie.new
|
71
|
-
g[:spam].add %w[word] * 1000
|
72
|
-
g[:ham].add %w[word] * 10
|
73
|
-
g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
describe "unique" do
|
78
|
-
it "should should behave as sqrt strategy" do
|
79
|
-
g = Groupie.new
|
80
|
-
g[:spam].add %w[buy viagra now]
|
81
|
-
g[:ham].add %w[buy flowers now]
|
82
|
-
g.classify('buy', :unique).should == g.classify('buy', :sqrt)
|
83
|
-
g.classify('flowers', :unique).should == g.classify('flowers', :sqrt)
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
describe "unique_words" do
|
90
|
-
it "should exclude all words in the 4th quintile of all groups" do
|
91
|
-
g = Groupie.new
|
92
|
-
g[:spam].add %w[one two two three three three four four four four]
|
93
|
-
g[:ham].add %w[apple banana pear orange three]
|
94
|
-
g.unique_words.sort.should == %w[one two apple banana pear orange].sort
|
95
|
-
end
|
96
|
-
|
97
|
-
it "should work on an empty word set" do
|
98
|
-
g = Groupie.new
|
99
|
-
g[:spam].add []
|
100
|
-
g[:ham].add []
|
101
|
-
g.unique_words.should == []
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
context "classify_text" do
|
106
|
-
it 'should tokenized html emails' do
|
107
|
-
g = Groupie.new
|
108
|
-
spam_tokens = File.read(File.join(File.dirname(__FILE__), %w[fixtures spam spam.la-44118014.txt])).tokenize
|
109
|
-
ham_tokens = File.read(File.join(File.dirname(__FILE__), %w[fixtures ham spam.la-44116217.txt])).tokenize
|
110
|
-
g[:spam].add spam_tokens
|
111
|
-
g[:ham].add ham_tokens
|
112
|
-
|
113
|
-
c = g.classify 'user'
|
114
|
-
c[:ham].should > c[:spam]
|
115
|
-
|
116
|
-
c = g.classify_text(spam_tokens)
|
117
|
-
c[:spam].should > c[:ham]
|
118
|
-
end
|
119
|
-
|
120
|
-
it 'should classify a text' do
|
121
|
-
g = Groupie.new
|
122
|
-
g[:spam].add %w[buy viagra now to grow fast]
|
123
|
-
g[:spam].add %w[buy cialis on our website]
|
124
|
-
g[:ham].add %w[buy flowers for your mom]
|
125
|
-
result = g.classify_text "Grow flowers to sell on our website".tokenize
|
126
|
-
result[:spam].should > result[:ham]
|
127
|
-
result2 = g.classify_text "Grow flowers to give to your mom".tokenize
|
128
|
-
result2[:ham].should == result2[:spam]
|
129
|
-
end
|
130
|
-
|
131
|
-
it "should skip unknown tokens" do
|
132
|
-
g = Groupie.new
|
133
|
-
g[:spam].add %w[buy viagra now]
|
134
|
-
g[:ham].add %w[buy flowers now]
|
135
|
-
g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
|
136
|
-
end
|
137
|
-
|
138
|
-
it "should support the sqrt strategy" do
|
139
|
-
g = Groupie.new
|
140
|
-
g[:spam].add %w[one] * 9
|
141
|
-
g[:ham].add %w[one]
|
142
|
-
g[:spam].add %w[two] * 9
|
143
|
-
g[:ham].add %w[two]
|
144
|
-
g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
|
145
|
-
end
|
146
|
-
|
147
|
-
it "should support the log strategy" do
|
148
|
-
g = Groupie.new
|
149
|
-
g[:spam].add %w[one] * 100
|
150
|
-
g[:ham].add %w[one]
|
151
|
-
g[:spam].add %w[two]
|
152
|
-
g[:ham].add %w[two] * 100
|
153
|
-
g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
|
154
|
-
end
|
155
|
-
|
156
|
-
it "should only rate unique words for the unique strategy" do
|
157
|
-
g = Groupie.new
|
158
|
-
g[:spam].add %w[one two two three three three four four four four]
|
159
|
-
g[:ham].add %w[apple banana pear]
|
160
|
-
g.classify_text(%w[one two three apple banana], :unique).should == {:spam=>0.5, :ham=>0.5}
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|