groupie 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Wes Oldenbeuving
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,53 @@
1
# Rakefile for the groupie gem: packaging (Jeweler), tests, coverage (RCov)
# and RDoc generation. Every optional tool is loaded defensively so that the
# remaining tasks keep working when it is not installed.
require 'rubygems'
require 'rake'

# Packaging tasks. Jeweler is optional; without it only a hint is printed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "groupie"
    gem.summary = %Q{Group and classify text}
    gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
    gem.email = "narnach@gmail.com"
    gem.homepage = "http://github.com/Narnach/groupie"
    gem.authors = ["Wes Oldenbeuving"]
    gem.add_development_dependency "testy", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Test task: runs every test/**/*_test.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When RCov is missing, `rake rcov` aborts with instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is defined by Jeweler. Only depend on it when it exists,
# otherwise `rake test` would fail with an unknown-task error on machines
# without Jeweler even though the LoadError above was handled.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Documentation task. Newer Rake versions no longer ship rake/rdoctask, so
# prefer the task class bundled with RDoc and fall back to the legacy one.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "groupie #{version}"
  rdoc.rdoc_files.include('readme*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{groupie}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Wes Oldenbeuving"]
12
+ s.date = %q{2010-07-25}
13
+ s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
14
+ s.email = %q{narnach@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE"
17
+ ]
18
+ s.files = [
19
+ ".document",
20
+ "LICENSE",
21
+ "Rakefile",
22
+ "VERSION",
23
+ "groupie.gemspec",
24
+ "lib/groupie.rb",
25
+ "lib/groupie/core_ext/string.rb",
26
+ "lib/groupie/group.rb",
27
+ "readme.rdoc",
28
+ "test/fixtures/ham/spam.la-44116217.txt",
29
+ "test/fixtures/spam/spam.la-44118014.txt",
30
+ "test/groupie/core_ext/string_test.rb",
31
+ "test/groupie/group_test.rb",
32
+ "test/groupie_test.rb",
33
+ "test/test_helper.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/Narnach/groupie}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.7}
39
+ s.summary = %q{Group and classify text}
40
+ s.test_files = [
41
+ "test/groupie/core_ext/string_test.rb",
42
+ "test/groupie/group_test.rb",
43
+ "test/groupie_test.rb",
44
+ "test/test_helper.rb"
45
+ ]
46
+
47
+ if s.respond_to? :specification_version then
48
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
49
+ s.specification_version = 3
50
+
51
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52
+ s.add_development_dependency(%q<testy>, [">= 0"])
53
+ else
54
+ s.add_dependency(%q<testy>, [">= 0"])
55
+ end
56
+ else
57
+ s.add_dependency(%q<testy>, [">= 0"])
58
+ end
59
+ end
60
+
@@ -0,0 +1,51 @@
1
+ lib_dir = File.expand_path(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
3
+ require 'groupie/group'
4
+ require 'groupie/core_ext/string'
5
+
6
# Keeps a set of named word groups and scores how strongly a word, or a list
# of words, is associated with each of them.
class Groupie
  def initialize
    @groups = {}
  end

  # Return the group stored under +group+, creating it on first access.
  def [](group)
    @groups[group] ||= Group.new(group)
  end

  # Score a single +entry+ against every known group.
  #
  # Returns a Hash mapping each group name to the fraction (0.0..1.0) of all
  # occurrences of +entry+ that were counted in that group. The Hash is empty
  # when no group has counted +entry+ at all.
  def classify(entry)
    # Look every group up once; the counts feed both the total and the ratios.
    counts = {}
    @groups.each { |name, group| counts[name] = group.count(entry) }

    total_count = counts.values.inject(0) { |sum, count| sum + count }
    return {} if total_count.zero?

    results = {}
    counts.each { |name, count| results[name] = count.to_f / total_count }
    results
  end

  # Classify a text by taking the average of all word classifications.
  #
  # +words+ is a list of words. Returns a Hash mapping group names to the sum
  # of the per-word scores divided by the total number of words, so words that
  # no group has counted still lower the averages. Returns an empty Hash when
  # no word could be classified.
  def classify_text(words)
    # Accumulate in place instead of merging into a fresh Hash for every word.
    group_score_sums = {}
    words.each do |word|
      classify(word).each do |group, score|
        group_score_sums[group] = (group_score_sums[group] || 0.0) + score
      end
    end

    words_count = words.size.to_f
    averages = {}
    group_score_sums.each { |group, sum| averages[group] = sum / words_count }
    averages
  end

  # Return the contents of the VERSION file located one directory above this
  # file, with surrounding whitespace removed.
  def self.version
    File.read(File.expand_path(File.join('..', 'VERSION'), File.dirname(File.expand_path(__FILE__)))).strip
  end
end
@@ -0,0 +1,17 @@
1
class Groupie
  module CoreExt
    # String extension that turns raw text into a list of classifiable words.
    module String
      # Split the string into lowercase word tokens.
      #
      # Steps, in order:
      # * downcase and turn every whitespace character into a plain space;
      # * drop dollar signs and apostrophes;
      # * drop XML/HTML tags and every character that is not a word character,
      #   space, hyphen, period or comma;
      # * split on spaces and strip trailing periods/commas from each token,
      #   so infix ones ("example.org", "1,000.00") are kept.
      #
      # Returns an Array of Strings. Tokens that consist solely of stripped
      # punctuation (such as "...") are omitted.
      def tokenize
        downcase.
          gsub(/\s/, ' ').
          gsub(/[$']/, '').
          # The hyphen is escaped on purpose: an unescaped " -." is a range
          # from space to period and would let ! " # % & ( ) * + through.
          gsub(/<[^>]+?>|[^\w \-.,]/, '').
          split(' ').
          map { |token| token.gsub(/[,.]+\Z/, '') }.
          reject { |token| token.empty? }
      end
    end
  end
end

class String
  include Groupie::CoreExt::String
end
@@ -0,0 +1,32 @@
1
class Groupie
  # A named bucket of words that remembers how often each word was added.
  class Group
    # +name+ identifies the group; it is stored alongside the word counts.
    def initialize(name)
      @name = name
      @word_counts = {}
    end

    # All distinct words added to this group so far.
    def words
      @word_counts.keys
    end

    # Count one or more words. Accepts single words, lists of words, or any
    # nesting of both. Always returns nil.
    def add(*words)
      words.flatten.each { |entry| add_word(entry) }
      nil
    end

    # How many times +word+ has been added; 0 for words never seen.
    def count(word)
      @word_counts.fetch(word, 0)
    end

    private

    # Increment the counter of a single word, starting at zero for new words.
    # The default is applied explicitly rather than through a Hash default.
    def add_word(word)
      @word_counts[word] = count(word) + 1
    end
  end
end
@@ -0,0 +1,28 @@
1
+ = Groupie
2
+
3
+ Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of Bayesian spam filters.
4
+
5
+ The eventual goal is to have Groupie work as a sort of Bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
6
+
7
+ == Goals
8
+
9
+ Groupie is a 'fun' project that has the following goals, in descending order of importance:
10
+ * Have fun playing with code
11
+ * Play with Bayesian-like (spam) filtering
12
+ * Check out the Testy BDD framework. It's pretty good for 60 lines of code!
13
+
14
+ == Current functionality
15
+
16
+ Current functionality includes:
17
+ * Tokenize an input text to prepare it for grouping.
18
+ * Strip XML and HTML tags.
19
+ * Keep certain infix characters, such as period and comma.
20
+ * Add texts (as an Array of Strings) to any number of groups.
21
+ * Classify a single word to check the likelihood it belongs to each group.
22
+ * Do classification for complete (tokenized) texts.
23
+
24
+ == License
25
+
26
+ As always, the code is licensed under the MIT license.
27
+
28
+ Wes Oldenbeuving
@@ -0,0 +1,79 @@
1
+ From **HIDDEN**@lists.ubuntu.com Sun May 31 10:22:46 2009
2
+ Return-Path: <**HIDDEN**@lists.ubuntu.com>
3
+ X-Original-To: **HIDDEN**@spam.la
4
+ Delivered-To: **HIDDEN**@speedo.dreamhost.com
5
+ Received: from chlorine.canonical.com (chlorine.canonical.com [91.189.94.204])
6
+ by speedo.dreamhost.com (Postfix) with ESMTP id 18A4F145730
7
+ for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:22:47 -0700 (PDT)
8
+ Received: from localhost ([127.0.0.1] helo=chlorine.canonical.com)
9
+ by chlorine.canonical.com with esmtp (Exim 4.60)
10
+ (envelope-from <**HIDDEN**@lists.ubuntu.com>)
11
+ id 1MAoKV-0000Uv-RB; Sun, 31 May 2009 17:56:15 +0100
12
+ Received: from smtp104.mail.ukl.yahoo.com ([77.238.184.36])
13
+ by chlorine.canonical.com with smtp (Exim 4.60)
14
+ (envelope-from <**HIDDEN**@yahoo.de>) id 1MAoKO-0000T5-0J
15
+ for **HIDDEN**@lists.ubuntu.com; Sun, 31 May 2009 17:56:08 +0100
16
+ Received: (qmail 95693 invoked from network); 31 May 2009 16:56:07 -0000
17
+ Received: from unknown (HELO ?192.168.1.33?) **HIDDEN**@88.5.92.30 with plain)
18
+ by smtp104.mail.ukl.yahoo.com with SMTP; 31 May 2009 16:56:07 -0000
19
+ X-Yahoo-SMTP: omQsrMiswBC_IZdIQhRgQAA3Gn6tTTc-
20
+ X-YMail-OSG: e1ihGm4VM1ljqtG.6IPGOps5aG8IYZJEPQLptGPSxphH174zk4rRTWYQJmj9MMc2nJwZjNEqUnYAjErWKypElvLWu0n.v8baMMlcOOELQK2IZfFaV5Ij3HUpUDWRbd0n6PCV5iFLHlyruq5CSGsiZvfME6HpngIO0RuAcin3rePXdzWpmPnTlZwuC3qjSE9N8wC4pdBdwfmYHy4EKSKFRCXUNzdy9DPgfwqrjiCTP_tqaWmpeUOqA2Os13l0j5d6acIpgo9DcW8P_1ENNVGjJ2Lk4XbZ0oc51M_BJ2n6DHMxoazT
21
+ X-Yahoo-Newman-Property: ymail-3
22
+ From: Oliver Scholtz 1 <**HIDDEN**@yahoo.de>
23
+ To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
24
+ In-Reply-To: <**HIDDEN**@mail.gmail.com>
25
+ References: <**HIDDEN**@yahoo.com>
26
+ <**HIDDEN**@jws141-laptop> <**HIDDEN**@yahoo.com>
27
+ <**HIDDEN**@isabel-desktop>
28
+ <**HIDDEN**@web95411.mail.in2.yahoo.com>
29
+ <**HIDDEN**@dani-desktop> <**HIDDEN**@yahoo.com>
30
+ <**HIDDEN**@mail.gmail.com>
31
+ <**HIDDEN**@mail.gmail.com>
32
+ Date: Sun, 31 May 2009 18:56:05 +0200
33
+ Message-Id: <**HIDDEN**@oliver-ubuntu>
34
+ Mime-Version: 1.0
35
+ X-Mailer: Evolution 2.26.1
36
+ Subject: Re: [ubuntu-art] [Breathe] Network Manager-icons
37
+ X-BeenThere: **HIDDEN**@lists.ubuntu.com
38
+ X-Mailman-Version: 2.1.8
39
+ Precedence: list
40
+ Reply-To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
41
+ List-Id: Discussion on Ubuntu artwork <ubuntu-art.lists.ubuntu.com>
42
+ List-Unsubscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
43
+ <**HIDDEN**@lists.ubuntu.com?subject=unsubscribe>
44
+ List-Archive: <https://lists.ubuntu.com/archives/ubuntu-art>
45
+ List-Post: <**HIDDEN**@lists.ubuntu.com>
46
+ List-Help: <**HIDDEN**@lists.ubuntu.com?subject=help>
47
+ List-Subscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
48
+ <**HIDDEN**@lists.ubuntu.com?subject=subscribe>
49
+ Content-Type: text/plain; charset="us-ascii"
50
+ Content-Transfer-Encoding: 7bit
51
+ Sender: **HIDDEN**@lists.ubuntu.com
52
+ Errors-To: **HIDDEN**@lists.ubuntu.com
53
+
54
+
55
+
56
+ Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
57
+ > Hello,
58
+ >
59
+ > I think the notify-osd icons have a completely different style, which
60
+ > is looking great within the notification bubbles, but i doubt it'd
61
+ > look great to have the notify-osd wifi icons in the panel. I think the
62
+ > drawing of the notification- wifi icons should be done afterwards, and
63
+ > if they should be based on those of the icon set, they could be made
64
+ > smoother, and possibly desaturated for some of them, to avoid drawing
65
+ > too much attention from the user when popping up.
66
+ >
67
+ > Cordially, SD.
68
+
69
+ +1
70
+ ---
71
+ And Mac ... much better! Maybe 22 and 16 without pale. ;)
72
+
73
+ Oliver
74
+
75
+
76
+ --
77
+ ubuntu-art mailing list
78
+ **HIDDEN**@lists.ubuntu.com
79
+ https://lists.ubuntu.com/mailman/listinfo/ubuntu-art
@@ -0,0 +1,73 @@
1
+ From **HIDDEN**@manpoints.net Sun May 31 10:34:01 2009
2
+ Return-Path: <**HIDDEN**@manpoints.net>
3
+ X-Original-To: **HIDDEN**@spam.la
4
+ Delivered-To: **HIDDEN**@speedo.dreamhost.com
5
+ Received: from 201-40-49-243.bsace702.dsl.brasiltelecom.net.br (201-40-49-243.bsace702.dsl.brasiltelecom.net.br [201.40.49.243])
6
+ by speedo.dreamhost.com (Postfix) with ESMTP id 4BDC714572F
7
+ for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:33:56 -0700 (PDT)
8
+ Message-Id: <**HIDDEN**@201-40-49-243.bsace702.dsl.brasiltelecom.net.br>
9
+ From: "Leskovar L. Golda" <**HIDDEN**@manpoints.net>
10
+ To: **HIDDEN**@spam.la
11
+ Subject: My official mail blocked
12
+ Content-Type: text/html; charset="iso-8859-1"
13
+ Content-Transfer-Encoding: 7bit
14
+ MIME-Version: 1.0
15
+ Date: Sun, 31 May 2009 10:33:56 -0700 (PDT)
16
+
17
+
18
+
19
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
20
+ <html>
21
+ <head>
22
+ <title></title>
23
+ <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
24
+ </head>
25
+ <body bgcolor="#FFFFFF" topmargin="0" leftmargin="0" marginwidth="0" marginheight="0">
26
+
27
+
28
+ <table width="646" cellspacing="0" border="0" align="center" cellpadding="4">
29
+ <tr>
30
+ <td align="center"><font face="Arial" size="1" color="#000000">If you cannot see
31
+ the pictures and links below, please <a href="http://www.qicweman.cn/?abo=0C8C72B3E1C8648676158B">
32
+ click here</a> to view them.<br></font></td>
33
+ </tr>
34
+
35
+ </table>
36
+
37
+ <table width="742" cellspacing="0" border="0" align="center" cellpadding="0">
38
+
39
+ <tr valign="top">
40
+ <td width="475" style="border-left:1px solid #371E96;">
41
+ <a href="http://www.qicweman.cn/?ex=0C8C72B3E1C8648676158B">
42
+ <img alt="click to see the full version" src="http://www.qicweman.cn/d.jpg" style="border-width: 0px" /></a></td>
43
+
44
+ </tr>
45
+
46
+ <tr>
47
+ <td><br>
48
+
49
+ <div style="padding:10px;">
50
+
51
+ <span style="font-size:10px;color:#666666;font-family:arial;">You may also
52
+ respond to this email and subscribe to <i>Hizqru</i> by calling
53
+ 1-085-417-9085, Monday-Friday, 8 a.m.-6 p.m. ET. Outside the U.S. and in
54
+ Canada, please call 1-254-403-7409.<br><br>
55
+
56
+ To opt out from receiving any future marketing-related emails from
57
+ Dqpjnjp, please <a style="color:#666666;" href="http://www.qicweman.cn/?jr=0C8C72B3E1C8648676158B&email=**HIDDEN**@spam.la">
58
+ click here</a>.<br />
59
+ Please be assured that we respect the privacy of our subscribers. To view
60
+ our privacy policy, please <a style="color:#666666;" href="http://www.qicweman.cn/?sj=0C8C72B3E1C8648676158B">
61
+ click here</a>.<br><br>
62
+
63
+ © 2009 Jdqofypy, Inc., 14 Poze Iylqod, 08th Floor, New York, NY 74172.<br></span>
64
+
65
+ </div>
66
+ </td>
67
+ </tr>
68
+ </table>
69
+
70
+
71
+
72
+ </body>
73
+ </html>
@@ -0,0 +1,45 @@
1
+ require File.join(File.dirname(__FILE__), %w[.. .. test_helper])
2
+
3
+ Testy.testing 'String' do
4
+ context 'tokenize' do
5
+ test 'split words' do |t|
6
+ tokens = "hello world".tokenize
7
+ t.check 'words are split',
8
+ :expect => %w[hello world],
9
+ :actual => tokens
10
+ end
11
+
12
+ test 'downcase words' do |t|
13
+ tokens = "Hello World".tokenize
14
+ t.check 'words are downcased',
15
+ :expect => %w[hello world],
16
+ :actual => tokens
17
+ end
18
+
19
+ test 'most symbols are stripped' do |t|
20
+ tokens = "hyphen-ated, under_score!".tokenize
21
+ t.check 'some symbols are left',
22
+ :expect => %w[hyphen-ated under_score],
23
+ :actual => tokens
24
+ end
25
+
26
+ test 'html tags are sanitized' do |t|
27
+ tokens = '<a href="http://example.org">example</a>'.tokenize
28
+ t.check 'only content of tags is retained',
29
+ :expect => %w[example],
30
+ :actual => tokens
31
+ end
32
+
33
+ test 'some dots are ok' do |t|
34
+ tokens = 'example.org rocks. read it...'.tokenize
35
+ t.check 'infix dots are kept',
36
+ :expect => %w[example.org rocks read it],
37
+ :actual => tokens
38
+
39
+ tokens2 = '$1,000,000.00 or $1.000.000,00'.tokenize
40
+ t.check 'infix commas are kept',
41
+ :expect => %w[1,000,000.00 or 1.000.000,00],
42
+ :actual => tokens2
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,15 @@
1
+ require File.join(File.dirname(__FILE__), %w[.. test_helper])
2
+
3
+ Testy.testing 'Groupie::Group' do
4
+ test 'can be serialized and loaded through YAML' do |t|
5
+ require 'yaml'
6
+
7
+ g = Groupie::Group.new 'group'
8
+ g.add %w[buy flowers]
9
+ g2 = YAML.load(g.to_yaml)
10
+ g2.add %w[buy candy]
11
+ t.check 'default value works for new entries',
12
+ :expect => 1,
13
+ :actual => g2.count('candy')
14
+ end
15
+ end
@@ -0,0 +1,124 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ Testy.testing 'Groupie' do
4
+ test 'classification is certain' do |t|
5
+ g = Groupie.new
6
+ g[:spam].add %w[viagra]
7
+ g[:ham].add %w[flowers]
8
+ classification = g.classify 'viagra'
9
+ t.check 'viagra is',
10
+ :expect => {:spam => 1.0, :ham => 0.0},
11
+ :actual => classification
12
+ end
13
+
14
+ test 'classification is split between two groups' do |t|
15
+ g = Groupie.new
16
+ g[:spam].add %w[buy viagra now]
17
+ g[:ham].add %w[buy flowers for your mom]
18
+ classification = g.classify 'buy'
19
+ t.check 'buy is classified as',
20
+ :expect => {:spam => 0.5, :ham => 0.5},
21
+ :actual => classification
22
+ end
23
+
24
+ test 'classification is weighed more heavy in one group' do |t|
25
+ g = Groupie.new
26
+ g[:spam].add %w[buy viagra now]
27
+ g[:spam].add %w[buy cialis now]
28
+ g[:ham].add %w[buy flowers for your mom]
29
+ t.check 'buy is classified as',
30
+ :expect => {:spam => 2 / 3.0, :ham => 1 / 3.0},
31
+ :actual => g.classify('buy')
32
+ end
33
+
34
+ test 'classification works fine with more than two groups' do |t|
35
+ g = Groupie.new
36
+ g[:weight].add 'pound'
37
+ g[:currency].add 'pound'
38
+ g[:phone_key].add 'pound'
39
+ t.check 'pound is classified as',
40
+ :expect => {:weight => 1/3.0, :currency => 1/3.0, :phone_key => 1/3.0},
41
+ :actual => g.classify('pound')
42
+ end
43
+
44
+ test 'tokenized emails' do |t|
45
+ email = <<-EMAIL
46
+ I noticed your flirt
47
+ If you cannot see the pictures and links below, please click here to view them.
48
+ PHARMACY CLUB | UNSUBSCRIBE | YOUR PRIVACY RIGHTS
49
+ Copyright 2009 Zjfqq, all rights reserved
50
+ Customer Service Dept., 87 Hizq Iveox Street, Isahaylo, VS 25270
51
+ EMAIL
52
+ email2 = <<-EMAIL
53
+ Re: Your subscribe #976589
54
+ Tell a friend · Download latest version See this email as a webpage
55
+ Hello!
56
+ Shipped Privately And Discreetly To Your Door!
57
+ We want to put a great big grin on your face in 2009. You'll be to rejoice all year.
58
+ Unsubscribe · Lost Password · Account Settings · Help · Terms of Service · Privacy
59
+ Ottho Heldringstraat 2, 31719 AZ Amsterdam, The Netherlands
60
+ EMAIL
61
+ email3 = <<-EMAIL
62
+ Re: [ubuntu-art] [Breathe] Network Manager-icons
63
+ Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
64
+ > Hello,
65
+ >
66
+ > I think the notify-osd icons have a completely different style, which
67
+ > is looking great within the notification bubbles, but i doubt it'd
68
+ > look great to have the notify-osd wifi icons in the panel. I think the
69
+ > drawing of the notification- wifi icons should be done afterwards, and
70
+ > if they should be based on those of the icon set, they could be made
71
+ > smoother, and possibly desaturated for some of them, to avoid drawing
72
+ > too much attention from the user when popping up.
73
+ >
74
+ > Cordially, SD.
75
+ EMAIL
76
+ g = Groupie.new
77
+ g[:spam].add email.tokenize
78
+ g[:spam].add email2.tokenize
79
+ g[:ham].add email3.tokenize
80
+ c = g.classify('discreetly')
81
+ t.check 'classification of "discreetly" is spam',
82
+ :expect => true,
83
+ :actual => c[:spam] > c[:ham]
84
+ c2 = g.classify('user')
85
+ t.check 'classification of "user" is ham',
86
+ :expect => true,
87
+ :actual => c2[:ham] > c2[:spam]
88
+ end
89
+
90
+ test 'tokenized html emails' do |t|
91
+ g = Groupie.new
92
+ spam_tokens = File.read(File.join(File.dirname(__FILE__),
93
+ %w[fixtures spam spam.la-44118014.txt])).tokenize
94
+ ham_tokens = File.read(File.join(File.dirname(__FILE__),
95
+ %w[fixtures ham spam.la-44116217.txt])).tokenize
96
+ g[:spam].add spam_tokens
97
+ g[:ham].add ham_tokens
98
+
99
+ c = g.classify 'user'
100
+ t.check 'classification of the word "user" is ham',
101
+ :expect => true,
102
+ :actual => (c[:ham] > c[:spam])
103
+
104
+ c = g.classify_text(spam_tokens)
105
+ t.check 'classification of spam email is spam',
106
+ :expect => true,
107
+ :actual => (c[:spam] > c[:ham])
108
+ end
109
+
110
+ test 'classify a text' do |t|
111
+ g = Groupie.new
112
+ g[:spam].add %w[buy viagra now to grow fast]
113
+ g[:spam].add %w[buy cialis on our website]
114
+ g[:ham].add %w[buy flowers for your mom]
115
+ result = g.classify_text "Grow flowers to sell on our website".tokenize
116
+ t.check 'classification of a spammy text is spam',
117
+ :expect => true,
118
+ :actual => result[:spam] > result[:ham]
119
+ result2 = g.classify_text "Grow flowers to give to your mom".tokenize
120
+ t.check 'classification of a non-spammy text is ham',
121
+ :expect => true,
122
+ :actual => result2[:ham] > result2[:spam]
123
+ end
124
+ end
@@ -0,0 +1,3 @@
1
+ require 'rubygems'
2
+ require 'testy'
3
+ require 'lib/groupie'
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: groupie
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Wes Oldenbeuving
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-07-25 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: testy
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: Group and classify text based on likelyhood of being included in a text of a specific category
36
+ email: narnach@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ files:
44
+ - .document
45
+ - LICENSE
46
+ - Rakefile
47
+ - VERSION
48
+ - groupie.gemspec
49
+ - lib/groupie.rb
50
+ - lib/groupie/core_ext/string.rb
51
+ - lib/groupie/group.rb
52
+ - readme.rdoc
53
+ - test/fixtures/ham/spam.la-44116217.txt
54
+ - test/fixtures/spam/spam.la-44118014.txt
55
+ - test/groupie/core_ext/string_test.rb
56
+ - test/groupie/group_test.rb
57
+ - test/groupie_test.rb
58
+ - test/test_helper.rb
59
+ has_rdoc: true
60
+ homepage: http://github.com/Narnach/groupie
61
+ licenses: []
62
+
63
+ post_install_message:
64
+ rdoc_options:
65
+ - --charset=UTF-8
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ hash: 3
74
+ segments:
75
+ - 0
76
+ version: "0"
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.3.7
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: Group and classify text
93
+ test_files:
94
+ - test/groupie/core_ext/string_test.rb
95
+ - test/groupie/group_test.rb
96
+ - test/groupie_test.rb
97
+ - test/test_helper.rb