groupie 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Wes Oldenbeuving
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,53 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ # Gem packaging: describe the gem to Jeweler; when Jeweler cannot be loaded, print an install hint instead of failing.
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "groupie"
8
+ gem.summary = %Q{Group and classify text}
9
+ gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
10
+ gem.email = "narnach@gmail.com"
11
+ gem.homepage = "http://github.com/Narnach/groupie"
12
+ gem.authors = ["Wes Oldenbeuving"]
13
+ gem.add_development_dependency "testy", ">= 0"
14
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
+ end
16
+ Jeweler::GemcutterTasks.new
17
+ rescue LoadError
18
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
+ end
20
+ # :test runs every test/**/*_test.rb file with lib and test added to the load path.
21
+ require 'rake/testtask'
22
+ Rake::TestTask.new(:test) do |test|
23
+ test.libs << 'lib' << 'test'
24
+ test.pattern = 'test/**/*_test.rb'
25
+ test.verbose = true
26
+ end
27
+ # :rcov uses RCov when it can be loaded; otherwise a stub :rcov task aborts with install instructions.
28
+ begin
29
+ require 'rcov/rcovtask'
30
+ Rcov::RcovTask.new do |test|
31
+ test.libs << 'test'
32
+ test.pattern = 'test/**/*_test.rb'
33
+ test.verbose = true
34
+ end
35
+ rescue LoadError
36
+ task :rcov do
37
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
38
+ end
39
+ end
40
+ # NOTE(review): :check_dependencies is not defined in this file - presumably provided by Jeweler::Tasks; confirm.
41
+ task :test => :check_dependencies
42
+
43
+ task :default => :test
44
+ # RDoc output goes to rdoc/ and is titled with the contents of VERSION when that file exists.
45
+ require 'rake/rdoctask'
46
+ Rake::RDocTask.new do |rdoc|
47
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
+
49
+ rdoc.rdoc_dir = 'rdoc'
50
+ rdoc.title = "groupie #{version}"
51
+ rdoc.rdoc_files.include('readme*')
52
+ rdoc.rdoc_files.include('lib/**/*.rb')
53
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{groupie}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Wes Oldenbeuving"]
12
+ s.date = %q{2010-07-25}
13
+ s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
14
+ s.email = %q{narnach@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE"
17
+ ]
18
+ s.files = [
19
+ ".document",
20
+ "LICENSE",
21
+ "Rakefile",
22
+ "VERSION",
23
+ "groupie.gemspec",
24
+ "lib/groupie.rb",
25
+ "lib/groupie/core_ext/string.rb",
26
+ "lib/groupie/group.rb",
27
+ "readme.rdoc",
28
+ "test/fixtures/ham/spam.la-44116217.txt",
29
+ "test/fixtures/spam/spam.la-44118014.txt",
30
+ "test/groupie/core_ext/string_test.rb",
31
+ "test/groupie/group_test.rb",
32
+ "test/groupie_test.rb",
33
+ "test/test_helper.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/Narnach/groupie}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.7}
39
+ s.summary = %q{Group and classify text}
40
+ s.test_files = [
41
+ "test/groupie/core_ext/string_test.rb",
42
+ "test/groupie/group_test.rb",
43
+ "test/groupie_test.rb",
44
+ "test/test_helper.rb"
45
+ ]
46
+
47
+ if s.respond_to? :specification_version then
48
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
49
+ s.specification_version = 3
50
+
51
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52
+ s.add_development_dependency(%q<testy>, [">= 0"])
53
+ else
54
+ s.add_dependency(%q<testy>, [">= 0"])
55
+ end
56
+ else
57
+ s.add_dependency(%q<testy>, [">= 0"])
58
+ end
59
+ end
60
+
@@ -0,0 +1,51 @@
1
+ lib_dir = File.expand_path(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
3
+ require 'groupie/group'
4
+ require 'groupie/core_ext/string'
5
+
6
+ class Groupie # Holds named Group objects and scores words and texts against them.
7
+ def initialize
8
+ @groups = {}
9
+ end
10
+ # Return the Group stored under +group+, creating and storing a new one on first access.
11
+ def [](group)
12
+ @groups[group] ||= Group.new(group)
13
+ end
14
+ # Score +entry+ per group: its count in that group divided by its total count over all groups. Returns an empty Hash when no group has counted the entry.
15
+ def classify(entry)
16
+ results = {}
17
+ total_count = @groups.inject(0) do |sum, name_group|
18
+ group = name_group.last
19
+ sum + group.count(entry)
20
+ end
21
+ return results if 0 == total_count
22
+ # Every known group gets a result; groups that never counted the entry score 0.0.
23
+ @groups.each do |name, group|
24
+ count = group.count(entry)
25
+ results[name] = count > 0 ? count.to_f / total_count : 0.0
26
+ end
27
+ return results
28
+ end
29
+ # Sums each group's per-word score and divides by the number of words; words no group has counted add nothing to the sums but still count in the divisor.
30
+ # Classify a text by taking the average of all word classifications.
31
+ def classify_text(words)
32
+ group_score_sums = words.inject({}) do |results, word|
33
+ word_results = classify(word)
34
+ results.merge(word_results) do |key, old, new|
35
+ old + new
36
+ end
37
+ end
38
+
39
+ words_count = words.size.to_f
40
+ averages={}
41
+ group_score_sums.each do |group, sum|
42
+ averages[group] = sum / words_count
43
+ end
44
+
45
+ averages
46
+ end
47
+ # Read the VERSION file located one directory above this file and return its contents with surrounding whitespace stripped.
48
+ def self.version
49
+ File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
50
+ end
51
+ end
@@ -0,0 +1,17 @@
1
+ class Groupie
2
+ module CoreExt
3
+ module String
4
+ def tokenize
5
+ downcase.
6
+ gsub(/\s/," ").
7
+ gsub(/[$']/,'').
8
+ gsub(/<[^>]+?>|[^\w -.,]/,'').
9
+ split(" ").map {|str| str.gsub(/[,.]+\Z/,'')}
10
+ end
11
+ end
12
+ end
13
+ end
14
+
15
+ class String
16
+ include Groupie::CoreExt::String
17
+ end
@@ -0,0 +1,32 @@
1
+ class Groupie
2
+ class Group # Tally of how often each word has been added to one named group.
3
+ def initialize(name)
4
+ @name = name
5
+ @word_counts = {}
6
+ end
7
+ # Return every distinct word added so far.
8
+ def words
9
+ @word_counts.keys
10
+ end
11
+
12
+ # Add new words to the group. Accepts single words and (nested) Arrays of words; always returns nil.
13
+ def add(*words)
14
+ words.flatten.each do |word|
15
+ add_word(word)
16
+ end
17
+ nil
18
+ end
19
+
20
+ # Return the count for a specific +word+, or 0 when it was never added.
21
+ def count(word)
22
+ @word_counts[word] || 0
23
+ end
24
+
25
+ # Add a single word and count it. Private: callers go through #add.
26
+ def add_word(word)
27
+ @word_counts[word] ||= 0
28
+ @word_counts[word] += 1
29
+ end
30
+ private :add_word
31
+ end
32
+ end
@@ -0,0 +1,28 @@
1
+ = Groupie
2
+
3
+ Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of bayesian spam filters.
4
+
5
+ The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
6
+
7
+ == Goals
8
+
9
+ Groupie is a 'fun' project that has the following goals, in descending order of importance:
10
+ * Have fun playing with code
11
+ * Play with Bayesian-like (spam) filtering
12
+ * Check out the Testy BDD framework. It's pretty good for 60 lines of code!
13
+
14
+ == Current functionality
15
+
16
+ Current functionality includes:
17
+ * Tokenize an input text to prepare it for grouping.
18
+ * Strip XML and HTML tags.
19
+ * Keep certain infix characters, such as period and comma.
20
+ * Add texts (as an Array of Strings) to any number of groups.
21
+ * Classify a single word to check the likelihood it belongs to each group.
22
+ * Do classification for complete (tokenized) texts.
23
+
24
+ == License
25
+
26
+ As always, the code is licensed under the MIT license.
27
+
28
+ Wes Oldenbeuving
@@ -0,0 +1,79 @@
1
+ From **HIDDEN**@lists.ubuntu.com Sun May 31 10:22:46 2009
2
+ Return-Path: <**HIDDEN**@lists.ubuntu.com>
3
+ X-Original-To: **HIDDEN**@spam.la
4
+ Delivered-To: **HIDDEN**@speedo.dreamhost.com
5
+ Received: from chlorine.canonical.com (chlorine.canonical.com [91.189.94.204])
6
+ by speedo.dreamhost.com (Postfix) with ESMTP id 18A4F145730
7
+ for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:22:47 -0700 (PDT)
8
+ Received: from localhost ([127.0.0.1] helo=chlorine.canonical.com)
9
+ by chlorine.canonical.com with esmtp (Exim 4.60)
10
+ (envelope-from <**HIDDEN**@lists.ubuntu.com>)
11
+ id 1MAoKV-0000Uv-RB; Sun, 31 May 2009 17:56:15 +0100
12
+ Received: from smtp104.mail.ukl.yahoo.com ([77.238.184.36])
13
+ by chlorine.canonical.com with smtp (Exim 4.60)
14
+ (envelope-from <**HIDDEN**@yahoo.de>) id 1MAoKO-0000T5-0J
15
+ for **HIDDEN**@lists.ubuntu.com; Sun, 31 May 2009 17:56:08 +0100
16
+ Received: (qmail 95693 invoked from network); 31 May 2009 16:56:07 -0000
17
+ Received: from unknown (HELO ?192.168.1.33?) **HIDDEN**@88.5.92.30 with plain)
18
+ by smtp104.mail.ukl.yahoo.com with SMTP; 31 May 2009 16:56:07 -0000
19
+ X-Yahoo-SMTP: omQsrMiswBC_IZdIQhRgQAA3Gn6tTTc-
20
+ X-YMail-OSG: e1ihGm4VM1ljqtG.6IPGOps5aG8IYZJEPQLptGPSxphH174zk4rRTWYQJmj9MMc2nJwZjNEqUnYAjErWKypElvLWu0n.v8baMMlcOOELQK2IZfFaV5Ij3HUpUDWRbd0n6PCV5iFLHlyruq5CSGsiZvfME6HpngIO0RuAcin3rePXdzWpmPnTlZwuC3qjSE9N8wC4pdBdwfmYHy4EKSKFRCXUNzdy9DPgfwqrjiCTP_tqaWmpeUOqA2Os13l0j5d6acIpgo9DcW8P_1ENNVGjJ2Lk4XbZ0oc51M_BJ2n6DHMxoazT
21
+ X-Yahoo-Newman-Property: ymail-3
22
+ From: Oliver Scholtz 1 <**HIDDEN**@yahoo.de>
23
+ To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
24
+ In-Reply-To: <**HIDDEN**@mail.gmail.com>
25
+ References: <**HIDDEN**@yahoo.com>
26
+ <**HIDDEN**@jws141-laptop> <**HIDDEN**@yahoo.com>
27
+ <**HIDDEN**@isabel-desktop>
28
+ <**HIDDEN**@web95411.mail.in2.yahoo.com>
29
+ <**HIDDEN**@dani-desktop> <**HIDDEN**@yahoo.com>
30
+ <**HIDDEN**@mail.gmail.com>
31
+ <**HIDDEN**@mail.gmail.com>
32
+ Date: Sun, 31 May 2009 18:56:05 +0200
33
+ Message-Id: <**HIDDEN**@oliver-ubuntu>
34
+ Mime-Version: 1.0
35
+ X-Mailer: Evolution 2.26.1
36
+ Subject: Re: [ubuntu-art] [Breathe] Network Manager-icons
37
+ X-BeenThere: **HIDDEN**@lists.ubuntu.com
38
+ X-Mailman-Version: 2.1.8
39
+ Precedence: list
40
+ Reply-To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
41
+ List-Id: Discussion on Ubuntu artwork <ubuntu-art.lists.ubuntu.com>
42
+ List-Unsubscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
43
+ <**HIDDEN**@lists.ubuntu.com?subject=unsubscribe>
44
+ List-Archive: <https://lists.ubuntu.com/archives/ubuntu-art>
45
+ List-Post: <**HIDDEN**@lists.ubuntu.com>
46
+ List-Help: <**HIDDEN**@lists.ubuntu.com?subject=help>
47
+ List-Subscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
48
+ <**HIDDEN**@lists.ubuntu.com?subject=subscribe>
49
+ Content-Type: text/plain; charset="us-ascii"
50
+ Content-Transfer-Encoding: 7bit
51
+ Sender: **HIDDEN**@lists.ubuntu.com
52
+ Errors-To: **HIDDEN**@lists.ubuntu.com
53
+
54
+
55
+
56
+ Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
57
+ > Hello,
58
+ >
59
+ > I think the notify-osd icons have a completely different style, which
60
+ > is looking great within the notification bubbles, but i doubt it'd
61
+ > look great to have the notify-osd wifi icons in the panel. I think the
62
+ > drawing of the notification- wifi icons should be done afterwards, and
63
+ > if they should be based on those of the icon set, they could be made
64
+ > smoother, and possibly desaturated for some of them, to avoid drawing
65
+ > too much attention from the user when popping up.
66
+ >
67
+ > Cordially, SD.
68
+
69
+ +1
70
+ ---
71
+ And Mac ... much better! Maybe 22 and 16 without pale. ;)
72
+
73
+ Oliver
74
+
75
+
76
+ --
77
+ ubuntu-art mailing list
78
+ **HIDDEN**@lists.ubuntu.com
79
+ https://lists.ubuntu.com/mailman/listinfo/ubuntu-art
@@ -0,0 +1,73 @@
1
+ From **HIDDEN**@manpoints.net Sun May 31 10:34:01 2009
2
+ Return-Path: <**HIDDEN**@manpoints.net>
3
+ X-Original-To: **HIDDEN**@spam.la
4
+ Delivered-To: **HIDDEN**@speedo.dreamhost.com
5
+ Received: from 201-40-49-243.bsace702.dsl.brasiltelecom.net.br (201-40-49-243.bsace702.dsl.brasiltelecom.net.br [201.40.49.243])
6
+ by speedo.dreamhost.com (Postfix) with ESMTP id 4BDC714572F
7
+ for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:33:56 -0700 (PDT)
8
+ Message-Id: <**HIDDEN**@201-40-49-243.bsace702.dsl.brasiltelecom.net.br>
9
+ From: "Leskovar L. Golda" <**HIDDEN**@manpoints.net>
10
+ To: **HIDDEN**@spam.la
11
+ Subject: My official mail blocked
12
+ Content-Type: text/html; charset="iso-8859-1"
13
+ Content-Transfer-Encoding: 7bit
14
+ MIME-Version: 1.0
15
+ Date: Sun, 31 May 2009 10:33:56 -0700 (PDT)
16
+
17
+
18
+
19
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
20
+ <html>
21
+ <head>
22
+ <title></title>
23
+ <meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
24
+ </head>
25
+ <body bgcolor="#FFFFFF" topmargin="0" leftmargin="0" marginwidth="0" marginheight="0">
26
+
27
+
28
+ <table width="646" cellspacing="0" border="0" align="center" cellpadding="4">
29
+ <tr>
30
+ <td align="center"><font face="Arial" size="1" color="#000000">If you cannot see
31
+ the pictures and links below, please <a href="http://www.qicweman.cn/?abo=0C8C72B3E1C8648676158B">
32
+ click here</a> to view them.<br></font></td>
33
+ </tr>
34
+
35
+ </table>
36
+
37
+ <table width="742" cellspacing="0" border="0" align="center" cellpadding="0">
38
+
39
+ <tr valign="top">
40
+ <td width="475" style="border-left:1px solid #371E96;">
41
+ <a href="http://www.qicweman.cn/?ex=0C8C72B3E1C8648676158B">
42
+ <img alt="click to see the full version" src="http://www.qicweman.cn/d.jpg" style="border-width: 0px" /></a></td>
43
+
44
+ </tr>
45
+
46
+ <tr>
47
+ <td><br>
48
+
49
+ <div style="padding:10px;">
50
+
51
+ <span style="font-size:10px;color:#666666;font-family:arial;">You may also
52
+ respond to this email and subscribe to <i>Hizqru</i> by calling
53
+ 1-085-417-9085, Monday-Friday, 8 a.m.-6 p.m. ET. Outside the U.S. and in
54
+ Canada, please call 1-254-403-7409.<br><br>
55
+
56
+ To opt out from receiving any future marketing-related emails from
57
+ Dqpjnjp, please <a style="color:#666666;" href="http://www.qicweman.cn/?jr=0C8C72B3E1C8648676158B&email=**HIDDEN**@spam.la">
58
+ click here</a>.<br />
59
+ Please be assured that we respect the privacy of our subscribers. To view
60
+ our privacy policy, please <a style="color:#666666;" href="http://www.qicweman.cn/?sj=0C8C72B3E1C8648676158B">
61
+ click here</a>.<br><br>
62
+
63
+ © 2009 Jdqofypy, Inc., 14 Poze Iylqod, 08th Floor, New York, NY 74172.<br></span>
64
+
65
+ </div>
66
+ </td>
67
+ </tr>
68
+ </table>
69
+
70
+
71
+
72
+ </body>
73
+ </html>
@@ -0,0 +1,45 @@
1
+ require File.join(File.dirname(__FILE__), %w[.. .. test_helper])
2
+
3
+ Testy.testing 'String' do
4
+ context 'tokenize' do
5
+ test 'split words' do |t|
6
+ tokens = "hello world".tokenize
7
+ t.check 'words are split',
8
+ :expect => %w[hello world],
9
+ :actual => tokens
10
+ end
11
+
12
+ test 'downcase words' do |t|
13
+ tokens = "Hello World".tokenize
14
+ t.check 'words are downcased',
15
+ :expect => %w[hello world],
16
+ :actual => tokens
17
+ end
18
+
19
+ test 'most symbols are stripped' do |t|
20
+ tokens = "hyphen-ated, under_score!".tokenize
21
+ t.check 'some symbols are left',
22
+ :expect => %w[hyphen-ated under_score],
23
+ :actual => tokens
24
+ end
25
+
26
+ test 'html tags are sanitized' do |t|
27
+ tokens = '<a href="http://example.org">example</a>'.tokenize
28
+ t.check 'only content of tags is retained',
29
+ :expect => %w[example],
30
+ :actual => tokens
31
+ end
32
+
33
+ test 'some dots are ok' do |t|
34
+ tokens = 'example.org rocks. read it...'.tokenize
35
+ t.check 'infix dots are kept',
36
+ :expect => %w[example.org rocks read it],
37
+ :actual => tokens
38
+
39
+ tokens2 = '$1,000,000.00 or $1.000.000,00'.tokenize
40
+ t.check 'infix commas are kept',
41
+ :expect => %w[1,000,000.00 or 1.000.000,00],
42
+ :actual => tokens2
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,15 @@
1
+ require File.join(File.dirname(__FILE__), %w[.. test_helper])
2
+
3
+ Testy.testing 'Groupie::Group' do
4
+ test 'can be serialized and loaded through YAML' do |t|
5
+ require 'yaml'
6
+
7
+ g = Groupie::Group.new 'group'
8
+ g.add %w[buy flowers]
9
+ g2 = YAML.load(g.to_yaml)
10
+ g2.add %w[buy candy]
11
+ t.check 'default value works for new entries',
12
+ :expect => 1,
13
+ :actual => g2.count('candy')
14
+ end
15
+ end
@@ -0,0 +1,124 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ Testy.testing 'Groupie' do
4
+ test 'classification is certain' do |t|
5
+ g = Groupie.new
6
+ g[:spam].add %w[viagra]
7
+ g[:ham].add %w[flowers]
8
+ classification = g.classify 'viagra'
9
+ t.check 'viagra is',
10
+ :expect => {:spam => 1.0, :ham => 0.0},
11
+ :actual => classification
12
+ end
13
+
14
+ test 'classification is split between two groups' do |t|
15
+ g = Groupie.new
16
+ g[:spam].add %w[buy viagra now]
17
+ g[:ham].add %w[buy flowers for your mom]
18
+ classification = g.classify 'buy'
19
+ t.check 'buy is classified as',
20
+ :expect => {:spam => 0.5, :ham => 0.5},
21
+ :actual => classification
22
+ end
23
+
24
+ test 'classification is weighed more heavy in one group' do |t|
25
+ g = Groupie.new
26
+ g[:spam].add %w[buy viagra now]
27
+ g[:spam].add %w[buy cialis now]
28
+ g[:ham].add %w[buy flowers for your mom]
29
+ t.check 'buy is classified as',
30
+ :expect => {:spam => 2 / 3.0, :ham => 1 / 3.0},
31
+ :actual => g.classify('buy')
32
+ end
33
+
34
+ test 'classification works fine with more than two groups' do |t|
35
+ g = Groupie.new
36
+ g[:weight].add 'pound'
37
+ g[:currency].add 'pound'
38
+ g[:phone_key].add 'pound'
39
+ t.check 'pound is classified as',
40
+ :expect => {:weight => 1/3.0, :currency => 1/3.0, :phone_key => 1/3.0},
41
+ :actual => g.classify('pound')
42
+ end
43
+
44
+ test 'tokenized emails' do |t|
45
+ email = <<-EMAIL
46
+ I noticed your flirt
47
+ If you cannot see the pictures and links below, please click here to view them.
48
+ PHARMACY CLUB | UNSUBSCRIBE | YOUR PRIVACY RIGHTS
49
+ Copyright 2009 Zjfqq, all rights reserved
50
+ Customer Service Dept., 87 Hizq Iveox Street, Isahaylo, VS 25270
51
+ EMAIL
52
+ email2 = <<-EMAIL
53
+ Re: Your subscribe #976589
54
+ Tell a friend · Download latest version See this email as a webpage
55
+ Hello!
56
+ Shipped Privately And Discreetly To Your Door!
57
+ We want to put a great big grin on your face in 2009. You'll be to rejoice all year.
58
+ Unsubscribe · Lost Password · Account Settings · Help · Terms of Service · Privacy
59
+ Ottho Heldringstraat 2, 31719 AZ Amsterdam, The Netherlands
60
+ EMAIL
61
+ email3 = <<-EMAIL
62
+ Re: [ubuntu-art] [Breathe] Network Manager-icons
63
+ Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
64
+ > Hello,
65
+ >
66
+ > I think the notify-osd icons have a completely different style, which
67
+ > is looking great within the notification bubbles, but i doubt it'd
68
+ > look great to have the notify-osd wifi icons in the panel. I think the
69
+ > drawing of the notification- wifi icons should be done afterwards, and
70
+ > if they should be based on those of the icon set, they could be made
71
+ > smoother, and possibly desaturated for some of them, to avoid drawing
72
+ > too much attention from the user when popping up.
73
+ >
74
+ > Cordially, SD.
75
+ EMAIL
76
+ g = Groupie.new
77
+ g[:spam].add email.tokenize
78
+ g[:spam].add email2.tokenize
79
+ g[:ham].add email3.tokenize
80
+ c = g.classify('discreetly')
81
+ t.check 'classification of "discreetly" is spam',
82
+ :expect => true,
83
+ :actual => c[:spam] > c[:ham]
84
+ c2 = g.classify('user')
85
+ t.check 'classification of "user" is ham',
86
+ :expect => true,
87
+ :actual => c2[:ham] > c2[:spam]
88
+ end
89
+
90
+ test 'tokenized html emails' do |t|
91
+ g = Groupie.new
92
+ spam_tokens = File.read(File.join(File.dirname(__FILE__),
93
+ %w[fixtures spam spam.la-44118014.txt])).tokenize
94
+ ham_tokens = File.read(File.join(File.dirname(__FILE__),
95
+ %w[fixtures ham spam.la-44116217.txt])).tokenize
96
+ g[:spam].add spam_tokens
97
+ g[:ham].add ham_tokens
98
+
99
+ c = g.classify 'user'
100
+ t.check 'classification of the word "user" is ham',
101
+ :expect => true,
102
+ :actual => (c[:ham] > c[:spam])
103
+
104
+ c = g.classify_text(spam_tokens)
105
+ t.check 'classification of spam email is spam',
106
+ :expect => true,
107
+ :actual => (c[:spam] > c[:ham])
108
+ end
109
+
110
+ test 'classify a text' do |t|
111
+ g = Groupie.new
112
+ g[:spam].add %w[buy viagra now to grow fast]
113
+ g[:spam].add %w[buy cialis on our website]
114
+ g[:ham].add %w[buy flowers for your mom]
115
+ result = g.classify_text "Grow flowers to sell on our website".tokenize
116
+ t.check 'classification of a spammy text is spam',
117
+ :expect => true,
118
+ :actual => result[:spam] > result[:ham]
119
+ result2 = g.classify_text "Grow flowers to give to your mom".tokenize
120
+ t.check 'classification of a non-spammy text is ham',
121
+ :expect => true,
122
+ :actual => result2[:ham] > result2[:spam]
123
+ end
124
+ end
@@ -0,0 +1,3 @@
1
+ require 'rubygems'
2
+ require 'testy'
3
+ require 'lib/groupie'
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: groupie
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Wes Oldenbeuving
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-07-25 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: testy
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: Group and classify text based on likelyhood of being included in a text of a specific category
36
+ email: narnach@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ files:
44
+ - .document
45
+ - LICENSE
46
+ - Rakefile
47
+ - VERSION
48
+ - groupie.gemspec
49
+ - lib/groupie.rb
50
+ - lib/groupie/core_ext/string.rb
51
+ - lib/groupie/group.rb
52
+ - readme.rdoc
53
+ - test/fixtures/ham/spam.la-44116217.txt
54
+ - test/fixtures/spam/spam.la-44118014.txt
55
+ - test/groupie/core_ext/string_test.rb
56
+ - test/groupie/group_test.rb
57
+ - test/groupie_test.rb
58
+ - test/test_helper.rb
59
+ has_rdoc: true
60
+ homepage: http://github.com/Narnach/groupie
61
+ licenses: []
62
+
63
+ post_install_message:
64
+ rdoc_options:
65
+ - --charset=UTF-8
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ hash: 3
74
+ segments:
75
+ - 0
76
+ version: "0"
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.3.7
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: Group and classify text
93
+ test_files:
94
+ - test/groupie/core_ext/string_test.rb
95
+ - test/groupie/group_test.rb
96
+ - test/groupie_test.rb
97
+ - test/test_helper.rb