groupie 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/LICENSE +20 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/groupie.gemspec +60 -0
- data/lib/groupie.rb +51 -0
- data/lib/groupie/core_ext/string.rb +17 -0
- data/lib/groupie/group.rb +32 -0
- data/readme.rdoc +28 -0
- data/test/fixtures/ham/spam.la-44116217.txt +79 -0
- data/test/fixtures/spam/spam.la-44118014.txt +73 -0
- data/test/groupie/core_ext/string_test.rb +45 -0
- data/test/groupie/group_test.rb +15 -0
- data/test/groupie_test.rb +124 -0
- data/test/test_helper.rb +3 -0
- metadata +97 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Wes Oldenbeuving
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "groupie"
|
8
|
+
gem.summary = %Q{Group and classify text}
|
9
|
+
gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
|
10
|
+
gem.email = "narnach@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/Narnach/groupie"
|
12
|
+
gem.authors = ["Wes Oldenbeuving"]
|
13
|
+
gem.add_development_dependency "testy", ">= 0"
|
14
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
+
end
|
16
|
+
Jeweler::GemcutterTasks.new
|
17
|
+
rescue LoadError
|
18
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
19
|
+
end
|
20
|
+
|
21
|
+
require 'rake/testtask'
|
22
|
+
Rake::TestTask.new(:test) do |test|
|
23
|
+
test.libs << 'lib' << 'test'
|
24
|
+
test.pattern = 'test/**/*_test.rb'
|
25
|
+
test.verbose = true
|
26
|
+
end
|
27
|
+
|
28
|
+
begin
|
29
|
+
require 'rcov/rcovtask'
|
30
|
+
Rcov::RcovTask.new do |test|
|
31
|
+
test.libs << 'test'
|
32
|
+
test.pattern = 'test/**/*_test.rb'
|
33
|
+
test.verbose = true
|
34
|
+
end
|
35
|
+
rescue LoadError
|
36
|
+
task :rcov do
|
37
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
task :test => :check_dependencies
|
42
|
+
|
43
|
+
task :default => :test
|
44
|
+
|
45
|
+
require 'rake/rdoctask'
|
46
|
+
Rake::RDocTask.new do |rdoc|
|
47
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
+
|
49
|
+
rdoc.rdoc_dir = 'rdoc'
|
50
|
+
rdoc.title = "groupie #{version}"
|
51
|
+
rdoc.rdoc_files.include('readme*')
|
52
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/groupie.gemspec
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{groupie}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Wes Oldenbeuving"]
|
12
|
+
s.date = %q{2010-07-25}
|
13
|
+
s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
|
14
|
+
s.email = %q{narnach@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE"
|
17
|
+
]
|
18
|
+
s.files = [
|
19
|
+
".document",
|
20
|
+
"LICENSE",
|
21
|
+
"Rakefile",
|
22
|
+
"VERSION",
|
23
|
+
"groupie.gemspec",
|
24
|
+
"lib/groupie.rb",
|
25
|
+
"lib/groupie/core_ext/string.rb",
|
26
|
+
"lib/groupie/group.rb",
|
27
|
+
"readme.rdoc",
|
28
|
+
"test/fixtures/ham/spam.la-44116217.txt",
|
29
|
+
"test/fixtures/spam/spam.la-44118014.txt",
|
30
|
+
"test/groupie/core_ext/string_test.rb",
|
31
|
+
"test/groupie/group_test.rb",
|
32
|
+
"test/groupie_test.rb",
|
33
|
+
"test/test_helper.rb"
|
34
|
+
]
|
35
|
+
s.homepage = %q{http://github.com/Narnach/groupie}
|
36
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
s.rubygems_version = %q{1.3.7}
|
39
|
+
s.summary = %q{Group and classify text}
|
40
|
+
s.test_files = [
|
41
|
+
"test/groupie/core_ext/string_test.rb",
|
42
|
+
"test/groupie/group_test.rb",
|
43
|
+
"test/groupie_test.rb",
|
44
|
+
"test/test_helper.rb"
|
45
|
+
]
|
46
|
+
|
47
|
+
if s.respond_to? :specification_version then
|
48
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_development_dependency(%q<testy>, [">= 0"])
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<testy>, [">= 0"])
|
55
|
+
end
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<testy>, [">= 0"])
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
data/lib/groupie.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
lib_dir = File.expand_path(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
|
3
|
+
require 'groupie/group'
|
4
|
+
require 'groupie/core_ext/string'
|
5
|
+
|
6
|
+
# Groups words into named buckets and scores how strongly a word, or a
# list of words, is associated with each bucket.
class Groupie
  def initialize
    # Maps a group name to its Group instance.
    @groups = {}
  end

  # Return the Group stored under +group+, creating and remembering a new
  # one the first time that name is requested.
  def [](group)
    @groups[group] ||= Group.new(group)
  end

  # Score a single +entry+ against every known group.
  #
  # Each group's score is its count for +entry+ divided by the summed
  # count over all groups. Returns a Hash of group name => Float, or an
  # empty Hash when no group has seen +entry+ (or no groups exist).
  def classify(entry)
    counts = @groups.map { |name, group| [name, group.count(entry)] }
    total = counts.inject(0) { |sum, (_name, count)| sum + count }
    return {} if total.zero?

    counts.each_with_object({}) do |(name, count), scores|
      scores[name] =
        if count > 0
          count.to_f / total
        else
          0.0
        end
    end
  end

  # Classify a text by taking the average of all word classifications.
  #
  # +words+ is a list of tokens. The per-group scores of every word are
  # summed and then divided by the total number of words, so words that no
  # group knows still count towards the divisor.
  # Returns a Hash of group name => Float.
  def classify_text(words)
    score_sums = {}
    words.each do |word|
      classify(word).each do |group, score|
        score_sums[group] = score_sums.key?(group) ? score_sums[group] + score : score
      end
    end

    word_total = words.size.to_f
    score_sums.each_with_object({}) do |(group, sum), averages|
      averages[group] = sum / word_total
    end
  end

  # Read the version string from the VERSION file located one directory
  # above this file, with surrounding whitespace removed.
  def self.version
    lib_dir = File.dirname(File.expand_path(__FILE__))
    version_file = File.join(lib_dir, "..", "VERSION")
    File.read(version_file).strip
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class Groupie
  module CoreExt
    # String extension that prepares raw text for grouping.
    module String
      # Split the string into lowercase word tokens.
      #
      # Steps, in order:
      # 1. downcase the text
      # 2. turn every whitespace character into a plain space
      # 3. drop "$" and "'" characters
      # 4. drop <...> markup tags and every character that is not a word
      #    character, space, hyphen, period or comma
      # 5. split on spaces and trim trailing periods/commas from each token
      #
      # Infix hyphens, periods and commas therefore survive
      # ("example.org", "1,000,000.00", "hyphen-ated").
      #
      # The hyphen inside the character class is escaped on purpose: written
      # as " -." it forms a range from space to period, which also lets
      # characters such as ! " # % & ( ) * + slip through unstripped.
      #
      # Returns an Array of Strings.
      def tokenize
        downcase.
          gsub(/\s/, " ").
          gsub(/[$']/, '').
          gsub(/<[^>]+?>|[^\w \-.,]/, '').
          split(" ").map { |str| str.gsub(/[,.]+\Z/, '') }
      end
    end
  end
end

# Make #tokenize available on every String.
class String
  include Groupie::CoreExt::String
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
class Groupie
  # A named collection of words that tracks how many times each word has
  # been added.
  class Group
    # +name+ is stored on the instance; nothing in this class reads it back.
    def initialize(name)
      @name = name
      # word => number of times it was added. A plain Hash without a default
      # value; missing words are handled in #count instead.
      @word_counts = {}
    end

    # All distinct words added so far.
    def words
      @word_counts.keys
    end

    # Add new words to the group. Accepts single words, arrays of words, or
    # any mix of the two (nested arrays are flattened). Always returns nil.
    def add(*words)
      words.flatten.each { |word| add_word(word) }
      nil
    end

    # Return the count for a specific +word+; 0 when it was never added.
    def count(word)
      @word_counts.fetch(word, 0)
    end

    private

    # Add a single word and count it. Returns the word's updated count.
    def add_word(word)
      @word_counts[word] = count(word) + 1
    end
  end
end
|
data/readme.rdoc
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= Groupie
|
2
|
+
|
3
|
+
Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of Bayesian spam filters.
|
4
|
+
|
5
|
+
The eventual goal is to have Groupie work as a sort of Bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
6
|
+
|
7
|
+
== Goals
|
8
|
+
|
9
|
+
Groupie is a 'fun' project that has the following goals, in descending order of importance:
|
10
|
+
* Have fun playing with code
|
11
|
+
* Play with Bayesian-like (spam) filtering
|
12
|
+
* Check out the Testy BDD framework. It's pretty good for 60 lines of code!
|
13
|
+
|
14
|
+
== Current functionality
|
15
|
+
|
16
|
+
Current functionality includes:
|
17
|
+
* Tokenize an input text to prepare it for grouping.
|
18
|
+
* Strip XML and HTML tags.
|
19
|
+
* Keep certain infix characters, such as period and comma.
|
20
|
+
* Add texts (as an Array of Strings) to any number of groups.
|
21
|
+
* Classify a single word to check the likelihood it belongs to each group.
|
22
|
+
* Do classification for complete (tokenized) texts.
|
23
|
+
|
24
|
+
== License
|
25
|
+
|
26
|
+
As always, the code is licensed under the MIT license.
|
27
|
+
|
28
|
+
Wes Oldenbeuving
|
@@ -0,0 +1,79 @@
|
|
1
|
+
From **HIDDEN**@lists.ubuntu.com Sun May 31 10:22:46 2009
|
2
|
+
Return-Path: <**HIDDEN**@lists.ubuntu.com>
|
3
|
+
X-Original-To: **HIDDEN**@spam.la
|
4
|
+
Delivered-To: **HIDDEN**@speedo.dreamhost.com
|
5
|
+
Received: from chlorine.canonical.com (chlorine.canonical.com [91.189.94.204])
|
6
|
+
by speedo.dreamhost.com (Postfix) with ESMTP id 18A4F145730
|
7
|
+
for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:22:47 -0700 (PDT)
|
8
|
+
Received: from localhost ([127.0.0.1] helo=chlorine.canonical.com)
|
9
|
+
by chlorine.canonical.com with esmtp (Exim 4.60)
|
10
|
+
(envelope-from <**HIDDEN**@lists.ubuntu.com>)
|
11
|
+
id 1MAoKV-0000Uv-RB; Sun, 31 May 2009 17:56:15 +0100
|
12
|
+
Received: from smtp104.mail.ukl.yahoo.com ([77.238.184.36])
|
13
|
+
by chlorine.canonical.com with smtp (Exim 4.60)
|
14
|
+
(envelope-from <**HIDDEN**@yahoo.de>) id 1MAoKO-0000T5-0J
|
15
|
+
for **HIDDEN**@lists.ubuntu.com; Sun, 31 May 2009 17:56:08 +0100
|
16
|
+
Received: (qmail 95693 invoked from network); 31 May 2009 16:56:07 -0000
|
17
|
+
Received: from unknown (HELO ?192.168.1.33?) **HIDDEN**@88.5.92.30 with plain)
|
18
|
+
by smtp104.mail.ukl.yahoo.com with SMTP; 31 May 2009 16:56:07 -0000
|
19
|
+
X-Yahoo-SMTP: omQsrMiswBC_IZdIQhRgQAA3Gn6tTTc-
|
20
|
+
X-YMail-OSG: e1ihGm4VM1ljqtG.6IPGOps5aG8IYZJEPQLptGPSxphH174zk4rRTWYQJmj9MMc2nJwZjNEqUnYAjErWKypElvLWu0n.v8baMMlcOOELQK2IZfFaV5Ij3HUpUDWRbd0n6PCV5iFLHlyruq5CSGsiZvfME6HpngIO0RuAcin3rePXdzWpmPnTlZwuC3qjSE9N8wC4pdBdwfmYHy4EKSKFRCXUNzdy9DPgfwqrjiCTP_tqaWmpeUOqA2Os13l0j5d6acIpgo9DcW8P_1ENNVGjJ2Lk4XbZ0oc51M_BJ2n6DHMxoazT
|
21
|
+
X-Yahoo-Newman-Property: ymail-3
|
22
|
+
From: Oliver Scholtz 1 <**HIDDEN**@yahoo.de>
|
23
|
+
To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
|
24
|
+
In-Reply-To: <**HIDDEN**@mail.gmail.com>
|
25
|
+
References: <**HIDDEN**@yahoo.com>
|
26
|
+
<**HIDDEN**@jws141-laptop> <**HIDDEN**@yahoo.com>
|
27
|
+
<**HIDDEN**@isabel-desktop>
|
28
|
+
<**HIDDEN**@web95411.mail.in2.yahoo.com>
|
29
|
+
<**HIDDEN**@dani-desktop> <**HIDDEN**@yahoo.com>
|
30
|
+
<**HIDDEN**@mail.gmail.com>
|
31
|
+
<**HIDDEN**@mail.gmail.com>
|
32
|
+
Date: Sun, 31 May 2009 18:56:05 +0200
|
33
|
+
Message-Id: <**HIDDEN**@oliver-ubuntu>
|
34
|
+
Mime-Version: 1.0
|
35
|
+
X-Mailer: Evolution 2.26.1
|
36
|
+
Subject: Re: [ubuntu-art] [Breathe] Network Manager-icons
|
37
|
+
X-BeenThere: **HIDDEN**@lists.ubuntu.com
|
38
|
+
X-Mailman-Version: 2.1.8
|
39
|
+
Precedence: list
|
40
|
+
Reply-To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
|
41
|
+
List-Id: Discussion on Ubuntu artwork <ubuntu-art.lists.ubuntu.com>
|
42
|
+
List-Unsubscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
|
43
|
+
<**HIDDEN**@lists.ubuntu.com?subject=unsubscribe>
|
44
|
+
List-Archive: <https://lists.ubuntu.com/archives/ubuntu-art>
|
45
|
+
List-Post: <**HIDDEN**@lists.ubuntu.com>
|
46
|
+
List-Help: <**HIDDEN**@lists.ubuntu.com?subject=help>
|
47
|
+
List-Subscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
|
48
|
+
<**HIDDEN**@lists.ubuntu.com?subject=subscribe>
|
49
|
+
Content-Type: text/plain; charset="us-ascii"
|
50
|
+
Content-Transfer-Encoding: 7bit
|
51
|
+
Sender: **HIDDEN**@lists.ubuntu.com
|
52
|
+
Errors-To: **HIDDEN**@lists.ubuntu.com
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
57
|
+
> Hello,
|
58
|
+
>
|
59
|
+
> I think the notify-osd icons have a completely different style, which
|
60
|
+
> is looking great within the notification bubbles, but i doubt it'd
|
61
|
+
> look great to have the notify-osd wifi icons in the panel. I think the
|
62
|
+
> drawing of the notification- wifi icons should be done afterwards, and
|
63
|
+
> if they should be based on those of the icon set, they could be made
|
64
|
+
> smoother, and possibly desaturated for some of them, to avoid drawing
|
65
|
+
> too much attention from the user when popping up.
|
66
|
+
>
|
67
|
+
> Cordially, SD.
|
68
|
+
|
69
|
+
+1
|
70
|
+
---
|
71
|
+
And Mac ... much better! Maybe 22 and 16 without pale. ;)
|
72
|
+
|
73
|
+
Oliver
|
74
|
+
|
75
|
+
|
76
|
+
--
|
77
|
+
ubuntu-art mailing list
|
78
|
+
**HIDDEN**@lists.ubuntu.com
|
79
|
+
https://lists.ubuntu.com/mailman/listinfo/ubuntu-art
|
@@ -0,0 +1,73 @@
|
|
1
|
+
From **HIDDEN**@manpoints.net Sun May 31 10:34:01 2009
|
2
|
+
Return-Path: <**HIDDEN**@manpoints.net>
|
3
|
+
X-Original-To: **HIDDEN**@spam.la
|
4
|
+
Delivered-To: **HIDDEN**@speedo.dreamhost.com
|
5
|
+
Received: from 201-40-49-243.bsace702.dsl.brasiltelecom.net.br (201-40-49-243.bsace702.dsl.brasiltelecom.net.br [201.40.49.243])
|
6
|
+
by speedo.dreamhost.com (Postfix) with ESMTP id 4BDC714572F
|
7
|
+
for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:33:56 -0700 (PDT)
|
8
|
+
Message-Id: <**HIDDEN**@201-40-49-243.bsace702.dsl.brasiltelecom.net.br>
|
9
|
+
From: "Leskovar L. Golda" <**HIDDEN**@manpoints.net>
|
10
|
+
To: **HIDDEN**@spam.la
|
11
|
+
Subject: My official mail blocked
|
12
|
+
Content-Type: text/html; charset="iso-8859-1"
|
13
|
+
Content-Transfer-Encoding: 7bit
|
14
|
+
MIME-Version: 1.0
|
15
|
+
Date: Sun, 31 May 2009 10:33:56 -0700 (PDT)
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
20
|
+
<html>
|
21
|
+
<head>
|
22
|
+
<title></title>
|
23
|
+
<meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
|
24
|
+
</head>
|
25
|
+
<body bgcolor="#FFFFFF" topmargin="0" leftmargin="0" marginwidth="0" marginheight="0">
|
26
|
+
|
27
|
+
|
28
|
+
<table width="646" cellspacing="0" border="0" align="center" cellpadding="4">
|
29
|
+
<tr>
|
30
|
+
<td align="center"><font face="Arial" size="1" color="#000000">If you cannot see
|
31
|
+
the pictures and links below, please <a href="http://www.qicweman.cn/?abo=0C8C72B3E1C8648676158B">
|
32
|
+
click here</a> to view them.<br></font></td>
|
33
|
+
</tr>
|
34
|
+
|
35
|
+
</table>
|
36
|
+
|
37
|
+
<table width="742" cellspacing="0" border="0" align="center" cellpadding="0">
|
38
|
+
|
39
|
+
<tr valign="top">
|
40
|
+
<td width="475" style="border-left:1px solid #371E96;">
|
41
|
+
<a href="http://www.qicweman.cn/?ex=0C8C72B3E1C8648676158B">
|
42
|
+
<img alt="click to see the full version" src="http://www.qicweman.cn/d.jpg" style="border-width: 0px" /></a></td>
|
43
|
+
|
44
|
+
</tr>
|
45
|
+
|
46
|
+
<tr>
|
47
|
+
<td><br>
|
48
|
+
|
49
|
+
<div style="padding:10px;">
|
50
|
+
|
51
|
+
<span style="font-size:10px;color:#666666;font-family:arial;">You may also
|
52
|
+
respond to this email and subscribe to <i>Hizqru</i> by calling
|
53
|
+
1-085-417-9085, Monday-Friday, 8 a.m.-6 p.m. ET. Outside the U.S. and in
|
54
|
+
Canada, please call 1-254-403-7409.<br><br>
|
55
|
+
|
56
|
+
To opt out from receiving any future marketing-related emails from
|
57
|
+
Dqpjnjp, please <a style="color:#666666;" href="http://www.qicweman.cn/?jr=0C8C72B3E1C8648676158B&email=**HIDDEN**@spam.la">
|
58
|
+
click here</a>.<br />
|
59
|
+
Please be assured that we respect the privacy of our subscribers. To view
|
60
|
+
our privacy policy, please <a style="color:#666666;" href="http://www.qicweman.cn/?sj=0C8C72B3E1C8648676158B">
|
61
|
+
click here</a>.<br><br>
|
62
|
+
|
63
|
+
© 2009 Jdqofypy, Inc., 14 Poze Iylqod, 08th Floor, New York, NY 74172.<br></span>
|
64
|
+
|
65
|
+
</div>
|
66
|
+
</td>
|
67
|
+
</tr>
|
68
|
+
</table>
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
</body>
|
73
|
+
</html>
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), %w[.. .. test_helper])
|
2
|
+
|
3
|
+
Testy.testing 'String' do
|
4
|
+
context 'tokenize' do
|
5
|
+
test 'split words' do |t|
|
6
|
+
tokens = "hello world".tokenize
|
7
|
+
t.check 'words are split',
|
8
|
+
:expect => %w[hello world],
|
9
|
+
:actual => tokens
|
10
|
+
end
|
11
|
+
|
12
|
+
test 'downcase words' do |t|
|
13
|
+
tokens = "Hello World".tokenize
|
14
|
+
t.check 'words are downcased',
|
15
|
+
:expect => %w[hello world],
|
16
|
+
:actual => tokens
|
17
|
+
end
|
18
|
+
|
19
|
+
test 'most symbols are stripped' do |t|
|
20
|
+
tokens = "hyphen-ated, under_score!".tokenize
|
21
|
+
t.check 'some symbols are left',
|
22
|
+
:expect => %w[hyphen-ated under_score],
|
23
|
+
:actual => tokens
|
24
|
+
end
|
25
|
+
|
26
|
+
test 'html tags are sanitized' do |t|
|
27
|
+
tokens = '<a href="http://example.org">example</a>'.tokenize
|
28
|
+
t.check 'only content of tags is retained',
|
29
|
+
:expect => %w[example],
|
30
|
+
:actual => tokens
|
31
|
+
end
|
32
|
+
|
33
|
+
test 'some dots are ok' do |t|
|
34
|
+
tokens = 'example.org rocks. read it...'.tokenize
|
35
|
+
t.check 'infix dots are kept',
|
36
|
+
:expect => %w[example.org rocks read it],
|
37
|
+
:actual => tokens
|
38
|
+
|
39
|
+
tokens2 = '$1,000,000.00 or $1.000.000,00'.tokenize
|
40
|
+
t.check 'infix commas are kept',
|
41
|
+
:expect => %w[1,000,000.00 or 1.000.000,00],
|
42
|
+
:actual => tokens2
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), %w[.. test_helper])
|
2
|
+
|
3
|
+
Testy.testing 'Groupie::Group' do
|
4
|
+
test 'can be serialized and loaded through YAML' do |t|
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
g = Groupie::Group.new 'group'
|
8
|
+
g.add %w[buy flowers]
|
9
|
+
g2 = YAML.load(g.to_yaml)
|
10
|
+
g2.add %w[buy candy]
|
11
|
+
t.check 'default value works for new entries',
|
12
|
+
:expect => 1,
|
13
|
+
:actual => g2.count('candy')
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
+
|
3
|
+
Testy.testing 'Groupie' do
|
4
|
+
test 'classification is certain' do |t|
|
5
|
+
g = Groupie.new
|
6
|
+
g[:spam].add %w[viagra]
|
7
|
+
g[:ham].add %w[flowers]
|
8
|
+
classification = g.classify 'viagra'
|
9
|
+
t.check 'viagra is',
|
10
|
+
:expect => {:spam => 1.0, :ham => 0.0},
|
11
|
+
:actual => classification
|
12
|
+
end
|
13
|
+
|
14
|
+
test 'classification is split between two groups' do |t|
|
15
|
+
g = Groupie.new
|
16
|
+
g[:spam].add %w[buy viagra now]
|
17
|
+
g[:ham].add %w[buy flowers for your mom]
|
18
|
+
classification = g.classify 'buy'
|
19
|
+
t.check 'buy is classified as',
|
20
|
+
:expect => {:spam => 0.5, :ham => 0.5},
|
21
|
+
:actual => classification
|
22
|
+
end
|
23
|
+
|
24
|
+
test 'classification is weighed more heavy in one group' do |t|
|
25
|
+
g = Groupie.new
|
26
|
+
g[:spam].add %w[buy viagra now]
|
27
|
+
g[:spam].add %w[buy cialis now]
|
28
|
+
g[:ham].add %w[buy flowers for your mom]
|
29
|
+
t.check 'buy is classified as',
|
30
|
+
:expect => {:spam => 2 / 3.0, :ham => 1 / 3.0},
|
31
|
+
:actual => g.classify('buy')
|
32
|
+
end
|
33
|
+
|
34
|
+
test 'classification works fine with more than two groups' do |t|
|
35
|
+
g = Groupie.new
|
36
|
+
g[:weight].add 'pound'
|
37
|
+
g[:currency].add 'pound'
|
38
|
+
g[:phone_key].add 'pound'
|
39
|
+
t.check 'pound is classified as',
|
40
|
+
:expect => {:weight => 1/3.0, :currency => 1/3.0, :phone_key => 1/3.0},
|
41
|
+
:actual => g.classify('pound')
|
42
|
+
end
|
43
|
+
|
44
|
+
test 'tokenized emails' do |t|
|
45
|
+
email = <<-EMAIL
|
46
|
+
I noticed your flirt
|
47
|
+
If you cannot see the pictures and links below, please click here to view them.
|
48
|
+
PHARMACY CLUB | UNSUBSCRIBE | YOUR PRIVACY RIGHTS
|
49
|
+
Copyright 2009 Zjfqq, all rights reserved
|
50
|
+
Customer Service Dept., 87 Hizq Iveox Street, Isahaylo, VS 25270
|
51
|
+
EMAIL
|
52
|
+
email2 = <<-EMAIL
|
53
|
+
Re: Your subscribe #976589
|
54
|
+
Tell a friend · Download latest version See this email as a webpage
|
55
|
+
Hello!
|
56
|
+
Shipped Privately And Discreetly To Your Door!
|
57
|
+
We want to put a great big grin on your face in 2009. You'll be to rejoice all year.
|
58
|
+
Unsubscribe · Lost Password · Account Settings · Help · Terms of Service · Privacy
|
59
|
+
Ottho Heldringstraat 2, 31719 AZ Amsterdam, The Netherlands
|
60
|
+
EMAIL
|
61
|
+
email3 = <<-EMAIL
|
62
|
+
Re: [ubuntu-art] [Breathe] Network Manager-icons
|
63
|
+
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
64
|
+
> Hello,
|
65
|
+
>
|
66
|
+
> I think the notify-osd icons have a completely different style, which
|
67
|
+
> is looking great within the notification bubbles, but i doubt it'd
|
68
|
+
> look great to have the notify-osd wifi icons in the panel. I think the
|
69
|
+
> drawing of the notification- wifi icons should be done afterwards, and
|
70
|
+
> if they should be based on those of the icon set, they could be made
|
71
|
+
> smoother, and possibly desaturated for some of them, to avoid drawing
|
72
|
+
> too much attention from the user when popping up.
|
73
|
+
>
|
74
|
+
> Cordially, SD.
|
75
|
+
EMAIL
|
76
|
+
g = Groupie.new
|
77
|
+
g[:spam].add email.tokenize
|
78
|
+
g[:spam].add email2.tokenize
|
79
|
+
g[:ham].add email3.tokenize
|
80
|
+
c = g.classify('discreetly')
|
81
|
+
t.check 'classification of "discreetly" is spam',
|
82
|
+
:expect => true,
|
83
|
+
:actual => c[:spam] > c[:ham]
|
84
|
+
c2 = g.classify('user')
|
85
|
+
t.check 'classification of "user" is ham',
|
86
|
+
:expect => true,
|
87
|
+
:actual => c2[:ham] > c2[:spam]
|
88
|
+
end
|
89
|
+
|
90
|
+
test 'tokenized html emails' do |t|
|
91
|
+
g = Groupie.new
|
92
|
+
spam_tokens = File.read(File.join(File.dirname(__FILE__),
|
93
|
+
%w[fixtures spam spam.la-44118014.txt])).tokenize
|
94
|
+
ham_tokens = File.read(File.join(File.dirname(__FILE__),
|
95
|
+
%w[fixtures ham spam.la-44116217.txt])).tokenize
|
96
|
+
g[:spam].add spam_tokens
|
97
|
+
g[:ham].add ham_tokens
|
98
|
+
|
99
|
+
c = g.classify 'user'
|
100
|
+
t.check 'classification of the word "user" is ham',
|
101
|
+
:expect => true,
|
102
|
+
:actual => (c[:ham] > c[:spam])
|
103
|
+
|
104
|
+
c = g.classify_text(spam_tokens)
|
105
|
+
t.check 'classification of spam email is spam',
|
106
|
+
:expect => true,
|
107
|
+
:actual => (c[:spam] > c[:ham])
|
108
|
+
end
|
109
|
+
|
110
|
+
test 'classify a text' do |t|
|
111
|
+
g = Groupie.new
|
112
|
+
g[:spam].add %w[buy viagra now to grow fast]
|
113
|
+
g[:spam].add %w[buy cialis on our website]
|
114
|
+
g[:ham].add %w[buy flowers for your mom]
|
115
|
+
result = g.classify_text "Grow flowers to sell on our website".tokenize
|
116
|
+
t.check 'classification of a spammy text is spam',
|
117
|
+
:expect => true,
|
118
|
+
:actual => result[:spam] > result[:ham]
|
119
|
+
result2 = g.classify_text "Grow flowers to give to your mom".tokenize
|
120
|
+
t.check 'classification of a non-spammy text is ham',
|
121
|
+
:expect => true,
|
122
|
+
:actual => result2[:ham] > result2[:spam]
|
123
|
+
end
|
124
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: groupie
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Wes Oldenbeuving
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-07-25 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: testy
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Group and classify text based on likelyhood of being included in a text of a specific category
|
36
|
+
email: narnach@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- LICENSE
|
46
|
+
- Rakefile
|
47
|
+
- VERSION
|
48
|
+
- groupie.gemspec
|
49
|
+
- lib/groupie.rb
|
50
|
+
- lib/groupie/core_ext/string.rb
|
51
|
+
- lib/groupie/group.rb
|
52
|
+
- readme.rdoc
|
53
|
+
- test/fixtures/ham/spam.la-44116217.txt
|
54
|
+
- test/fixtures/spam/spam.la-44118014.txt
|
55
|
+
- test/groupie/core_ext/string_test.rb
|
56
|
+
- test/groupie/group_test.rb
|
57
|
+
- test/groupie_test.rb
|
58
|
+
- test/test_helper.rb
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: http://github.com/Narnach/groupie
|
61
|
+
licenses: []
|
62
|
+
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options:
|
65
|
+
- --charset=UTF-8
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
86
|
+
requirements: []
|
87
|
+
|
88
|
+
rubyforge_project:
|
89
|
+
rubygems_version: 1.3.7
|
90
|
+
signing_key:
|
91
|
+
specification_version: 3
|
92
|
+
summary: Group and classify text
|
93
|
+
test_files:
|
94
|
+
- test/groupie/core_ext/string_test.rb
|
95
|
+
- test/groupie/group_test.rb
|
96
|
+
- test/groupie_test.rb
|
97
|
+
- test/test_helper.rb
|