groupie 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/LICENSE +20 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/groupie.gemspec +60 -0
- data/lib/groupie.rb +51 -0
- data/lib/groupie/core_ext/string.rb +17 -0
- data/lib/groupie/group.rb +32 -0
- data/readme.rdoc +28 -0
- data/test/fixtures/ham/spam.la-44116217.txt +79 -0
- data/test/fixtures/spam/spam.la-44118014.txt +73 -0
- data/test/groupie/core_ext/string_test.rb +45 -0
- data/test/groupie/group_test.rb +15 -0
- data/test/groupie_test.rb +124 -0
- data/test/test_helper.rb +3 -0
- metadata +97 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Wes Oldenbeuving
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Packaging tasks. Jeweler is optional: when it cannot be loaded we only print
# a hint, so the remaining tasks in this file stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "groupie"
    gem.summary = %Q{Group and classify text}
    gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
    gem.email = "narnach@gmail.com"
    gem.homepage = "http://github.com/Narnach/groupie"
    gem.authors = ["Wes Oldenbeuving"]
    gem.add_development_dependency "testy", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When RCov is missing, `rake rcov` aborts with an install hint
# instead of being undefined.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this Rakefile (presumably it
# comes from the Jeweler tasks above). Only hook it into :test when it really
# exists; otherwise `rake test` would fail with an unknown-task error whenever
# Jeweler could not be loaded.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Documentation task. Prefer the RDoc-provided task class and fall back to the
# legacy Rake one for old installations that do not have 'rdoc/task'.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "groupie #{version}"
  rdoc.rdoc_files.include('readme*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/groupie.gemspec
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{groupie}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Wes Oldenbeuving"]
|
12
|
+
s.date = %q{2010-07-25}
|
13
|
+
s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
|
14
|
+
s.email = %q{narnach@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE"
|
17
|
+
]
|
18
|
+
s.files = [
|
19
|
+
".document",
|
20
|
+
"LICENSE",
|
21
|
+
"Rakefile",
|
22
|
+
"VERSION",
|
23
|
+
"groupie.gemspec",
|
24
|
+
"lib/groupie.rb",
|
25
|
+
"lib/groupie/core_ext/string.rb",
|
26
|
+
"lib/groupie/group.rb",
|
27
|
+
"readme.rdoc",
|
28
|
+
"test/fixtures/ham/spam.la-44116217.txt",
|
29
|
+
"test/fixtures/spam/spam.la-44118014.txt",
|
30
|
+
"test/groupie/core_ext/string_test.rb",
|
31
|
+
"test/groupie/group_test.rb",
|
32
|
+
"test/groupie_test.rb",
|
33
|
+
"test/test_helper.rb"
|
34
|
+
]
|
35
|
+
s.homepage = %q{http://github.com/Narnach/groupie}
|
36
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
s.rubygems_version = %q{1.3.7}
|
39
|
+
s.summary = %q{Group and classify text}
|
40
|
+
s.test_files = [
|
41
|
+
"test/groupie/core_ext/string_test.rb",
|
42
|
+
"test/groupie/group_test.rb",
|
43
|
+
"test/groupie_test.rb",
|
44
|
+
"test/test_helper.rb"
|
45
|
+
]
|
46
|
+
|
47
|
+
if s.respond_to? :specification_version then
|
48
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_development_dependency(%q<testy>, [">= 0"])
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<testy>, [">= 0"])
|
55
|
+
end
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<testy>, [">= 0"])
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
data/lib/groupie.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
lib_dir = File.expand_path(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
|
3
|
+
require 'groupie/group'
|
4
|
+
require 'groupie/core_ext/string'
|
5
|
+
|
6
|
+
# Groupie keeps a collection of named groups of words and scores how strongly
# a single word, or a list of words, is associated with each of those groups.
class Groupie
  def initialize
    # Maps a group name to its Group instance.
    @groups = {}
  end

  # Return the group stored under +group+, creating it on first access.
  def [](group)
    @groups[group] ||= Group.new(group)
  end

  # Classify a single +entry+.
  #
  # Returns a Hash that maps every known group name to the share of all
  # counted occurrences of +entry+ that belong to that group (a Float; the
  # shares add up to 1.0). Returns an empty Hash when no group has counted
  # +entry+ at all.
  def classify(entry)
    # Ask each group for its count exactly once and reuse the answer for both
    # the grand total and the per-group share.
    counts = @groups.map { |name, group| [name, group.count(entry)] }
    total_count = counts.inject(0) { |sum, (_name, count)| sum + count }
    return {} if total_count == 0

    results = {}
    counts.each do |name, count|
      results[name] = count > 0 ? count.to_f / total_count : 0.0
    end
    results
  end

  # Classify a text by taking the average of all word classifications.
  #
  # +words+ is a list of words that responds to +each+ and +size+. Words that
  # no group has counted add nothing to the sums but are still included in the
  # divisor. Returns a Hash of group name => average score; it is empty when
  # none of the words is known to any group.
  def classify_text(words)
    # Accumulate into one Hash instead of building a merged copy per word.
    group_score_sums = {}
    words.each do |word|
      classify(word).each do |group, score|
        if group_score_sums.key?(group)
          group_score_sums[group] += score
        else
          group_score_sums[group] = score
        end
      end
    end

    words_count = words.size.to_f
    averages = {}
    group_score_sums.each do |group, sum|
      averages[group] = sum / words_count
    end
    averages
  end

  # Return the version string read from the VERSION file that sits one
  # directory above this file, with surrounding whitespace removed.
  def self.version
    File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class Groupie
  module CoreExt
    # String extension that prepares raw text for grouping.
    module String
      # Split the string into lowercase word tokens and return them as an
      # Array of Strings.
      #
      # Steps, in order:
      # 1. lowercase the text and turn every whitespace character into a space;
      # 2. drop "$" and "'" without leaving a gap;
      # 3. drop whole <...> tags and every character that is not a word
      #    character, space, hyphen, period or comma;
      # 4. split on spaces and strip trailing commas/periods from each token,
      #    so periods and commas survive only inside a token.
      #
      # The hyphen in the character class of step 3 is escaped on purpose:
      # written as " -." it forms a range from space to period, which would
      # let characters such as ! " # % & ( ) * + slip through.
      def tokenize
        downcase.
          gsub(/\s/, " ").
          gsub(/[$']/, '').
          gsub(/<[^>]+?>|[^\w \-.,]/, '').
          split(" ").map { |str| str.gsub(/[,.]+\Z/, '') }
      end
    end
  end
end

# Make #tokenize available on every String.
class String
  include Groupie::CoreExt::String
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
class Groupie
  # A named bag of words that remembers how often each word was added.
  class Group
    # Create an empty group called +name+.
    def initialize(name)
      @name = name
      # word => number of times it was added; a plain Hash with no default
      # value, so lookups for unseen words fall back explicitly in #count.
      @word_counts = {}
    end

    # All distinct words added so far.
    def words
      @word_counts.keys
    end

    # Add new words to the group. Accepts single words as well as (nested)
    # Arrays of words; every occurrence is counted. Always returns nil.
    def add(*words)
      words.flatten.each { |word| add_word(word) }
      nil
    end

    # Return the count for a specific +word+, or 0 when it was never added.
    def count(word)
      tally = @word_counts[word]
      tally ? tally : 0
    end

    private

    # Add a single word and count it. Returns the updated count.
    def add_word(word)
      @word_counts[word] = count(word) + 1
    end
  end
end
|
data/readme.rdoc
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= Groupie
|
2
|
+
|
3
|
+
Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of bayesian spam filters.
|
4
|
+
|
5
|
+
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
6
|
+
|
7
|
+
== Goals
|
8
|
+
|
9
|
+
Groupie is a 'fun' project that has the following goals, in descending order of importance:
|
10
|
+
* Have fun playing with code
|
11
|
+
* Play with Bayesian-like (spam) filtering
|
12
|
+
* Check out the Testy BDD framework. It's pretty good for 60 lines of code!
|
13
|
+
|
14
|
+
== Current functionality
|
15
|
+
|
16
|
+
Current functionality includes:
|
17
|
+
* Tokenize an input text to prepare it for grouping.
|
18
|
+
* Strip XML and HTML tags.
|
19
|
+
* Keep certain infix characters, such as period and comma.
|
20
|
+
* Add texts (as an Array of Strings) to any number of groups.
|
21
|
+
* Classify a single word to check the likelihood it belongs to each group.
|
22
|
+
* Do classification for complete (tokenized) texts.
|
23
|
+
|
24
|
+
== License
|
25
|
+
|
26
|
+
As always, the code is licensed under the MIT license.
|
27
|
+
|
28
|
+
Wes Oldenbeuving
|
@@ -0,0 +1,79 @@
|
|
1
|
+
From **HIDDEN**@lists.ubuntu.com Sun May 31 10:22:46 2009
|
2
|
+
Return-Path: <**HIDDEN**@lists.ubuntu.com>
|
3
|
+
X-Original-To: **HIDDEN**@spam.la
|
4
|
+
Delivered-To: **HIDDEN**@speedo.dreamhost.com
|
5
|
+
Received: from chlorine.canonical.com (chlorine.canonical.com [91.189.94.204])
|
6
|
+
by speedo.dreamhost.com (Postfix) with ESMTP id 18A4F145730
|
7
|
+
for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:22:47 -0700 (PDT)
|
8
|
+
Received: from localhost ([127.0.0.1] helo=chlorine.canonical.com)
|
9
|
+
by chlorine.canonical.com with esmtp (Exim 4.60)
|
10
|
+
(envelope-from <**HIDDEN**@lists.ubuntu.com>)
|
11
|
+
id 1MAoKV-0000Uv-RB; Sun, 31 May 2009 17:56:15 +0100
|
12
|
+
Received: from smtp104.mail.ukl.yahoo.com ([77.238.184.36])
|
13
|
+
by chlorine.canonical.com with smtp (Exim 4.60)
|
14
|
+
(envelope-from <**HIDDEN**@yahoo.de>) id 1MAoKO-0000T5-0J
|
15
|
+
for **HIDDEN**@lists.ubuntu.com; Sun, 31 May 2009 17:56:08 +0100
|
16
|
+
Received: (qmail 95693 invoked from network); 31 May 2009 16:56:07 -0000
|
17
|
+
Received: from unknown (HELO ?192.168.1.33?) **HIDDEN**@88.5.92.30 with plain)
|
18
|
+
by smtp104.mail.ukl.yahoo.com with SMTP; 31 May 2009 16:56:07 -0000
|
19
|
+
X-Yahoo-SMTP: omQsrMiswBC_IZdIQhRgQAA3Gn6tTTc-
|
20
|
+
X-YMail-OSG: e1ihGm4VM1ljqtG.6IPGOps5aG8IYZJEPQLptGPSxphH174zk4rRTWYQJmj9MMc2nJwZjNEqUnYAjErWKypElvLWu0n.v8baMMlcOOELQK2IZfFaV5Ij3HUpUDWRbd0n6PCV5iFLHlyruq5CSGsiZvfME6HpngIO0RuAcin3rePXdzWpmPnTlZwuC3qjSE9N8wC4pdBdwfmYHy4EKSKFRCXUNzdy9DPgfwqrjiCTP_tqaWmpeUOqA2Os13l0j5d6acIpgo9DcW8P_1ENNVGjJ2Lk4XbZ0oc51M_BJ2n6DHMxoazT
|
21
|
+
X-Yahoo-Newman-Property: ymail-3
|
22
|
+
From: Oliver Scholtz 1 <**HIDDEN**@yahoo.de>
|
23
|
+
To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
|
24
|
+
In-Reply-To: <**HIDDEN**@mail.gmail.com>
|
25
|
+
References: <**HIDDEN**@yahoo.com>
|
26
|
+
<**HIDDEN**@jws141-laptop> <**HIDDEN**@yahoo.com>
|
27
|
+
<**HIDDEN**@isabel-desktop>
|
28
|
+
<**HIDDEN**@web95411.mail.in2.yahoo.com>
|
29
|
+
<**HIDDEN**@dani-desktop> <**HIDDEN**@yahoo.com>
|
30
|
+
<**HIDDEN**@mail.gmail.com>
|
31
|
+
<**HIDDEN**@mail.gmail.com>
|
32
|
+
Date: Sun, 31 May 2009 18:56:05 +0200
|
33
|
+
Message-Id: <**HIDDEN**@oliver-ubuntu>
|
34
|
+
Mime-Version: 1.0
|
35
|
+
X-Mailer: Evolution 2.26.1
|
36
|
+
Subject: Re: [ubuntu-art] [Breathe] Network Manager-icons
|
37
|
+
X-BeenThere: **HIDDEN**@lists.ubuntu.com
|
38
|
+
X-Mailman-Version: 2.1.8
|
39
|
+
Precedence: list
|
40
|
+
Reply-To: Discussion on Ubuntu artwork <**HIDDEN**@lists.ubuntu.com>
|
41
|
+
List-Id: Discussion on Ubuntu artwork <ubuntu-art.lists.ubuntu.com>
|
42
|
+
List-Unsubscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
|
43
|
+
<**HIDDEN**@lists.ubuntu.com?subject=unsubscribe>
|
44
|
+
List-Archive: <https://lists.ubuntu.com/archives/ubuntu-art>
|
45
|
+
List-Post: <**HIDDEN**@lists.ubuntu.com>
|
46
|
+
List-Help: <**HIDDEN**@lists.ubuntu.com?subject=help>
|
47
|
+
List-Subscribe: <https://lists.ubuntu.com/mailman/listinfo/ubuntu-art>,
|
48
|
+
<**HIDDEN**@lists.ubuntu.com?subject=subscribe>
|
49
|
+
Content-Type: text/plain; charset="us-ascii"
|
50
|
+
Content-Transfer-Encoding: 7bit
|
51
|
+
Sender: **HIDDEN**@lists.ubuntu.com
|
52
|
+
Errors-To: **HIDDEN**@lists.ubuntu.com
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
57
|
+
> Hello,
|
58
|
+
>
|
59
|
+
> I think the notify-osd icons have a completely different style, which
|
60
|
+
> is looking great within the notification bubbles, but i doubt it'd
|
61
|
+
> look great to have the notify-osd wifi icons in the panel. I think the
|
62
|
+
> drawing of the notification- wifi icons should be done afterwards, and
|
63
|
+
> if they should be based on those of the icon set, they could be made
|
64
|
+
> smoother, and possibly desaturated for some of them, to avoid drawing
|
65
|
+
> too much attention from the user when popping up.
|
66
|
+
>
|
67
|
+
> Cordially, SD.
|
68
|
+
|
69
|
+
+1
|
70
|
+
---
|
71
|
+
And Mac ... much better! Maybe 22 and 16 without pale. ;)
|
72
|
+
|
73
|
+
Oliver
|
74
|
+
|
75
|
+
|
76
|
+
--
|
77
|
+
ubuntu-art mailing list
|
78
|
+
**HIDDEN**@lists.ubuntu.com
|
79
|
+
https://lists.ubuntu.com/mailman/listinfo/ubuntu-art
|
@@ -0,0 +1,73 @@
|
|
1
|
+
From **HIDDEN**@manpoints.net Sun May 31 10:34:01 2009
|
2
|
+
Return-Path: <**HIDDEN**@manpoints.net>
|
3
|
+
X-Original-To: **HIDDEN**@spam.la
|
4
|
+
Delivered-To: **HIDDEN**@speedo.dreamhost.com
|
5
|
+
Received: from 201-40-49-243.bsace702.dsl.brasiltelecom.net.br (201-40-49-243.bsace702.dsl.brasiltelecom.net.br [201.40.49.243])
|
6
|
+
by speedo.dreamhost.com (Postfix) with ESMTP id 4BDC714572F
|
7
|
+
for <**HIDDEN**@spam.la>; Sun, 31 May 2009 10:33:56 -0700 (PDT)
|
8
|
+
Message-Id: <**HIDDEN**@201-40-49-243.bsace702.dsl.brasiltelecom.net.br>
|
9
|
+
From: "Leskovar L. Golda" <**HIDDEN**@manpoints.net>
|
10
|
+
To: **HIDDEN**@spam.la
|
11
|
+
Subject: My official mail blocked
|
12
|
+
Content-Type: text/html; charset="iso-8859-1"
|
13
|
+
Content-Transfer-Encoding: 7bit
|
14
|
+
MIME-Version: 1.0
|
15
|
+
Date: Sun, 31 May 2009 10:33:56 -0700 (PDT)
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
20
|
+
<html>
|
21
|
+
<head>
|
22
|
+
<title></title>
|
23
|
+
<meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
|
24
|
+
</head>
|
25
|
+
<body bgcolor="#FFFFFF" topmargin="0" leftmargin="0" marginwidth="0" marginheight="0">
|
26
|
+
|
27
|
+
|
28
|
+
<table width="646" cellspacing="0" border="0" align="center" cellpadding="4">
|
29
|
+
<tr>
|
30
|
+
<td align="center"><font face="Arial" size="1" color="#000000">If you cannot see
|
31
|
+
the pictures and links below, please <a href="http://www.qicweman.cn/?abo=0C8C72B3E1C8648676158B">
|
32
|
+
click here</a> to view them.<br></font></td>
|
33
|
+
</tr>
|
34
|
+
|
35
|
+
</table>
|
36
|
+
|
37
|
+
<table width="742" cellspacing="0" border="0" align="center" cellpadding="0">
|
38
|
+
|
39
|
+
<tr valign="top">
|
40
|
+
<td width="475" style="border-left:1px solid #371E96;">
|
41
|
+
<a href="http://www.qicweman.cn/?ex=0C8C72B3E1C8648676158B">
|
42
|
+
<img alt="click to see the full version" src="http://www.qicweman.cn/d.jpg" style="border-width: 0px" /></a></td>
|
43
|
+
|
44
|
+
</tr>
|
45
|
+
|
46
|
+
<tr>
|
47
|
+
<td><br>
|
48
|
+
|
49
|
+
<div style="padding:10px;">
|
50
|
+
|
51
|
+
<span style="font-size:10px;color:#666666;font-family:arial;">You may also
|
52
|
+
respond to this email and subscribe to <i>Hizqru</i> by calling
|
53
|
+
1-085-417-9085, Monday-Friday, 8 a.m.-6 p.m. ET. Outside the U.S. and in
|
54
|
+
Canada, please call 1-254-403-7409.<br><br>
|
55
|
+
|
56
|
+
To opt out from receiving any future marketing-related emails from
|
57
|
+
Dqpjnjp, please <a style="color:#666666;" href="http://www.qicweman.cn/?jr=0C8C72B3E1C8648676158B&email=**HIDDEN**@spam.la">
|
58
|
+
click here</a>.<br />
|
59
|
+
Please be assured that we respect the privacy of our subscribers. To view
|
60
|
+
our privacy policy, please <a style="color:#666666;" href="http://www.qicweman.cn/?sj=0C8C72B3E1C8648676158B">
|
61
|
+
click here</a>.<br><br>
|
62
|
+
|
63
|
+
© 2009 Jdqofypy, Inc., 14 Poze Iylqod, 08th Floor, New York, NY 74172.<br></span>
|
64
|
+
|
65
|
+
</div>
|
66
|
+
</td>
|
67
|
+
</tr>
|
68
|
+
</table>
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
</body>
|
73
|
+
</html>
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), %w[.. .. test_helper])
|
2
|
+
|
3
|
+
Testy.testing 'String' do
|
4
|
+
context 'tokenize' do
|
5
|
+
test 'split words' do |t|
|
6
|
+
tokens = "hello world".tokenize
|
7
|
+
t.check 'words are split',
|
8
|
+
:expect => %w[hello world],
|
9
|
+
:actual => tokens
|
10
|
+
end
|
11
|
+
|
12
|
+
test 'downcase words' do |t|
|
13
|
+
tokens = "Hello World".tokenize
|
14
|
+
t.check 'words are downcased',
|
15
|
+
:expect => %w[hello world],
|
16
|
+
:actual => tokens
|
17
|
+
end
|
18
|
+
|
19
|
+
test 'most symbols are stripped' do |t|
|
20
|
+
tokens = "hyphen-ated, under_score!".tokenize
|
21
|
+
t.check 'some symbols are left',
|
22
|
+
:expect => %w[hyphen-ated under_score],
|
23
|
+
:actual => tokens
|
24
|
+
end
|
25
|
+
|
26
|
+
test 'html tags are sanitized' do |t|
|
27
|
+
tokens = '<a href="http://example.org">example</a>'.tokenize
|
28
|
+
t.check 'only content of tags is retained',
|
29
|
+
:expect => %w[example],
|
30
|
+
:actual => tokens
|
31
|
+
end
|
32
|
+
|
33
|
+
test 'some dots are ok' do |t|
|
34
|
+
tokens = 'example.org rocks. read it...'.tokenize
|
35
|
+
t.check 'infix dots are kept',
|
36
|
+
:expect => %w[example.org rocks read it],
|
37
|
+
:actual => tokens
|
38
|
+
|
39
|
+
tokens2 = '$1,000,000.00 or $1.000.000,00'.tokenize
|
40
|
+
t.check 'infix commas are kept',
|
41
|
+
:expect => %w[1,000,000.00 or 1.000.000,00],
|
42
|
+
:actual => tokens2
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), %w[.. test_helper])
|
2
|
+
|
3
|
+
Testy.testing 'Groupie::Group' do
|
4
|
+
test 'can be serialized and loaded through YAML' do |t|
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
g = Groupie::Group.new 'group'
|
8
|
+
g.add %w[buy flowers]
|
9
|
+
g2 = YAML.load(g.to_yaml)
|
10
|
+
g2.add %w[buy candy]
|
11
|
+
t.check 'default value works for new entries',
|
12
|
+
:expect => 1,
|
13
|
+
:actual => g2.count('candy')
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
+
|
3
|
+
Testy.testing 'Groupie' do
|
4
|
+
test 'classification is certain' do |t|
|
5
|
+
g = Groupie.new
|
6
|
+
g[:spam].add %w[viagra]
|
7
|
+
g[:ham].add %w[flowers]
|
8
|
+
classification = g.classify 'viagra'
|
9
|
+
t.check 'viagra is',
|
10
|
+
:expect => {:spam => 1.0, :ham => 0.0},
|
11
|
+
:actual => classification
|
12
|
+
end
|
13
|
+
|
14
|
+
test 'classification is split between two groups' do |t|
|
15
|
+
g = Groupie.new
|
16
|
+
g[:spam].add %w[buy viagra now]
|
17
|
+
g[:ham].add %w[buy flowers for your mom]
|
18
|
+
classification = g.classify 'buy'
|
19
|
+
t.check 'buy is classified as',
|
20
|
+
:expect => {:spam => 0.5, :ham => 0.5},
|
21
|
+
:actual => classification
|
22
|
+
end
|
23
|
+
|
24
|
+
test 'classification is weighed more heavy in one group' do |t|
|
25
|
+
g = Groupie.new
|
26
|
+
g[:spam].add %w[buy viagra now]
|
27
|
+
g[:spam].add %w[buy cialis now]
|
28
|
+
g[:ham].add %w[buy flowers for your mom]
|
29
|
+
t.check 'buy is classified as',
|
30
|
+
:expect => {:spam => 2 / 3.0, :ham => 1 / 3.0},
|
31
|
+
:actual => g.classify('buy')
|
32
|
+
end
|
33
|
+
|
34
|
+
test 'classification works fine with more than two groups' do |t|
|
35
|
+
g = Groupie.new
|
36
|
+
g[:weight].add 'pound'
|
37
|
+
g[:currency].add 'pound'
|
38
|
+
g[:phone_key].add 'pound'
|
39
|
+
t.check 'pound is classified as',
|
40
|
+
:expect => {:weight => 1/3.0, :currency => 1/3.0, :phone_key => 1/3.0},
|
41
|
+
:actual => g.classify('pound')
|
42
|
+
end
|
43
|
+
|
44
|
+
test 'tokenized emails' do |t|
|
45
|
+
email = <<-EMAIL
|
46
|
+
I noticed your flirt
|
47
|
+
If you cannot see the pictures and links below, please click here to view them.
|
48
|
+
PHARMACY CLUB | UNSUBSCRIBE | YOUR PRIVACY RIGHTS
|
49
|
+
Copyright 2009 Zjfqq, all rights reserved
|
50
|
+
Customer Service Dept., 87 Hizq Iveox Street, Isahaylo, VS 25270
|
51
|
+
EMAIL
|
52
|
+
email2 = <<-EMAIL
|
53
|
+
Re: Your subscribe #976589
|
54
|
+
Tell a friend · Download latest version See this email as a webpage
|
55
|
+
Hello!
|
56
|
+
Shipped Privately And Discreetly To Your Door!
|
57
|
+
We want to put a great big grin on your face in 2009. You'll be to rejoice all year.
|
58
|
+
Unsubscribe · Lost Password · Account Settings · Help · Terms of Service · Privacy
|
59
|
+
Ottho Heldringstraat 2, 31719 AZ Amsterdam, The Netherlands
|
60
|
+
EMAIL
|
61
|
+
email3 = <<-EMAIL
|
62
|
+
Re: [ubuntu-art] [Breathe] Network Manager-icons
|
63
|
+
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
64
|
+
> Hello,
|
65
|
+
>
|
66
|
+
> I think the notify-osd icons have a completely different style, which
|
67
|
+
> is looking great within the notification bubbles, but i doubt it'd
|
68
|
+
> look great to have the notify-osd wifi icons in the panel. I think the
|
69
|
+
> drawing of the notification- wifi icons should be done afterwards, and
|
70
|
+
> if they should be based on those of the icon set, they could be made
|
71
|
+
> smoother, and possibly desaturated for some of them, to avoid drawing
|
72
|
+
> too much attention from the user when popping up.
|
73
|
+
>
|
74
|
+
> Cordially, SD.
|
75
|
+
EMAIL
|
76
|
+
g = Groupie.new
|
77
|
+
g[:spam].add email.tokenize
|
78
|
+
g[:spam].add email2.tokenize
|
79
|
+
g[:ham].add email3.tokenize
|
80
|
+
c = g.classify('discreetly')
|
81
|
+
t.check 'classification of "discreetly" is spam',
|
82
|
+
:expect => true,
|
83
|
+
:actual => c[:spam] > c[:ham]
|
84
|
+
c2 = g.classify('user')
|
85
|
+
t.check 'classification of "user" is ham',
|
86
|
+
:expect => true,
|
87
|
+
:actual => c2[:ham] > c2[:spam]
|
88
|
+
end
|
89
|
+
|
90
|
+
test 'tokenized html emails' do |t|
|
91
|
+
g = Groupie.new
|
92
|
+
spam_tokens = File.read(File.join(File.dirname(__FILE__),
|
93
|
+
%w[fixtures spam spam.la-44118014.txt])).tokenize
|
94
|
+
ham_tokens = File.read(File.join(File.dirname(__FILE__),
|
95
|
+
%w[fixtures ham spam.la-44116217.txt])).tokenize
|
96
|
+
g[:spam].add spam_tokens
|
97
|
+
g[:ham].add ham_tokens
|
98
|
+
|
99
|
+
c = g.classify 'user'
|
100
|
+
t.check 'classification of the word "user" is ham',
|
101
|
+
:expect => true,
|
102
|
+
:actual => (c[:ham] > c[:spam])
|
103
|
+
|
104
|
+
c = g.classify_text(spam_tokens)
|
105
|
+
t.check 'classification of spam email is spam',
|
106
|
+
:expect => true,
|
107
|
+
:actual => (c[:spam] > c[:ham])
|
108
|
+
end
|
109
|
+
|
110
|
+
test 'classify a text' do |t|
|
111
|
+
g = Groupie.new
|
112
|
+
g[:spam].add %w[buy viagra now to grow fast]
|
113
|
+
g[:spam].add %w[buy cialis on our website]
|
114
|
+
g[:ham].add %w[buy flowers for your mom]
|
115
|
+
result = g.classify_text "Grow flowers to sell on our website".tokenize
|
116
|
+
t.check 'classification of a spammy text is spam',
|
117
|
+
:expect => true,
|
118
|
+
:actual => result[:spam] > result[:ham]
|
119
|
+
result2 = g.classify_text "Grow flowers to give to your mom".tokenize
|
120
|
+
t.check 'classification of a non-spammy text is ham',
|
121
|
+
:expect => true,
|
122
|
+
:actual => result2[:ham] > result2[:spam]
|
123
|
+
end
|
124
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: groupie
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Wes Oldenbeuving
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-07-25 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: testy
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Group and classify text based on likelyhood of being included in a text of a specific category
|
36
|
+
email: narnach@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- LICENSE
|
46
|
+
- Rakefile
|
47
|
+
- VERSION
|
48
|
+
- groupie.gemspec
|
49
|
+
- lib/groupie.rb
|
50
|
+
- lib/groupie/core_ext/string.rb
|
51
|
+
- lib/groupie/group.rb
|
52
|
+
- readme.rdoc
|
53
|
+
- test/fixtures/ham/spam.la-44116217.txt
|
54
|
+
- test/fixtures/spam/spam.la-44118014.txt
|
55
|
+
- test/groupie/core_ext/string_test.rb
|
56
|
+
- test/groupie/group_test.rb
|
57
|
+
- test/groupie_test.rb
|
58
|
+
- test/test_helper.rb
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: http://github.com/Narnach/groupie
|
61
|
+
licenses: []
|
62
|
+
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options:
|
65
|
+
- --charset=UTF-8
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
86
|
+
requirements: []
|
87
|
+
|
88
|
+
rubyforge_project:
|
89
|
+
rubygems_version: 1.3.7
|
90
|
+
signing_key:
|
91
|
+
specification_version: 3
|
92
|
+
summary: Group and classify text
|
93
|
+
test_files:
|
94
|
+
- test/groupie/core_ext/string_test.rb
|
95
|
+
- test/groupie/group_test.rb
|
96
|
+
- test/groupie_test.rb
|
97
|
+
- test/test_helper.rb
|