groupie 0.2.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +32 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +24 -0
- data/.github/dependabot.yml +13 -0
- data/.github/workflows/gem.yml +16 -0
- data/.github/workflows/rspec.yml +22 -0
- data/.github/workflows/rubocop.yml +26 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.rubocop.yml +53 -0
- data/CHANGELOG.md +107 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +71 -0
- data/LICENSE.txt +21 -0
- data/README.md +140 -0
- data/Rakefile +7 -48
- data/SECURITY.md +18 -0
- data/bin/console +15 -0
- data/bin/rubocop +2 -0
- data/bin/setup +9 -0
- data/groupie.gemspec +32 -57
- data/lib/groupie/group.rb +19 -5
- data/lib/groupie/version.rb +10 -0
- data/lib/groupie.rb +145 -51
- metadata +56 -85
- data/.document +0 -5
- data/LICENSE +0 -20
- data/VERSION +0 -1
- data/lib/groupie/core_ext/string.rb +0 -17
- data/readme.rdoc +0 -27
- data/spec/fixtures/ham/email_ham1.txt +0 -13
- data/spec/fixtures/ham/spam.la-44116217.txt +0 -79
- data/spec/fixtures/spam/email_spam1.txt +0 -5
- data/spec/fixtures/spam/email_spam2.txt +0 -7
- data/spec/fixtures/spam/spam.la-44118014.txt +0 -73
- data/spec/groupie/core_ext/string_spec.rb +0 -37
- data/spec/groupie/group_spec.rb +0 -12
- data/spec/groupie_spec.rb +0 -130
- data/spec/spec_helper.rb +0 -1
data/bin/setup
ADDED
data/groupie.gemspec
CHANGED
@@ -1,63 +1,38 @@
|
|
1
|
-
#
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
-
# -*- encoding: utf-8 -*-
|
1
|
+
# frozen_string_literal: true
|
5
2
|
|
6
|
-
|
7
|
-
s.name = %q{groupie}
|
8
|
-
s.version = "0.2.2"
|
3
|
+
require_relative 'lib/groupie/version'
|
9
4
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
s.extra_rdoc_files = [
|
16
|
-
"LICENSE"
|
17
|
-
]
|
18
|
-
s.files = [
|
19
|
-
".document",
|
20
|
-
"LICENSE",
|
21
|
-
"Rakefile",
|
22
|
-
"VERSION",
|
23
|
-
"groupie.gemspec",
|
24
|
-
"lib/groupie.rb",
|
25
|
-
"lib/groupie/core_ext/string.rb",
|
26
|
-
"lib/groupie/group.rb",
|
27
|
-
"readme.rdoc",
|
28
|
-
"spec/fixtures/ham/email_ham1.txt",
|
29
|
-
"spec/fixtures/ham/spam.la-44116217.txt",
|
30
|
-
"spec/fixtures/spam/email_spam1.txt",
|
31
|
-
"spec/fixtures/spam/email_spam2.txt",
|
32
|
-
"spec/fixtures/spam/spam.la-44118014.txt",
|
33
|
-
"spec/groupie/core_ext/string_spec.rb",
|
34
|
-
"spec/groupie/group_spec.rb",
|
35
|
-
"spec/groupie_spec.rb",
|
36
|
-
"spec/spec_helper.rb"
|
37
|
-
]
|
38
|
-
s.homepage = %q{http://github.com/Narnach/groupie}
|
39
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
40
|
-
s.require_paths = ["lib"]
|
41
|
-
s.rubygems_version = %q{1.3.7}
|
42
|
-
s.summary = %q{Group and classify text}
|
43
|
-
s.test_files = [
|
44
|
-
"spec/groupie/core_ext/string_spec.rb",
|
45
|
-
"spec/groupie/group_spec.rb",
|
46
|
-
"spec/groupie_spec.rb",
|
47
|
-
"spec/spec_helper.rb"
|
48
|
-
]
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'groupie'
|
7
|
+
spec.version = Groupie::VERSION
|
8
|
+
spec.authors = ['Wes Oldenbeuving']
|
9
|
+
spec.email = ['wes@narnach.com']
|
49
10
|
|
50
|
-
|
51
|
-
|
52
|
-
|
11
|
+
spec.summary = 'Library to help you group texts and classify new ones'
|
12
|
+
spec.description = 'Groupie is a simple way to group texts and classify new texts as being a likely member' \
|
13
|
+
' of one of the defined groups. Think of bayesian spam filters.'
|
14
|
+
spec.homepage = 'https://github.com/Narnach/groupie'
|
15
|
+
spec.license = 'MIT'
|
16
|
+
spec.required_ruby_version = '>= 2.6.0' # EOL for 2.6 is 2022-03-31, so support this as the minimum for now
|
53
17
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
18
|
+
spec.metadata['homepage_uri'] = spec.homepage
|
19
|
+
spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
|
20
|
+
spec.metadata['changelog_uri'] = 'https://github.com/Narnach/groupie/blob/stable/CHANGELOG.md'
|
21
|
+
# Require multi-factor authentication to publish the gem
|
22
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
23
|
+
|
24
|
+
# Specify which files should be added to the gem when it is released.
|
25
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
26
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
27
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
|
61
28
|
end
|
62
|
-
|
29
|
+
# spec.bindir = "exe"
|
30
|
+
# spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
31
|
+
spec.require_paths = ['lib']
|
63
32
|
|
33
|
+
# Uncomment to register a new dependency of your gem
|
34
|
+
# spec.add_dependency "example-gem", "~> 1.0"
|
35
|
+
|
36
|
+
# For more information and examples about making a new gem, checkout our
|
37
|
+
# guide at: https://bundler.io/guides/creating_gem.html
|
38
|
+
end
|
data/lib/groupie/group.rb
CHANGED
@@ -1,10 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class Groupie
|
4
|
+
# Group represents a group or category that words can be classified into.
|
2
5
|
class Group
|
3
|
-
|
6
|
+
attr_reader :word_counts, :total_word_count
|
7
|
+
|
8
|
+
def initialize(name, groupie)
|
4
9
|
@name = name
|
10
|
+
@groupie = groupie
|
5
11
|
@word_counts = {}
|
12
|
+
@total_word_count = 0
|
6
13
|
end
|
7
|
-
|
14
|
+
|
8
15
|
def words
|
9
16
|
@word_counts.keys
|
10
17
|
end
|
@@ -17,16 +24,23 @@ class Groupie
|
|
17
24
|
nil
|
18
25
|
end
|
19
26
|
|
27
|
+
alias << add
|
28
|
+
|
20
29
|
# Return the count for a specific +word+.
|
21
30
|
def count(word)
|
22
31
|
@word_counts[word] || 0
|
23
32
|
end
|
24
33
|
|
34
|
+
private
|
35
|
+
|
25
36
|
# Add a single word and count it.
|
26
37
|
def add_word(word)
|
27
38
|
@word_counts[word] ||= 0
|
28
|
-
@word_counts[word] += 1
|
39
|
+
current_count = @word_counts[word] += 1
|
40
|
+
@total_word_count += 1
|
41
|
+
# If this word is new for this Group, it might be new for the entire Groupie
|
42
|
+
@groupie.add_word(word) if current_count == 1
|
43
|
+
nil
|
29
44
|
end
|
30
|
-
private :add_word
|
31
45
|
end
|
32
|
-
end
|
46
|
+
end
|
data/lib/groupie.rb
CHANGED
@@ -1,72 +1,166 @@
|
|
1
|
-
|
2
|
-
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
|
3
|
-
require 'groupie/group'
|
4
|
-
require 'groupie/core_ext/string'
|
1
|
+
# frozen_string_literal: true
|
5
2
|
|
3
|
+
require_relative 'groupie/version'
|
4
|
+
require_relative 'groupie/group'
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
# Groupie is a text grouper and classifier, using naive Bayesian filtering.
|
6
8
|
class Groupie
|
7
|
-
|
9
|
+
# Wrap all errors we raise in this so our own errors are recognizable.
|
10
|
+
class Error < StandardError; end
|
11
|
+
|
12
|
+
attr_accessor :smart_weight
|
13
|
+
|
14
|
+
# @param [true, false] smart_weight (false) Whether smart weight is enabled or not.
|
15
|
+
def initialize(smart_weight: false)
|
8
16
|
@groups = {}
|
17
|
+
@smart_weight = smart_weight
|
18
|
+
@known_words = Set.new
|
9
19
|
end
|
10
20
|
|
21
|
+
# Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
|
22
|
+
# This attempts to remove most common punctuation marks and types of whitespace.
|
23
|
+
#
|
24
|
+
# @param [String, #to_s] object
|
25
|
+
# @return [Array<String>]
|
26
|
+
def self.tokenize(object)
|
27
|
+
object
|
28
|
+
.to_s
|
29
|
+
.downcase
|
30
|
+
.gsub(/\s/, ' ')
|
31
|
+
.gsub(/[$']/, '')
|
32
|
+
.gsub(/<[^>]+?>|[^\w -.,]/, '')
|
33
|
+
.split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
|
34
|
+
end
|
35
|
+
|
36
|
+
# Access an existing Group or create a new one.
|
37
|
+
#
|
38
|
+
# @param [Object] group The name of the group to access.
|
39
|
+
# @return [Groupie::Group] An existing or new group identified by +group+.
|
11
40
|
def [](group)
|
12
|
-
@groups[group] ||= Group.new(group)
|
41
|
+
@groups[group] ||= Group.new(group, self)
|
13
42
|
end
|
14
43
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
raise "Invalid strategy: #{strategy}"
|
28
|
-
end
|
29
|
-
next sum
|
44
|
+
# Classify a text by taking the average of all word classifications.
|
45
|
+
#
|
46
|
+
# @param [Array<String>] words List of words to be classified
|
47
|
+
# @param [Symbol] strategy
|
48
|
+
# @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
|
49
|
+
# @raise [Groupie::Error] Raise when an invalid strategy is provided
|
50
|
+
def classify_text(words, strategy = :sum)
|
51
|
+
words &= unique_words if strategy == :unique
|
52
|
+
group_score_sums, hits = calculate_group_scores(words, strategy)
|
53
|
+
|
54
|
+
group_score_sums.each.with_object({}) do |(group, sum), averages|
|
55
|
+
averages[group] = hits.positive? ? sum / hits : 0
|
30
56
|
end
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
57
|
+
end
|
58
|
+
|
59
|
+
# Classify a single word against all groups, returning the probability distribution.
|
60
|
+
#
|
61
|
+
# @param [String] entry A word to be classified
|
62
|
+
# @param [Symbol] strategy (:sum) the strategy to use on the score
|
63
|
+
# @return [Hash<Object, Float>] Hash with <group, probability> pairings.
|
64
|
+
# Probabilities are always in 0.0..1.0, and add up to 1.0 (i.e. it's a probability distribution)
|
65
|
+
# @raise [Groupie::Error] Raise when an invalid strategy is provided
|
66
|
+
def classify(entry, strategy = :sum)
|
67
|
+
# Calculate default weight once outside of the loop
|
68
|
+
default_weight = self.default_weight
|
69
|
+
# Each group calculates the count, then reduces it to a score: <group name, score>
|
70
|
+
per_group_score = @groups.transform_values do |group|
|
71
|
+
apply_count_strategy(default_weight + group.count(entry), strategy)
|
45
72
|
end
|
46
|
-
|
73
|
+
# When we have no scores, we have no results, so abort early
|
74
|
+
# Note that when smart_weight is enabled we always have a score.
|
75
|
+
total_score = per_group_score.values.sum
|
76
|
+
return {} if total_score.zero?
|
77
|
+
|
78
|
+
# Final results must be within 0.0..1.0, so divide each score by the total score
|
79
|
+
per_group_score.transform_values { |group_score| group_score.to_f / total_score }
|
47
80
|
end
|
48
81
|
|
49
|
-
#
|
50
|
-
|
82
|
+
# Return the list of known words, excluding the 4th quartile of most popular words.
|
83
|
+
# Why do this? So the most common (and thus meaningless) words are ignored
|
84
|
+
# and less common words gain more predictive power.
|
85
|
+
#
|
86
|
+
# This is used by the :unique strategy of the classifier.
|
87
|
+
#
|
88
|
+
# @return [Array<String>]
|
89
|
+
def unique_words
|
90
|
+
# Iterate over all Groups and merge their <word, count> dictionaries into one
|
91
|
+
total_count = @groups.inject({}) do |total, (_name, group)|
|
92
|
+
total.merge!(group.word_counts) { |_key, o, n| o + n }
|
93
|
+
end
|
94
|
+
# Extract the word count that's at the top 75%
|
95
|
+
top_quartile_index = [((total_count.size * 3) / 4) - 1, 1].max
|
96
|
+
top_quartile_frequency = total_count.values.sort[top_quartile_index]
|
97
|
+
# Throw out all words which have a count that's above this frequency
|
98
|
+
total_count.reject! { |_word, count| count > top_quartile_frequency }
|
99
|
+
total_count.keys
|
100
|
+
end
|
101
|
+
|
102
|
+
# Default weight is used when +smart_weight+ is enabled.
|
103
|
+
# Each word's count is increased by the +default_weight+ value,
|
104
|
+
# which is the average frequency of each unique word we know about.
|
105
|
+
#
|
106
|
+
# Example: if we have indexed 1000 total words, of which 500 were unique,
|
107
|
+
# the default_weight would be 1000/500=2.0
|
108
|
+
#
|
109
|
+
# @return [Float] The default weight for all words
|
110
|
+
def default_weight
|
111
|
+
# Default weight only applies when smart weight is enabled
|
112
|
+
return 0.0 unless smart_weight
|
113
|
+
|
114
|
+
# If we don't know any words, the weight is also zero
|
115
|
+
return 0.0 unless @known_words.any?
|
116
|
+
|
117
|
+
# Gather counts and calculate
|
118
|
+
total_words = @groups.each_value.sum(&:total_word_count)
|
119
|
+
total_unique_words = @known_words.count
|
120
|
+
total_words / total_unique_words.to_f
|
121
|
+
end
|
122
|
+
|
123
|
+
# Internal method (public only so Group can call it) used by Groups to register known words with this Groupie.
|
124
|
+
def add_word(word)
|
125
|
+
@known_words << word
|
126
|
+
end
|
127
|
+
|
128
|
+
private
|
129
|
+
|
130
|
+
# Calculate grouped scores
|
131
|
+
#
|
132
|
+
# @param [Array<String>] words
|
133
|
+
# @param [Symbol] strategy
|
134
|
+
# @return [Array(Hash<Object, Float>, Integer)] a Hash with <group, score> pairs and an integer with the number of hits
|
135
|
+
def calculate_group_scores(words, strategy)
|
51
136
|
hits = 0
|
52
|
-
group_score_sums = words.
|
137
|
+
group_score_sums = words.each.with_object({}) do |word, results|
|
53
138
|
word_results = classify(word, strategy)
|
54
139
|
next results if word_results.empty?
|
55
|
-
hits += 1
|
56
|
-
results.merge(word_results) do |key, old, new|
|
57
|
-
old + new
|
58
|
-
end
|
59
|
-
end
|
60
140
|
|
61
|
-
|
62
|
-
|
63
|
-
averages[group] = hits > 0 ? sum / hits : 0
|
141
|
+
hits += 1
|
142
|
+
results.merge!(word_results) { |_key, old, new| old + new }
|
64
143
|
end
|
65
144
|
|
66
|
-
|
145
|
+
[group_score_sums, hits]
|
67
146
|
end
|
68
147
|
|
69
|
-
|
70
|
-
|
148
|
+
# Helper function to reduce a raw word count to a strategy-modified weight.
|
149
|
+
# @param [Integer, Float] count
|
150
|
+
# @param [Symbol] strategy
|
151
|
+
# @return [Integer, Float]
|
152
|
+
# @raise [Groupie::Error] Raise when an invalid strategy is provided
|
153
|
+
def apply_count_strategy(count, strategy)
|
154
|
+
case strategy
|
155
|
+
when :sum
|
156
|
+
# keep count
|
157
|
+
when :sqrt, :unique
|
158
|
+
count = Math.sqrt(count)
|
159
|
+
when :log
|
160
|
+
count = Math.log10(count) if count.positive?
|
161
|
+
else
|
162
|
+
raise Error, "Invalid strategy: #{strategy}"
|
163
|
+
end
|
164
|
+
count
|
71
165
|
end
|
72
|
-
end
|
166
|
+
end
|
metadata
CHANGED
@@ -1,100 +1,71 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: groupie
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease: false
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 2
|
9
|
-
- 2
|
10
|
-
version: 0.2.2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.0
|
11
5
|
platform: ruby
|
12
|
-
authors:
|
6
|
+
authors:
|
13
7
|
- Wes Oldenbeuving
|
14
|
-
autorequire:
|
8
|
+
autorequire:
|
15
9
|
bindir: bin
|
16
10
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
prerelease: false
|
24
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ">="
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
hash: 3
|
30
|
-
segments:
|
31
|
-
- 0
|
32
|
-
version: "0"
|
33
|
-
type: :development
|
34
|
-
version_requirements: *id001
|
35
|
-
description: Group and classify text based on likelyhood of being included in a text of a specific category
|
36
|
-
email: narnach@gmail.com
|
11
|
+
date: 2022-02-16 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Groupie is a simple way to group texts and classify new texts as being
|
14
|
+
a likely member of one of the defined groups. Think of bayesian spam filters.
|
15
|
+
email:
|
16
|
+
- wes@narnach.com
|
37
17
|
executables: []
|
38
|
-
|
39
18
|
extensions: []
|
40
|
-
|
41
|
-
|
42
|
-
-
|
43
|
-
|
44
|
-
- .
|
45
|
-
-
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- ".github/ISSUE_TEMPLATE/bug_report.md"
|
22
|
+
- ".github/ISSUE_TEMPLATE/feature_request.md"
|
23
|
+
- ".github/dependabot.yml"
|
24
|
+
- ".github/workflows/gem.yml"
|
25
|
+
- ".github/workflows/rspec.yml"
|
26
|
+
- ".github/workflows/rubocop.yml"
|
27
|
+
- ".gitignore"
|
28
|
+
- ".rspec"
|
29
|
+
- ".rubocop.yml"
|
30
|
+
- CHANGELOG.md
|
31
|
+
- Gemfile
|
32
|
+
- Gemfile.lock
|
33
|
+
- LICENSE.txt
|
34
|
+
- README.md
|
46
35
|
- Rakefile
|
47
|
-
-
|
36
|
+
- SECURITY.md
|
37
|
+
- bin/console
|
38
|
+
- bin/rubocop
|
39
|
+
- bin/setup
|
48
40
|
- groupie.gemspec
|
49
41
|
- lib/groupie.rb
|
50
|
-
- lib/groupie/core_ext/string.rb
|
51
42
|
- lib/groupie/group.rb
|
52
|
-
-
|
53
|
-
|
54
|
-
|
55
|
-
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
licenses: []
|
65
|
-
|
66
|
-
post_install_message:
|
67
|
-
rdoc_options:
|
68
|
-
- --charset=UTF-8
|
69
|
-
require_paths:
|
43
|
+
- lib/groupie/version.rb
|
44
|
+
homepage: https://github.com/Narnach/groupie
|
45
|
+
licenses:
|
46
|
+
- MIT
|
47
|
+
metadata:
|
48
|
+
homepage_uri: https://github.com/Narnach/groupie
|
49
|
+
source_code_uri: https://github.com/Narnach/groupie
|
50
|
+
changelog_uri: https://github.com/Narnach/groupie/blob/stable/CHANGELOG.md
|
51
|
+
rubygems_mfa_required: 'true'
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
require_paths:
|
70
55
|
- lib
|
71
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
-
|
73
|
-
requirements:
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
74
58
|
- - ">="
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
version: "0"
|
80
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
|
-
requirements:
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 2.6.0
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
83
63
|
- - ">="
|
84
|
-
- !ruby/object:Gem::Version
|
85
|
-
|
86
|
-
segments:
|
87
|
-
- 0
|
88
|
-
version: "0"
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
89
66
|
requirements: []
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
summary: Group and classify text
|
96
|
-
test_files:
|
97
|
-
- spec/groupie/core_ext/string_spec.rb
|
98
|
-
- spec/groupie/group_spec.rb
|
99
|
-
- spec/groupie_spec.rb
|
100
|
-
- spec/spec_helper.rb
|
67
|
+
rubygems_version: 3.3.3
|
68
|
+
signing_key:
|
69
|
+
specification_version: 4
|
70
|
+
summary: Library to help you group texts and classify new ones
|
71
|
+
test_files: []
|
data/.document
DELETED
data/LICENSE
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
Copyright (c) 2009 Wes Oldenbeuving
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
-
a copy of this software and associated documentation files (the
|
5
|
-
"Software"), to deal in the Software without restriction, including
|
6
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
-
permit persons to whom the Software is furnished to do so, subject to
|
9
|
-
the following conditions:
|
10
|
-
|
11
|
-
The above copyright notice and this permission notice shall be
|
12
|
-
included in all copies or substantial portions of the Software.
|
13
|
-
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.2.2
|
@@ -1,17 +0,0 @@
|
|
1
|
-
class Groupie
|
2
|
-
module CoreExt
|
3
|
-
module String
|
4
|
-
def tokenize
|
5
|
-
downcase.
|
6
|
-
gsub(/\s/," ").
|
7
|
-
gsub(/[$']/,'').
|
8
|
-
gsub(/<[^>]+?>|[^\w -.,]/,'').
|
9
|
-
split(" ").map {|str| str.gsub(/\A['"]+|[!,."']+\Z/,'')}
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
class String
|
16
|
-
include Groupie::CoreExt::String
|
17
|
-
end
|
data/readme.rdoc
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
= Groupie
|
2
|
-
|
3
|
-
Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of bayesian spam filters.
|
4
|
-
|
5
|
-
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
6
|
-
|
7
|
-
== Goals
|
8
|
-
|
9
|
-
Groupie is a 'fun' project that has the following goals, in descending order of importance:
|
10
|
-
* Have fun playing with code
|
11
|
-
* Play with Bayesian-like (spam) filtering
|
12
|
-
|
13
|
-
== Current functionality
|
14
|
-
|
15
|
-
Current funcionality includes:
|
16
|
-
* Tokenize an input text to prepare it for grouping.
|
17
|
-
* Strip XML and HTML tag.
|
18
|
-
* Keep certain infix characters, such as period and comma.
|
19
|
-
* Add texts (as an Array of Strings) to any number of groups.
|
20
|
-
* Classify a single word to check the likelihood it belongs to each group.
|
21
|
-
* Do classification for complete (tokenized) texts.
|
22
|
-
|
23
|
-
== License
|
24
|
-
|
25
|
-
As always, the code is licensed under the MIT license.
|
26
|
-
|
27
|
-
Wes Oldenbeuving
|
@@ -1,13 +0,0 @@
|
|
1
|
-
Re: [ubuntu-art] [Breathe] Network Manager-icons
|
2
|
-
Am Sonntag, den 31.05.2009, 17:53 +0200 schrieb Steve Dodier:
|
3
|
-
> Hello,
|
4
|
-
>
|
5
|
-
> I think the notify-osd icons have a completely different style, which
|
6
|
-
> is looking great within the notification bubbles, but i doubt it'd
|
7
|
-
> look great to have the notify-osd wifi icons in the panel. I think the
|
8
|
-
> drawing of the notification- wifi icons should be done afterwards, and
|
9
|
-
> if they should be based on those of the icon set, they could be made
|
10
|
-
> smoother, and possibly desaturated for some of them, to avoid drawing
|
11
|
-
> too much attention from the user when popping up.
|
12
|
-
>
|
13
|
-
> Cordially, SD.
|