groupie 0.2.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +32 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +24 -0
- data/.github/dependabot.yml +13 -0
- data/.github/workflows/gem.yml +16 -0
- data/.github/workflows/rspec.yml +22 -0
- data/.github/workflows/rubocop.yml +26 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.rubocop.yml +53 -0
- data/CHANGELOG.md +107 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +71 -0
- data/LICENSE.txt +21 -0
- data/README.md +140 -0
- data/Rakefile +7 -48
- data/SECURITY.md +18 -0
- data/bin/console +15 -0
- data/bin/rubocop +2 -0
- data/bin/setup +9 -0
- data/groupie.gemspec +32 -57
- data/lib/groupie/group.rb +19 -5
- data/lib/groupie/version.rb +10 -0
- data/lib/groupie.rb +145 -51
- metadata +56 -85
- data/.document +0 -5
- data/LICENSE +0 -20
- data/VERSION +0 -1
- data/lib/groupie/core_ext/string.rb +0 -17
- data/readme.rdoc +0 -27
- data/spec/fixtures/ham/email_ham1.txt +0 -13
- data/spec/fixtures/ham/spam.la-44116217.txt +0 -79
- data/spec/fixtures/spam/email_spam1.txt +0 -5
- data/spec/fixtures/spam/email_spam2.txt +0 -7
- data/spec/fixtures/spam/spam.la-44118014.txt +0 -73
- data/spec/groupie/core_ext/string_spec.rb +0 -37
- data/spec/groupie/group_spec.rb +0 -12
- data/spec/groupie_spec.rb +0 -130
- data/spec/spec_helper.rb +0 -1
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: dd7e47aef7d4ed19c206e46d5eb716991562ee2c257c5cf8488341f646801f2f
|
4
|
+
data.tar.gz: 692108e3c8c2b7d4b26a3c7d702133d0a5e01324d0c42cd9c4143b3a4b601b6c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: dba999961a8c6d7ba2999770d9125dc1c4b2c6468452c115a0eeb3256d8d484ef0cba8d975053e5e4e7d907afaf402063fb0b75c6d1c24b5bd3dc7698a251f35
|
7
|
+
data.tar.gz: cb5d99d029d237a37354b81b97a4d0a03b6224040ddbbbd0cd96cb3536ea0cfe4ecfb55871e35749b133df0178c97937f2b98523dc3504e4620b3a2473403478
|
@@ -0,0 +1,32 @@
|
|
1
|
+
---
|
2
|
+
name: Bug report
|
3
|
+
about: Create a report to help us improve
|
4
|
+
title: ''
|
5
|
+
labels: ''
|
6
|
+
assignees: ''
|
7
|
+
|
8
|
+
---
|
9
|
+
|
10
|
+
## Describe the bug
|
11
|
+
|
12
|
+
A clear and concise description of what the bug is.
|
13
|
+
|
14
|
+
## How to reproduce it
|
15
|
+
|
16
|
+
Include the minimum code sample required to reproduce the bug.
|
17
|
+
|
18
|
+
## Expected behavior
|
19
|
+
|
20
|
+
A clear and concise description of what you expected to happen.
|
21
|
+
|
22
|
+
## Context
|
23
|
+
|
24
|
+
Please describe:
|
25
|
+
|
26
|
+
* What version of Groupie did you use? (`bundle list|grep groupie`)
|
27
|
+
* What version of Ruby did you use? (`ruby -v`)
|
28
|
+
* What platform and architecture do you use? (macOS version (Intel vs Apple M1), Windows version, Linux distro/architecture)
|
29
|
+
|
30
|
+
## Additional context
|
31
|
+
|
32
|
+
Add any other context about the problem here.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
---
|
2
|
+
name: Feature request
|
3
|
+
about: Suggest an idea for this project
|
4
|
+
title: ''
|
5
|
+
labels: ''
|
6
|
+
assignees: ''
|
7
|
+
|
8
|
+
---
|
9
|
+
|
10
|
+
## Is your feature request related to a problem? Please describe.
|
11
|
+
|
12
|
+
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
13
|
+
|
14
|
+
## Describe the solution you'd like
|
15
|
+
|
16
|
+
A clear and concise description of what you want to happen.
|
17
|
+
|
18
|
+
## Describe alternatives you've considered
|
19
|
+
|
20
|
+
A clear and concise description of any alternative solutions or features you've considered.
|
21
|
+
|
22
|
+
## Additional context
|
23
|
+
|
24
|
+
Add any other context or screenshots about the feature request here.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically
|
2
|
+
# There are a lot of options, but for now let's keep it simple.
|
3
|
+
# Check every week (Monday 5:00 CET by default) for updates.
|
4
|
+
# Each package manager gets 5 non-security updates by default.
|
5
|
+
# Security updates bypass most configuration here and show up when found.
|
6
|
+
# We rely on DepFu for Ruby and Gem updates
|
7
|
+
version: 2
|
8
|
+
updates:
|
9
|
+
- package-ecosystem: github-actions
|
10
|
+
# Github Actions are checked for updates
|
11
|
+
directory: '/'
|
12
|
+
schedule:
|
13
|
+
interval: weekly
|
@@ -0,0 +1,16 @@
|
|
1
|
+
name: Gem building
|
2
|
+
|
3
|
+
on: [ pull_request ]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
steps:
|
9
|
+
- uses: actions/checkout@v2
|
10
|
+
- name: Set up Ruby
|
11
|
+
uses: ruby/setup-ruby@v1
|
12
|
+
with:
|
13
|
+
ruby-version: 3.0
|
14
|
+
bundler-cache: true
|
15
|
+
- name: Build the gem
|
16
|
+
run: bundle exec rake build
|
@@ -0,0 +1,22 @@
|
|
1
|
+
name: RSpec
|
2
|
+
|
3
|
+
on: [ push, pull_request ]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
matrix:
|
10
|
+
# Maintained versions: 2.7, 3.0, 3.1
|
11
|
+
# Security updates only: 2.6 (EOL: 2022-03-31)
|
12
|
+
# Source: https://www.ruby-lang.org/en/downloads/branches/
|
13
|
+
ruby: [ 2.6, 2.7, 3.0, 3.1 ]
|
14
|
+
steps:
|
15
|
+
- uses: actions/checkout@v2
|
16
|
+
- name: Set up Ruby
|
17
|
+
uses: ruby/setup-ruby@v1
|
18
|
+
with:
|
19
|
+
ruby-version: ${{ matrix.ruby }}
|
20
|
+
bundler-cache: true
|
21
|
+
- name: Run the tests
|
22
|
+
run: bundle exec rspec
|
@@ -0,0 +1,26 @@
|
|
1
|
+
name: Code Quality
|
2
|
+
on: [ pull_request ]
|
3
|
+
jobs:
|
4
|
+
rubocop:
|
5
|
+
name: Rubocop
|
6
|
+
runs-on: ubuntu-latest
|
7
|
+
steps:
|
8
|
+
- name: Check out code
|
9
|
+
uses: actions/checkout@v2
|
10
|
+
- name: Install Ruby & Gems
|
11
|
+
uses: ruby/setup-ruby@v1 # Uses .ruby-version as version input
|
12
|
+
with:
|
13
|
+
ruby-version: 3.0
|
14
|
+
bundler-cache: true
|
15
|
+
- name: Rubocop
|
16
|
+
# https://github.com/reviewdog/action-rubocop
|
17
|
+
uses: reviewdog/action-rubocop@v2
|
18
|
+
with:
|
19
|
+
fail_on_error: true
|
20
|
+
filter_mode: nofilter
|
21
|
+
github_token: ${{ secrets.github_token }}
|
22
|
+
reporter: github-pr-review
|
23
|
+
rubocop_version: gemfile
|
24
|
+
# Rely on Bundler-installed gems so don't install them again
|
25
|
+
use_bundler: true
|
26
|
+
skip_install: true
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-rspec
|
3
|
+
- rubocop-rake
|
4
|
+
- rubocop-performance
|
5
|
+
|
6
|
+
AllCops:
|
7
|
+
# We bump the version to get new cops, so enable them by default
|
8
|
+
NewCops: enable
|
9
|
+
|
10
|
+
# Two lines should fit next to each other in split view on a widescreen
|
11
|
+
Layout/LineLength:
|
12
|
+
Max: 120
|
13
|
+
|
14
|
+
# We still have old-style rspec checks, so this triggers on functional comparisons there
|
15
|
+
Lint/Void:
|
16
|
+
Exclude:
|
17
|
+
- 'spec/**/*_spec.rb'
|
18
|
+
|
19
|
+
# RSpec has a lot of blocks, so ignore this rule there
|
20
|
+
Metrics/BlockLength:
|
21
|
+
Exclude:
|
22
|
+
- 'spec/**/*_spec.rb'
|
23
|
+
|
24
|
+
# I prefer to see the class name over "described_class"
|
25
|
+
RSpec/DescribedClass:
|
26
|
+
EnforcedStyle: explicit
|
27
|
+
|
28
|
+
# I prefer groups for structure, so the defaults are a little too strict for me
|
29
|
+
RSpec/NestedGroups:
|
30
|
+
Max: 4
|
31
|
+
|
32
|
+
# I prefer more verbose examples, so tend to use more lines than the defaults
|
33
|
+
RSpec/ExampleLength:
|
34
|
+
Max: 20
|
35
|
+
|
36
|
+
# For strings I enjoy using %w[], but for symbols the %i[] syntax just does not click.
|
37
|
+
Style/SymbolArray:
|
38
|
+
EnforcedStyle: brackets
|
39
|
+
|
40
|
+
# Indentation is something I've got strong opinions about which differ from Rubocop.
|
41
|
+
Layout/ArgumentAlignment:
|
42
|
+
EnforcedStyle: with_fixed_indentation # default is with_first_argument
|
43
|
+
Layout/ArrayAlignment:
|
44
|
+
EnforcedStyle: with_fixed_indentation # default is with_first_element
|
45
|
+
Layout/FirstArgumentIndentation:
|
46
|
+
EnforcedStyle: consistent # default is special_for_inner_method_call_in_parentheses
|
47
|
+
Layout/FirstArrayElementIndentation:
|
48
|
+
EnforcedStyle: consistent # default is special_inside_parentheses
|
49
|
+
Layout/FirstHashElementIndentation:
|
50
|
+
EnforcedStyle: consistent # default is special_inside_parentheses
|
51
|
+
# Let's enforce this to be consistent
|
52
|
+
Layout/EndOfLine:
|
53
|
+
EnforcedStyle: lf # \n (unix line end) enforced everywhere, default is native
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
## Unreleased changes
|
2
|
+
|
3
|
+
## Version 0.5.0 -- 2022-02-16
|
4
|
+
|
5
|
+
This release has breaking changes (deprecation cleanup and internals rework), a new feature (smart weights!) and is officially tested on Ruby 3.1.0 (it's what I use). I've enabled the setting to require MFA to publish this gem, to help protect those who use it.
|
6
|
+
|
7
|
+
- Breaking: remove `String#tokenize` core extension; please use `Groupie.tokenize(string)` instead [#39](https://github.com/Narnach/groupie/pull/39)
|
8
|
+
- Breaking: due to changed internals, YAML serialized data from 0.4.x will lack some of the new internal caches. I'd suggest loading the old data and adding the words from each group to a new (0.5.x) instance of Groupie. [#40](https://github.com/Narnach/groupie/pull/40)
|
9
|
+
- Feat: add support for smart default weights, reducing the effect of low data on predictions [#40](https://github.com/Narnach/groupie/pull/40)
|
10
|
+
- Deps: add Ruby 3.1 to list of tested & supported Ruby versions
|
11
|
+
- Chore: require multi-factor authentication to publish gem updates
|
12
|
+
- Chore: add SECURITY.md to advertise a security policy
|
13
|
+
- Style: addressed Lint/AmbiguousOperatorPrecedence
|
14
|
+
- Dev: bump development dependencies multiple times
|
15
|
+
- Dev: switch to DepFu to manage development dependencies
|
16
|
+
|
17
|
+
## Version 0.4.1 -- 2021-09-08
|
18
|
+
|
19
|
+
Non-functional fixes to the CI config and Rubygems.org metadata.
|
20
|
+
|
21
|
+
- Fix: correct changelog uri for gem
|
22
|
+
- CI: fix dependabot config
|
23
|
+
|
24
|
+
## Version 0.4.0 -- 2021-09-07
|
25
|
+
|
26
|
+
Welcome to 2021, where Ruby version 2.6 is the lowest with official support, Bundler is the default for managing packages and RSpec version 3 is used to test things. This version updates Groupie into this decade.
|
27
|
+
|
28
|
+
- Refactor: update Groupie to 2021 standards
|
29
|
+
- Feat: raise Groupie::Error instead of RuntimeError
|
30
|
+
- Feat: deprecate String#tokenize in favor of Groupie.tokenize
|
31
|
+
- Doc: document API of Groupie
|
32
|
+
- Doc: update readme with examples
|
33
|
+
- Refactor: reorder Groupie methods by importance
|
34
|
+
- Refactor: simplify Groupie#classify
|
35
|
+
- Refactor: reduce complexity of Groupie#unique_words
|
36
|
+
- Refactor: simplify Groupie#classify\_text
|
37
|
+
|
38
|
+
## Version 0.3.0 -- 2010-07-29
|
39
|
+
|
40
|
+
Multiple changes to the 'unique words' strategy, hopefully improving the behavior.
|
41
|
+
|
42
|
+
- Cache unique words in an instance var to reduce time required to do subsequent lookups
|
43
|
+
- Sanity spec
|
44
|
+
- Unique strategy now includes all words except for the global 4th quartile
|
45
|
+
- Unique strategy changed yet again: only ignore words that occur more than their group's median
|
46
|
+
- Unique strategy now behaves like sqrt that only checks unique words
|
47
|
+
- Unique word finder uses less elegant but (hopefully) faster code
|
48
|
+
- Removed gemspec
|
49
|
+
|
50
|
+
## Version 0.2.3 -- 2010-07-29
|
51
|
+
|
52
|
+
Add a new 'unique words' strategy, which ignores words that occur in all categories.
|
53
|
+
|
54
|
+
- Added 'unique' classification strategy
|
55
|
+
- Added Group#<< as alias for Group#add
|
56
|
+
- Updated readme
|
57
|
+
|
58
|
+
## Version 0.2.2 -- 2010-07-25
|
59
|
+
|
60
|
+
Bugfix for log10 strategy.
|
61
|
+
|
62
|
+
- Fixed log10 strategy counting for Groupie.classify
|
63
|
+
|
64
|
+
## Version 0.2.1 -- 2010-07-25
|
65
|
+
|
66
|
+
Offer multiple ways to weigh word counts in calculating final scores.
|
67
|
+
|
68
|
+
- Added sqrt and log word counting strategies
|
69
|
+
|
70
|
+
## Version 0.2.0 -- 2010-07-25
|
71
|
+
|
72
|
+
Classification can't raise division by zero errors anymore.
|
73
|
+
|
74
|
+
- Groupie.classify_text ignores unclassified tokens
|
75
|
+
|
76
|
+
## Version 0.1.1 -- 2010-07-25
|
77
|
+
|
78
|
+
Swap test framework and tokenization improvements.
|
79
|
+
|
80
|
+
- Regenerated gemspec
|
81
|
+
- Strip quotes from tokens
|
82
|
+
- Replaced testy tests with rspec
|
83
|
+
|
84
|
+
## Version 0.1.0 -- 2010-07-25
|
85
|
+
|
86
|
+
The initial release as a gem, after working on this on/off over a year.
|
87
|
+
|
88
|
+
- Added gemspec
|
89
|
+
- Fixed text classification to properly average group scores
|
90
|
+
- Added test for classifying tokenized html email spam
|
91
|
+
- Classification of texts is now possible
|
92
|
+
- Added readme and MIT license
|
93
|
+
- Test the full html and headers of tokenized emails
|
94
|
+
- Support infix commas for tokenized strings
|
95
|
+
- Allow infix dots in tokenized strings
|
96
|
+
- Strip HTML tags when sanitizing a string
|
97
|
+
- Classify common words based on tokenized text from spam.la e-mails
|
98
|
+
- Added String#tokenize
|
99
|
+
- Ensure a Group will still work when loaded from YAML
|
100
|
+
- Added test helper file
|
101
|
+
- Refactored Group to maintain a Hash of words and counts instead of a list of words
|
102
|
+
- Removed obsolete method
|
103
|
+
- Added testcase for three groups
|
104
|
+
- Support multiple examples to add more weight to their grouping
|
105
|
+
- Renamed tests to reflect intent of content
|
106
|
+
- Classification now allows for a degree of certainty
|
107
|
+
- Implemented simple spam check
|
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
source 'https://rubygems.org'
|
4
|
+
|
5
|
+
# Specify your gem's dependencies in groupie.gemspec
|
6
|
+
gemspec
|
7
|
+
|
8
|
+
gem 'psych', '~> 4.0'
|
9
|
+
gem 'rake', '~> 13.0'
|
10
|
+
gem 'rspec', '~> 3.0'
|
11
|
+
gem 'rubocop', '~> 1.7'
|
12
|
+
gem 'rubocop-performance', '~> 1.11'
|
13
|
+
gem 'rubocop-rake', '~> 0.6.0'
|
14
|
+
gem 'rubocop-rspec', '~> 2.4'
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
groupie (0.5.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ast (2.4.2)
|
10
|
+
diff-lcs (1.5.0)
|
11
|
+
parallel (1.21.0)
|
12
|
+
parser (3.1.0.0)
|
13
|
+
ast (~> 2.4.1)
|
14
|
+
psych (4.0.3)
|
15
|
+
stringio
|
16
|
+
rainbow (3.1.1)
|
17
|
+
rake (13.0.6)
|
18
|
+
regexp_parser (2.2.1)
|
19
|
+
rexml (3.2.5)
|
20
|
+
rspec (3.11.0)
|
21
|
+
rspec-core (~> 3.11.0)
|
22
|
+
rspec-expectations (~> 3.11.0)
|
23
|
+
rspec-mocks (~> 3.11.0)
|
24
|
+
rspec-core (3.11.0)
|
25
|
+
rspec-support (~> 3.11.0)
|
26
|
+
rspec-expectations (3.11.0)
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
28
|
+
rspec-support (~> 3.11.0)
|
29
|
+
rspec-mocks (3.11.0)
|
30
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
31
|
+
rspec-support (~> 3.11.0)
|
32
|
+
rspec-support (3.11.0)
|
33
|
+
rubocop (1.25.1)
|
34
|
+
parallel (~> 1.10)
|
35
|
+
parser (>= 3.1.0.0)
|
36
|
+
rainbow (>= 2.2.2, < 4.0)
|
37
|
+
regexp_parser (>= 1.8, < 3.0)
|
38
|
+
rexml
|
39
|
+
rubocop-ast (>= 1.15.1, < 2.0)
|
40
|
+
ruby-progressbar (~> 1.7)
|
41
|
+
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
+
rubocop-ast (1.15.2)
|
43
|
+
parser (>= 3.0.1.1)
|
44
|
+
rubocop-performance (1.13.2)
|
45
|
+
rubocop (>= 1.7.0, < 2.0)
|
46
|
+
rubocop-ast (>= 0.4.0)
|
47
|
+
rubocop-rake (0.6.0)
|
48
|
+
rubocop (~> 1.0)
|
49
|
+
rubocop-rspec (2.8.0)
|
50
|
+
rubocop (~> 1.19)
|
51
|
+
ruby-progressbar (1.11.0)
|
52
|
+
stringio (3.0.1)
|
53
|
+
unicode-display_width (2.1.0)
|
54
|
+
|
55
|
+
PLATFORMS
|
56
|
+
x86_64-darwin-20
|
57
|
+
x86_64-darwin-21
|
58
|
+
x86_64-linux
|
59
|
+
|
60
|
+
DEPENDENCIES
|
61
|
+
groupie!
|
62
|
+
psych (~> 4.0)
|
63
|
+
rake (~> 13.0)
|
64
|
+
rspec (~> 3.0)
|
65
|
+
rubocop (~> 1.7)
|
66
|
+
rubocop-performance (~> 1.11)
|
67
|
+
rubocop-rake (~> 0.6.0)
|
68
|
+
rubocop-rspec (~> 2.4)
|
69
|
+
|
70
|
+
BUNDLED WITH
|
71
|
+
2.3.4
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2009-2021 Wes Oldenbeuving
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
# Groupie
|
2
|
+
|
3
|
+
[![Depfu](https://badges.depfu.com/badges/367956233b3b31a6fc19db4515263b9e/overview.svg)](https://depfu.com/github/Narnach/groupie?project_id=34004)
|
4
|
+
|
5
|
+
Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of bayesian spam filters.
|
6
|
+
|
7
|
+
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
8
|
+
|
9
|
+
Started and forgotten in 2009 as a short-lived experiment, in 2010 Groupie got new features when I started using it on a RSS reader project that classified news items into "Interesting" and "Not interesting" categories.
|
10
|
+
|
11
|
+
## Current functionality
|
12
|
+
|
13
|
+
Current functionality includes:
|
14
|
+
|
15
|
+
* Tokenize an input text to prepare it for grouping.
|
16
|
+
* Strip XML and HTML tags.
|
17
|
+
* Keep certain infix characters, such as period and comma.
|
18
|
+
* Add texts (as an Array of Strings) to any number of groups.
|
19
|
+
* Classify a single word to check the likelihood it belongs to each group.
|
20
|
+
* Do classification for complete (tokenized) texts.
|
21
|
+
* Pick classification strategy to weigh repeat words differently (weigh by sum, square root or log10 of words in group)
|
22
|
+
|
23
|
+
## Installation
|
24
|
+
|
25
|
+
Add this line to your application's Gemfile:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
gem 'groupie'
|
29
|
+
```
|
30
|
+
|
31
|
+
You can also let Bundler do this for you:
|
32
|
+
|
33
|
+
bundle add groupie
|
34
|
+
|
35
|
+
And then execute:
|
36
|
+
|
37
|
+
bundle install
|
38
|
+
|
39
|
+
Or install it system-wide via:
|
40
|
+
|
41
|
+
gem install groupie
|
42
|
+
|
43
|
+
## Usage
|
44
|
+
|
45
|
+
Here is an annotated console session that shows off the features available in Groupie.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
# Instantiate a new Groupie instance
|
49
|
+
groupie = Groupie.new
|
50
|
+
|
51
|
+
# Groups are defined as you use them, so let's get started by adding some pre-tokenized words
|
52
|
+
groupie[:spam].add(%w[this is obvious spam please buy our product])
|
53
|
+
groupie[:spam].add(%w[hello friend this is rich prince i have awesome bitcoin for you])
|
54
|
+
groupie[:ham].add(%w[you are invited to my awesome party just click the link to rsvp])
|
55
|
+
|
56
|
+
# Is your data less than clean? We've got a tokenizer for that!
|
57
|
+
tokens = Groupie.tokenize('Please give me your password so I can haxx0r you!')
|
58
|
+
# => ["please", "give", "me", "your", "password", "so", "i", "can", "haxx0r", "you"]
|
59
|
+
groupie[:spam].add(tokens)
|
60
|
+
|
61
|
+
# So, now let's attempt to classify a text and see if it's spam or ham:
|
62
|
+
test_tokens = %w[please click the link to reset your password for our awesome product]
|
63
|
+
groupie.classify_text(test_tokens)
|
64
|
+
# => {:spam=>0.5909090909090909, :ham=>0.4090909090909091}
|
65
|
+
# As you can see, this password reset email looks a little dodgy...
|
66
|
+
# We have multiple strategies for drawing conclusions about what group it belongs to.
|
67
|
+
# The default you saw above is :sum, it weighs each word by the total sum of occurrences.
|
68
|
+
# Let's see if it looks less bad by using different classification strategies.
|
69
|
+
|
70
|
+
# Log reduces the weight of each word to the log10 of its occurrence count:
|
71
|
+
# - Count 1 is weight 0
|
72
|
+
# - Count 10 is weight 1
|
73
|
+
# - Count 100 is weight 2
|
74
|
+
groupie.classify_text(test_tokens, :log)
|
75
|
+
# => {:spam=>0.5, :ham=>0.5}
|
76
|
+
# This is even more even, most likely because it ignores all single-count words...
|
77
|
+
|
78
|
+
# Square root algorithm is less harsh, it reduces the weight of each word to the square root of the count:
|
79
|
+
# - Count 1 is weight 1
|
80
|
+
# - Count 4 is weight 2
|
81
|
+
# - Count 9 is weight 3
|
82
|
+
groupie.classify_text(test_tokens, :sqrt)
|
83
|
+
# => {:spam=>0.5909090909090909, :ham=>0.4090909090909091}
|
84
|
+
# This seems to result in the same value as :sum
|
85
|
+
|
86
|
+
# Unique uses the same weighting algorithm as the square root, but it modifies the word dictionary:
|
87
|
+
# it discards the 25% most common words, so less common words gain higher predictive power.
|
88
|
+
groupie.classify_text(test_tokens, :unique)
|
89
|
+
# => {:spam=>0.625, :ham=>0.375}
|
90
|
+
# This looks even worse for our poor password reset email.
|
91
|
+
# In case you're curious, the ignored words in this case are:
|
92
|
+
test_tokens - (test_tokens & groupie.unique_words)
|
93
|
+
# => ["please", "to", "reset", "awesome"]
|
94
|
+
# If you'd be classifying email, you can assume that common email headers will get ignored this way.
|
95
|
+
|
96
|
+
# If you're just starting out, your incomplete data could lead to dramatic misrepresentations of the data.
|
97
|
+
# To balance against this, you can enable smart weight:
|
98
|
+
groupie.smart_weight = true
|
99
|
+
# You could also set it during initialization via Groupie.new(smart_weight: true)
|
100
|
+
# What's so useful about it? It adds a default weight to _all_ words, even the ones you haven't
|
101
|
+
# seen yet, which counter-acts the data you have. This shines in low data situations,
|
102
|
+
# reducing the impact of the few words you have seen before.
|
103
|
+
groupie.default_weight
|
104
|
+
# => 1.2285714285714286
|
105
|
+
# Classifying the same text as before should consider all words, and add this default weight to all words
|
106
|
+
# It basically gives all groups the likelihood of "claiming" a word,
|
107
|
+
# unless there is strong data to suggest otherwise.
|
108
|
+
groupie.classify_text(test_tokens)
|
109
|
+
# => {:spam=>0.5241046831955923, :ham=>0.4758953168044077}
|
110
|
+
```
|
111
|
+
|
112
|
+
Persistence can be naively done by using YAML:
|
113
|
+
|
114
|
+
```ruby
|
115
|
+
# Instantiate a new Groupie instance
|
116
|
+
groupie = Groupie.new
|
117
|
+
groupie[:spam].add(%w[assume you have a lot of data you care about])
|
118
|
+
|
119
|
+
require 'yaml'
|
120
|
+
yaml = YAML.dump(groupie)
|
121
|
+
loaded = YAML.safe_load(yaml, permitted_classes: [Groupie, Groupie::Group, Symbol])
|
122
|
+
```
|
123
|
+
|
124
|
+
I'm still experimenting with Groupie in [Infinity Feed](https://www.infinity-feed.com), so persistence is a Future Problem for me there. In development, I'm building (low data count) classifiers in memory and discarding them after use.
|
125
|
+
|
126
|
+
## Development
|
127
|
+
|
128
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. Rubocop is available via `bin/rubocop` with some friendly default settings.
|
129
|
+
|
130
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
131
|
+
|
132
|
+
To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org). For obvious reasons, only the project maintainer can do this.
|
133
|
+
|
134
|
+
## Contributing
|
135
|
+
|
136
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/Narnach/groupie.
|
137
|
+
|
138
|
+
## License
|
139
|
+
|
140
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
@@ -1,53 +1,12 @@
|
|
1
|
-
|
2
|
-
require 'rake'
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "groupie"
|
8
|
-
gem.summary = %Q{Group and classify text}
|
9
|
-
gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
|
10
|
-
gem.email = "narnach@gmail.com"
|
11
|
-
gem.homepage = "http://github.com/Narnach/groupie"
|
12
|
-
gem.authors = ["Wes Oldenbeuving"]
|
13
|
-
gem.add_development_dependency "testy", ">= 0"
|
14
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
-
end
|
16
|
-
Jeweler::GemcutterTasks.new
|
17
|
-
rescue LoadError
|
18
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
19
|
-
end
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rspec/core/rake_task'
|
20
5
|
|
21
|
-
|
22
|
-
Rake::TestTask.new(:spec) do |test|
|
23
|
-
test.libs << 'lib' << 'spec'
|
24
|
-
test.pattern = 'spec/**/*_spec.rb'
|
25
|
-
test.verbose = true
|
26
|
-
end
|
6
|
+
RSpec::Core::RakeTask.new(:spec)
|
27
7
|
|
28
|
-
|
29
|
-
require 'rcov/rcovtask'
|
30
|
-
Rcov::RcovTask.new do |test|
|
31
|
-
test.libs << 'spec'
|
32
|
-
test.pattern = 'spec/**/*_spec.rb'
|
33
|
-
test.verbose = true
|
34
|
-
end
|
35
|
-
rescue LoadError
|
36
|
-
task :rcov do
|
37
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
38
|
-
end
|
39
|
-
end
|
8
|
+
require 'rubocop/rake_task'
|
40
9
|
|
41
|
-
|
10
|
+
RuboCop::RakeTask.new
|
42
11
|
|
43
|
-
task :
|
44
|
-
|
45
|
-
require 'rake/rdoctask'
|
46
|
-
Rake::RDocTask.new do |rdoc|
|
47
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
-
|
49
|
-
rdoc.rdoc_dir = 'rdoc'
|
50
|
-
rdoc.title = "groupie #{version}"
|
51
|
-
rdoc.rdoc_files.include('readme*')
|
52
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
-
end
|
12
|
+
task default: [:spec, :rubocop]
|
data/SECURITY.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Security Policy
|
2
|
+
|
3
|
+
Groupie is inherently not a user-facing library, so possible vectors for exploitation seem small to me.
|
4
|
+
That said, in the event of a security vulnerability being found, this document describes how to report it.
|
5
|
+
|
6
|
+
## Supported Versions
|
7
|
+
|
8
|
+
As a small library with infrequent updates, I will accept bug and security reports for the current minor version.
|
9
|
+
Severe issues might be backported to previous minor versions. I'll handle this on a case-by-case basis.
|
10
|
+
|
11
|
+
## Reporting a Vulnerability
|
12
|
+
|
13
|
+
For low-risk things you can create an issue or PR.
|
14
|
+
For high-risk issues, you can email me at security@narnach.com.
|
15
|
+
|
16
|
+
## Thanks
|
17
|
+
|
18
|
+
Once we have successfully handled a security vulnerability, we'll add an attribution to the list below.
|
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'groupie'
|
6
|
+
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
9
|
+
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
+
# require "pry"
|
12
|
+
# Pry.start
|
13
|
+
|
14
|
+
require 'irb'
|
15
|
+
IRB.start(__FILE__)
|
data/bin/rubocop
ADDED