groupie 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +32 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +24 -0
- data/.github/dependabot.yml +1 -9
- data/.github/workflows/rspec.yml +4 -3
- data/.rubocop.yml +15 -0
- data/CHANGELOG.md +14 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +31 -27
- data/README.md +20 -1
- data/SECURITY.md +18 -0
- data/groupie.gemspec +2 -0
- data/lib/groupie/group.rb +9 -3
- data/lib/groupie/version.rb +1 -1
- data/lib/groupie.rb +50 -17
- metadata +7 -4
- data/lib/groupie/core_ext/string.rb +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dd7e47aef7d4ed19c206e46d5eb716991562ee2c257c5cf8488341f646801f2f
|
4
|
+
data.tar.gz: 692108e3c8c2b7d4b26a3c7d702133d0a5e01324d0c42cd9c4143b3a4b601b6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dba999961a8c6d7ba2999770d9125dc1c4b2c6468452c115a0eeb3256d8d484ef0cba8d975053e5e4e7d907afaf402063fb0b75c6d1c24b5bd3dc7698a251f35
|
7
|
+
data.tar.gz: cb5d99d029d237a37354b81b97a4d0a03b6224040ddbbbd0cd96cb3536ea0cfe4ecfb55871e35749b133df0178c97937f2b98523dc3504e4620b3a2473403478
|
@@ -0,0 +1,32 @@
|
|
1
|
+
---
|
2
|
+
name: Bug report
|
3
|
+
about: Create a report to help us improve
|
4
|
+
title: ''
|
5
|
+
labels: ''
|
6
|
+
assignees: ''
|
7
|
+
|
8
|
+
---
|
9
|
+
|
10
|
+
## Describe the bug
|
11
|
+
|
12
|
+
A clear and concise description of what the bug is.
|
13
|
+
|
14
|
+
## How to reproduce it
|
15
|
+
|
16
|
+
Include the minimum code sample required to reproduce the bug.
|
17
|
+
|
18
|
+
## Expected behavior
|
19
|
+
|
20
|
+
A clear and concise description of what you expected to happen.
|
21
|
+
|
22
|
+
## Context
|
23
|
+
|
24
|
+
Please describe:
|
25
|
+
|
26
|
+
* What version of Groupie did you use? (`bundle list|grep groupie`)
|
27
|
+
* What version of Ruby did you use? (`ruby -v`)
|
28
|
+
* What platform and architecture do you use? (macOS version (Intel vs Apple M1), Windows version, Linux distro/architecture)
|
29
|
+
|
30
|
+
## Additional context
|
31
|
+
|
32
|
+
Add any other context about the problem here.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
---
|
2
|
+
name: Feature request
|
3
|
+
about: Suggest an idea for this project
|
4
|
+
title: ''
|
5
|
+
labels: ''
|
6
|
+
assignees: ''
|
7
|
+
|
8
|
+
---
|
9
|
+
|
10
|
+
## Is your feature request related to a problem? Please describe.
|
11
|
+
|
12
|
+
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
13
|
+
|
14
|
+
## Describe the solution you'd like
|
15
|
+
|
16
|
+
A clear and concise description of what you want to happen.
|
17
|
+
|
18
|
+
## Describe alternatives you've considered
|
19
|
+
|
20
|
+
A clear and concise description of any alternative solutions or features you've considered.
|
21
|
+
|
22
|
+
## Additional context
|
23
|
+
|
24
|
+
Add any other context or screenshots about the feature request here.
|
data/.github/dependabot.yml
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# Check every week (Monday 5:00 CET by default) for updates.
|
4
4
|
# Each package manager gets 5 non-security updates by default.
|
5
5
|
# Security updates bypass most configuration here and show up when found.
|
6
|
+
# We rely on DepFu for Ruby and Gem updates
|
6
7
|
version: 2
|
7
8
|
updates:
|
8
9
|
- package-ecosystem: github-actions
|
@@ -10,12 +11,3 @@ updates:
|
|
10
11
|
directory: '/'
|
11
12
|
schedule:
|
12
13
|
interval: weekly
|
13
|
-
- package-ecosystem: bundler
|
14
|
-
# Bundler handles Ruby dependencies
|
15
|
-
directory: '/'
|
16
|
-
schedule:
|
17
|
-
interval: weekly
|
18
|
-
versioning-strategy: increase-if-necessary
|
19
|
-
allow:
|
20
|
-
- dependency-type: direct
|
21
|
-
- dependency-type: indirect
|
data/.github/workflows/rspec.yml
CHANGED
@@ -7,9 +7,10 @@ jobs:
|
|
7
7
|
runs-on: ubuntu-latest
|
8
8
|
strategy:
|
9
9
|
matrix:
|
10
|
-
# Maintained versions: 2.7
|
11
|
-
# Security updates only: 2.6
|
12
|
-
|
10
|
+
# Maintained versions: 2.7, 3.0, 3.1
|
11
|
+
# Security updates only: 2.6 (EOL: 2022-03-31)
|
12
|
+
# Source: https://www.ruby-lang.org/en/downloads/branches/
|
13
|
+
ruby: [ 2.6, 2.7, 3.0, 3.1 ]
|
13
14
|
steps:
|
14
15
|
- uses: actions/checkout@v2
|
15
16
|
- name: Set up Ruby
|
data/.rubocop.yml
CHANGED
@@ -36,3 +36,18 @@ RSpec/ExampleLength:
|
|
36
36
|
# For strings I enjoy using %w[], but for symbols the %i[] syntax just does not click.
|
37
37
|
Style/SymbolArray:
|
38
38
|
EnforcedStyle: brackets
|
39
|
+
|
40
|
+
# Indentation is something I've got strong opinions about which differ from Rubocop.
|
41
|
+
Layout/ArgumentAlignment:
|
42
|
+
EnforcedStyle: with_fixed_indentation # default is with_first_argument
|
43
|
+
Layout/ArrayAlignment:
|
44
|
+
EnforcedStyle: with_fixed_indentation # default is with_first_element
|
45
|
+
Layout/FirstArgumentIndentation:
|
46
|
+
EnforcedStyle: consistent # default is special_for_inner_method_call_in_parentheses
|
47
|
+
Layout/FirstArrayElementIndentation:
|
48
|
+
EnforcedStyle: consistent # default is special_inside_parentheses
|
49
|
+
Layout/FirstHashElementIndentation:
|
50
|
+
EnforcedStyle: consistent # default is special_inside_parentheses
|
51
|
+
# Let's enforce this to be consistent
|
52
|
+
Layout/EndOfLine:
|
53
|
+
EnforcedStyle: lf # \n (unix line end) enforced everywhere, default is native
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
1
|
## Unreleased changes
|
2
2
|
|
3
|
+
## Version 0.5.0 -- 2022-02-16
|
4
|
+
|
5
|
+
This release has breaking changes (deprecation cleanup and internals rework), a new feature (smart weights!) and is officially tested on Ruby 3.1.0 (it's what I use). I've enabled the setting to require MFA to publish this gem, to help protect those who use it.
|
6
|
+
|
7
|
+
- Breaking: remove `String#tokenize` core extension; please use `Groupie.tokenize(string)` instead [#39](https://github.com/Narnach/groupie/pull/39)
|
8
|
+
- Breaking: due to changed internals, YAML serialized data from 0.4.x will lack some of the new internal caches. I'd suggest loading the old data and adding the words from each group to a new (0.5.x) instance of Groupie. [#40](https://github.com/Narnach/groupie/pull/40)
|
9
|
+
- Feat: add support for smart default weights, reducing the effect of low data on predictions [#40](https://github.com/Narnach/groupie/pull/40)
|
10
|
+
- Deps: add Ruby 3.1 to list of tested & supported Ruby versions
|
11
|
+
- Chore: require multi-factor authentication to publish gem updates
|
12
|
+
- Chore: add SECURITY.md to advertise a security policy
|
13
|
+
- Style: addressed Lint/AmbiguousOperatorPrecedence
|
14
|
+
- Dev: bump development dependencies multiple times
|
15
|
+
- Dev: switch to DepFu to manage development dependencies
|
16
|
+
|
3
17
|
## Version 0.4.1 -- 2021-09-08
|
4
18
|
|
5
19
|
Non-functional fixes to the CI config and Rubygems.org metadata.
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,61 +1,65 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
groupie (0.
|
4
|
+
groupie (0.5.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
|
-
diff-lcs (1.
|
11
|
-
parallel (1.
|
12
|
-
parser (3.0.
|
10
|
+
diff-lcs (1.5.0)
|
11
|
+
parallel (1.21.0)
|
12
|
+
parser (3.1.0.0)
|
13
13
|
ast (~> 2.4.1)
|
14
|
-
|
14
|
+
psych (4.0.3)
|
15
|
+
stringio
|
16
|
+
rainbow (3.1.1)
|
15
17
|
rake (13.0.6)
|
16
|
-
regexp_parser (2.
|
18
|
+
regexp_parser (2.2.1)
|
17
19
|
rexml (3.2.5)
|
18
|
-
rspec (3.
|
19
|
-
rspec-core (~> 3.
|
20
|
-
rspec-expectations (~> 3.
|
21
|
-
rspec-mocks (~> 3.
|
22
|
-
rspec-core (3.
|
23
|
-
rspec-support (~> 3.
|
24
|
-
rspec-expectations (3.
|
20
|
+
rspec (3.11.0)
|
21
|
+
rspec-core (~> 3.11.0)
|
22
|
+
rspec-expectations (~> 3.11.0)
|
23
|
+
rspec-mocks (~> 3.11.0)
|
24
|
+
rspec-core (3.11.0)
|
25
|
+
rspec-support (~> 3.11.0)
|
26
|
+
rspec-expectations (3.11.0)
|
25
27
|
diff-lcs (>= 1.2.0, < 2.0)
|
26
|
-
rspec-support (~> 3.
|
27
|
-
rspec-mocks (3.
|
28
|
+
rspec-support (~> 3.11.0)
|
29
|
+
rspec-mocks (3.11.0)
|
28
30
|
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
-
rspec-support (~> 3.
|
30
|
-
rspec-support (3.
|
31
|
-
rubocop (1.
|
31
|
+
rspec-support (~> 3.11.0)
|
32
|
+
rspec-support (3.11.0)
|
33
|
+
rubocop (1.25.1)
|
32
34
|
parallel (~> 1.10)
|
33
|
-
parser (>= 3.
|
35
|
+
parser (>= 3.1.0.0)
|
34
36
|
rainbow (>= 2.2.2, < 4.0)
|
35
37
|
regexp_parser (>= 1.8, < 3.0)
|
36
38
|
rexml
|
37
|
-
rubocop-ast (>= 1.
|
39
|
+
rubocop-ast (>= 1.15.1, < 2.0)
|
38
40
|
ruby-progressbar (~> 1.7)
|
39
41
|
unicode-display_width (>= 1.4.0, < 3.0)
|
40
|
-
rubocop-ast (1.
|
42
|
+
rubocop-ast (1.15.2)
|
41
43
|
parser (>= 3.0.1.1)
|
42
|
-
rubocop-performance (1.
|
44
|
+
rubocop-performance (1.13.2)
|
43
45
|
rubocop (>= 1.7.0, < 2.0)
|
44
46
|
rubocop-ast (>= 0.4.0)
|
45
47
|
rubocop-rake (0.6.0)
|
46
48
|
rubocop (~> 1.0)
|
47
|
-
rubocop-rspec (2.
|
48
|
-
rubocop (~> 1.
|
49
|
-
rubocop-ast (>= 1.1.0)
|
49
|
+
rubocop-rspec (2.8.0)
|
50
|
+
rubocop (~> 1.19)
|
50
51
|
ruby-progressbar (1.11.0)
|
51
|
-
|
52
|
+
stringio (3.0.1)
|
53
|
+
unicode-display_width (2.1.0)
|
52
54
|
|
53
55
|
PLATFORMS
|
54
56
|
x86_64-darwin-20
|
57
|
+
x86_64-darwin-21
|
55
58
|
x86_64-linux
|
56
59
|
|
57
60
|
DEPENDENCIES
|
58
61
|
groupie!
|
62
|
+
psych (~> 4.0)
|
59
63
|
rake (~> 13.0)
|
60
64
|
rspec (~> 3.0)
|
61
65
|
rubocop (~> 1.7)
|
@@ -64,4 +68,4 @@ DEPENDENCIES
|
|
64
68
|
rubocop-rspec (~> 2.4)
|
65
69
|
|
66
70
|
BUNDLED WITH
|
67
|
-
2.
|
71
|
+
2.3.4
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# Groupie
|
2
2
|
|
3
|
+
[![Depfu](https://badges.depfu.com/badges/367956233b3b31a6fc19db4515263b9e/overview.svg)](https://depfu.com/github/Narnach/groupie?project_id=34004)
|
4
|
+
|
3
5
|
Groupie is a simple way to group texts and classify new texts as being a likely member of one of the defined groups. Think of bayesian spam filters.
|
4
6
|
|
5
7
|
The eventual goal is to have Groupie work as a sort of bayesian spam filter, where you feed it spam and ham (non-spam) and ask it to classify new texts as spam or ham. Applications for this are e-mail spam filtering and blog spam filtering. Other sorts of categorizing might be interesting as well, such as finding suitable tags for a blog post or bookmark.
|
@@ -90,6 +92,21 @@ groupie.classify_text(test_tokens, :unique)
|
|
90
92
|
test_tokens - (test_tokens & groupie.unique_words)
|
91
93
|
# => ["please", "to", "reset", "awesome"]
|
92
94
|
# If you'd be classifying email, you can assume that common email headers will get ignored this way.
|
95
|
+
|
96
|
+
# If you're just starting out, your incomplete data could lead to dramatic misrepresentations of the data.
|
97
|
+
# To balance against this, you can enable smart weight:
|
98
|
+
groupie.smart_weight = true
|
99
|
+
# You could also set it during initialization via Groupie.new(smart_weight: true)
|
100
|
+
# What's so useful about it? It adds a default weight to _all_ words, even the ones you haven't
|
101
|
+
# seen yet, which counter-acts the data you have. This shines in low data situations,
|
102
|
+
# reducing the impact of the few words you have seen before.
|
103
|
+
groupie.default_weight
|
104
|
+
# => 1.2285714285714286
|
105
|
+
# Classifying the same text as before should consider all words, and add this default weight to all words
|
106
|
+
# It basically gives all groups the likelihood of "claiming" a word,
|
107
|
+
# unless there is strong data to suggest otherwise.
|
108
|
+
groupie.classify_text(test_tokens)
|
109
|
+
# => {:spam=>0.5241046831955923, :ham=>0.4758953168044077}
|
93
110
|
```
|
94
111
|
|
95
112
|
Persistence can be naively done by using YAML:
|
@@ -110,7 +127,9 @@ For I'm still experimenting with Groupie in [Infinity Feed](https://www.infinity
|
|
110
127
|
|
111
128
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. Rubocop is available via `bin/rubocop` with some friendly default settings.
|
112
129
|
|
113
|
-
To install this gem onto your local machine, run `bundle exec rake install`.
|
130
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
131
|
+
|
132
|
+
To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org). For obvious reasons, only the project maintainer can do this.
|
114
133
|
|
115
134
|
## Contributing
|
116
135
|
|
data/SECURITY.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Security Policy
|
2
|
+
|
3
|
+
Groupie is inherently not a user-facing library, so possible vectors for exploitation seem small to me.
|
4
|
+
That said, in the event of a security vulnerability being found, this document describes how to report it.
|
5
|
+
|
6
|
+
## Supported Versions
|
7
|
+
|
8
|
+
As a small library with infrequent updates, I will accept bug and security reports for the current minor version.
|
9
|
+
Severe issues might be backported to previous minor versions. I'll handle this on a case-by-case basis.
|
10
|
+
|
11
|
+
## Reporting a Vulnerability
|
12
|
+
|
13
|
+
For low-risk things you can create an issue or PR.
|
14
|
+
In case of a high risk thing, you can email me at security@narnach.com.
|
15
|
+
|
16
|
+
## Thanks
|
17
|
+
|
18
|
+
Once we have successfully handled a security vulnerability, we'll add an attribution to the list below.
|
data/groupie.gemspec
CHANGED
@@ -18,6 +18,8 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.metadata['homepage_uri'] = spec.homepage
|
19
19
|
spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
|
20
20
|
spec.metadata['changelog_uri'] = 'https://github.com/Narnach/groupie/blob/stable/CHANGELOG.md'
|
21
|
+
# Require multi-factor authentication to publish the gem
|
22
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
21
23
|
|
22
24
|
# Specify which files should be added to the gem when it is released.
|
23
25
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
data/lib/groupie/group.rb
CHANGED
@@ -3,11 +3,13 @@
|
|
3
3
|
class Groupie
|
4
4
|
# Group represents a group or category that words can be classified into.
|
5
5
|
class Group
|
6
|
-
attr_reader :word_counts
|
6
|
+
attr_reader :word_counts, :total_word_count
|
7
7
|
|
8
|
-
def initialize(name)
|
8
|
+
def initialize(name, groupie)
|
9
9
|
@name = name
|
10
|
+
@groupie = groupie
|
10
11
|
@word_counts = {}
|
12
|
+
@total_word_count = 0
|
11
13
|
end
|
12
14
|
|
13
15
|
def words
|
@@ -34,7 +36,11 @@ class Groupie
|
|
34
36
|
# Add a single word and count it.
|
35
37
|
def add_word(word)
|
36
38
|
@word_counts[word] ||= 0
|
37
|
-
@word_counts[word] += 1
|
39
|
+
current_count = @word_counts[word] += 1
|
40
|
+
@total_word_count += 1
|
41
|
+
# If this word is new for this Group, it might be new for the entire Groupie
|
42
|
+
@groupie.add_word(word) if current_count == 1
|
43
|
+
nil
|
38
44
|
end
|
39
45
|
end
|
40
46
|
end
|
data/lib/groupie/version.rb
CHANGED
data/lib/groupie.rb
CHANGED
@@ -2,15 +2,20 @@
|
|
2
2
|
|
3
3
|
require_relative 'groupie/version'
|
4
4
|
require_relative 'groupie/group'
|
5
|
-
|
5
|
+
require 'set'
|
6
6
|
|
7
7
|
# Groupie is a text grouper and classifier, using naive Bayesian filtering.
|
8
8
|
class Groupie
|
9
9
|
# Wrap all errors we raise in this so our own errors are recognizable.
|
10
10
|
class Error < StandardError; end
|
11
11
|
|
12
|
-
|
12
|
+
attr_accessor :smart_weight
|
13
|
+
|
14
|
+
# @param [true, false] smart_weight (false) Whether smart weight is enabled or not.
|
15
|
+
def initialize(smart_weight: false)
|
13
16
|
@groups = {}
|
17
|
+
@smart_weight = smart_weight
|
18
|
+
@known_words = Set.new
|
14
19
|
end
|
15
20
|
|
16
21
|
# Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
|
@@ -33,7 +38,7 @@ class Groupie
|
|
33
38
|
# @param [Object] group The name of the group to access.
|
34
39
|
# @return [Groupie::Group] An existing or new group identified by +group+.
|
35
40
|
def [](group)
|
36
|
-
@groups[group] ||= Group.new(group)
|
41
|
+
@groups[group] ||= Group.new(group, self)
|
37
42
|
end
|
38
43
|
|
39
44
|
# Classify a text by taking the average of all word classifications.
|
@@ -51,25 +56,27 @@ class Groupie
|
|
51
56
|
end
|
52
57
|
end
|
53
58
|
|
54
|
-
# Classify a single word against all groups.
|
59
|
+
# Classify a single word against all groups, returning the probability distribution.
|
55
60
|
#
|
56
61
|
# @param [String] entry A word to be classified
|
57
|
-
# @param [Symbol] strategy
|
58
|
-
# @return [Hash<Object, Float>] Hash with <group,
|
62
|
+
# @param [Symbol] strategy (:sum) the strategy to use on the score
|
63
|
+
# @return [Hash<Object, Float>] Hash with <group, probability> pairings.
|
64
|
+
# Probabilities are always in 0.0..1.0, and add up to 1.0 (i.e. it's a probability distribution)
|
59
65
|
# @raise [Groupie::Error] Raise when an invalid strategy is provided
|
60
66
|
def classify(entry, strategy = :sum)
|
61
|
-
|
62
|
-
|
63
|
-
|
67
|
+
# Calculate default weight once outside of the loop
|
68
|
+
default_weight = self.default_weight
|
69
|
+
# Each group calculates the count, then reduces it to a score: <group name, score>
|
70
|
+
per_group_score = @groups.transform_values do |group|
|
71
|
+
apply_count_strategy(default_weight + group.count(entry), strategy)
|
64
72
|
end
|
65
|
-
|
73
|
+
# When we have no scores, we have no results, so abort early
|
74
|
+
# Note that when smart_weight is enabled we always have a score.
|
75
|
+
total_score = per_group_score.values.sum
|
76
|
+
return {} if total_score.zero?
|
66
77
|
|
67
|
-
|
68
|
-
|
69
|
-
results[name] = count.positive? ? count.to_f / total_count : 0.0
|
70
|
-
end
|
71
|
-
|
72
|
-
results
|
78
|
+
# Final results must be within 0.0..1.0, so divide each score by the total score
|
79
|
+
per_group_score.transform_values { |group_score| group_score.to_f / total_score }
|
73
80
|
end
|
74
81
|
|
75
82
|
# Return a word score dictionary that excludes the 4th quartile most popular words.
|
@@ -85,13 +92,39 @@ class Groupie
|
|
85
92
|
total.merge!(group.word_counts) { |_key, o, n| o + n }
|
86
93
|
end
|
87
94
|
# Extract the word count that's at the top 75%
|
88
|
-
top_quartile_index = [total_count.size * 3 / 4 - 1, 1].max
|
95
|
+
top_quartile_index = [((total_count.size * 3) / 4) - 1, 1].max
|
89
96
|
top_quartile_frequency = total_count.values.sort[top_quartile_index]
|
90
97
|
# Throw out all words which have a count that's above this frequency
|
91
98
|
total_count.reject! { |_word, count| count > top_quartile_frequency }
|
92
99
|
total_count.keys
|
93
100
|
end
|
94
101
|
|
102
|
+
# Default weight is used when +smart_weight+ is enabled.
|
103
|
+
# Each word's count is increased by the +default_weight+ value,
|
104
|
+
# which is the average frequency of each unique word we know about.
|
105
|
+
#
|
106
|
+
# Example: if we have indexed 1000 total words, of which 500 were unique,
|
107
|
+
# the default_weight would be 1000/500=2.0
|
108
|
+
#
|
109
|
+
# @return [Float] The default weight for all words
|
110
|
+
def default_weight
|
111
|
+
# Default weight only applies when smart weight is enabled
|
112
|
+
return 0.0 unless smart_weight
|
113
|
+
|
114
|
+
# If we don't know any words, the weight is also zero
|
115
|
+
return 0.0 unless @known_words.any?
|
116
|
+
|
117
|
+
# Gather counts and calculate
|
118
|
+
total_words = @groups.each_value.sum(&:total_word_count)
|
119
|
+
total_unique_words = @known_words.count
|
120
|
+
total_words / total_unique_words.to_f
|
121
|
+
end
|
122
|
+
|
123
|
+
# Internal method (public so Groups can call it) used by Groups to register known words with the Groupie.
|
124
|
+
def add_word(word)
|
125
|
+
@known_words << word
|
126
|
+
end
|
127
|
+
|
95
128
|
private
|
96
129
|
|
97
130
|
# Calculate grouped scores
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: groupie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Wes Oldenbeuving
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Groupie is a simple way to group texts and classify new texts as being
|
14
14
|
a likely member of one of the defined groups. Think of bayesian spam filters.
|
@@ -18,6 +18,8 @@ executables: []
|
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
|
+
- ".github/ISSUE_TEMPLATE/bug_report.md"
|
22
|
+
- ".github/ISSUE_TEMPLATE/feature_request.md"
|
21
23
|
- ".github/dependabot.yml"
|
22
24
|
- ".github/workflows/gem.yml"
|
23
25
|
- ".github/workflows/rspec.yml"
|
@@ -31,12 +33,12 @@ files:
|
|
31
33
|
- LICENSE.txt
|
32
34
|
- README.md
|
33
35
|
- Rakefile
|
36
|
+
- SECURITY.md
|
34
37
|
- bin/console
|
35
38
|
- bin/rubocop
|
36
39
|
- bin/setup
|
37
40
|
- groupie.gemspec
|
38
41
|
- lib/groupie.rb
|
39
|
-
- lib/groupie/core_ext/string.rb
|
40
42
|
- lib/groupie/group.rb
|
41
43
|
- lib/groupie/version.rb
|
42
44
|
homepage: https://github.com/Narnach/groupie
|
@@ -46,6 +48,7 @@ metadata:
|
|
46
48
|
homepage_uri: https://github.com/Narnach/groupie
|
47
49
|
source_code_uri: https://github.com/Narnach/groupie
|
48
50
|
changelog_uri: https://github.com/Narnach/groupie/blob/stable/CHANGELOG.md
|
51
|
+
rubygems_mfa_required: 'true'
|
49
52
|
post_install_message:
|
50
53
|
rdoc_options: []
|
51
54
|
require_paths:
|
@@ -61,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
61
64
|
- !ruby/object:Gem::Version
|
62
65
|
version: '0'
|
63
66
|
requirements: []
|
64
|
-
rubygems_version: 3.
|
67
|
+
rubygems_version: 3.3.3
|
65
68
|
signing_key:
|
66
69
|
specification_version: 4
|
67
70
|
summary: Library to help you group texts and classify new ones
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class Groupie
|
4
|
-
module CoreExt
|
5
|
-
# This module monkey patches String to respond to #tokenize
|
6
|
-
module String
|
7
|
-
def tokenize
|
8
|
-
warn "Please use Groupie.tokenize instead of String#tokenize (from #{caller(1..1).first})"
|
9
|
-
Groupie.tokenize(self)
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
class String
|
16
|
-
include Groupie::CoreExt::String
|
17
|
-
end
|