groupie 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/gem.yml +2 -2
- data/.github/workflows/rspec.yml +5 -5
- data/.github/workflows/rubocop.yml +4 -3
- data/.rubocop.yml +4 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +23 -15
- data/bin/rubocop +14 -1
- data/groupie.gemspec +3 -1
- data/lib/groupie/tokenizer.rb +85 -0
- data/lib/groupie/version.rb +1 -1
- data/lib/groupie.rb +3 -8
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0afe90008ac3d4c4defc95c8317349195f6e13a9190f0a41ec47aed1bdd31f2b
|
4
|
+
data.tar.gz: ad64128eaf725f8a8f22c86456fc3aec97a331e975578da8fa58082f983ed839
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12ef28e989b25de4d85e6eb4f5684907b1934b9ca83aa4efdafecdeffe2e27a9c8f33ff3ebd5a4a3c2efc6a0eeaa4d5d33480f481d5608d3fa586bba415377a5
|
7
|
+
data.tar.gz: bf563abf04440751e5c81a7b738313a19f1f7e8feb32fe9263f86635e5fa57c277e29ffbfa0a8bd195f1a3dfc74b1d4aab0460fa04746643162f9451b33fa275
|
data/.github/workflows/gem.yml
CHANGED
@@ -6,11 +6,11 @@ jobs:
|
|
6
6
|
build:
|
7
7
|
runs-on: ubuntu-latest
|
8
8
|
steps:
|
9
|
-
- uses: actions/checkout@
|
9
|
+
- uses: actions/checkout@v3
|
10
10
|
- name: Set up Ruby
|
11
11
|
uses: ruby/setup-ruby@v1
|
12
12
|
with:
|
13
|
-
ruby-version:
|
13
|
+
ruby-version: 2.7
|
14
14
|
bundler-cache: true
|
15
15
|
- name: Build the gem
|
16
16
|
run: bundle exec rake build
|
data/.github/workflows/rspec.yml
CHANGED
@@ -3,16 +3,16 @@ name: RSpec
|
|
3
3
|
on: [ push, pull_request ]
|
4
4
|
|
5
5
|
jobs:
|
6
|
-
|
6
|
+
rspec:
|
7
7
|
runs-on: ubuntu-latest
|
8
8
|
strategy:
|
9
9
|
matrix:
|
10
|
-
# Maintained versions:
|
11
|
-
# Security updates only: 2.
|
10
|
+
# Maintained versions: 3.0, 3.1
|
11
|
+
# Security updates only: 2.7 (expected EOL: 2023-03-31)
|
12
12
|
# Source: https://www.ruby-lang.org/en/downloads/branches/
|
13
|
-
ruby: [ 2.
|
13
|
+
ruby: [ 2.7, 3.0, 3.1 ]
|
14
14
|
steps:
|
15
|
-
- uses: actions/checkout@
|
15
|
+
- uses: actions/checkout@v3
|
16
16
|
- name: Set up Ruby
|
17
17
|
uses: ruby/setup-ruby@v1
|
18
18
|
with:
|
data/.github/workflows/rubocop.yml
CHANGED
@@ -6,11 +6,11 @@ jobs:
|
|
6
6
|
runs-on: ubuntu-latest
|
7
7
|
steps:
|
8
8
|
- name: Check out code
|
9
|
-
uses: actions/checkout@
|
9
|
+
uses: actions/checkout@v3
|
10
10
|
- name: Install Ruby & Gems
|
11
|
-
uses: ruby/setup-ruby@v1
|
11
|
+
uses: ruby/setup-ruby@v1
|
12
12
|
with:
|
13
|
-
ruby-version:
|
13
|
+
ruby-version: 2.7
|
14
14
|
bundler-cache: true
|
15
15
|
- name: Rubocop
|
16
16
|
# https://github.com/reviewdog/action-rubocop
|
@@ -21,6 +21,7 @@ jobs:
|
|
21
21
|
github_token: ${{ secrets.github_token }}
|
22
22
|
reporter: github-pr-review
|
23
23
|
rubocop_version: gemfile
|
24
|
+
rubocop_flags: --display-only-fail-level-offenses --safe
|
24
25
|
# Rely on Bundler-installed gems so don't install them again
|
25
26
|
use_bundler: true
|
26
27
|
skip_install: true
|
data/.rubocop.yml
CHANGED
@@ -18,6 +18,7 @@ Lint/Void:
|
|
18
18
|
|
19
19
|
# RSpec has a lot of blocks, so ignore this rule there
|
20
20
|
Metrics/BlockLength:
|
21
|
+
Severity: info
|
21
22
|
Exclude:
|
22
23
|
- 'spec/**/*_spec.rb'
|
23
24
|
|
@@ -27,10 +28,12 @@ RSpec/DescribedClass:
|
|
27
28
|
|
28
29
|
# I prefer groups for structure, so the defaults are a little too strict for me
|
29
30
|
RSpec/NestedGroups:
|
31
|
+
Severity: info
|
30
32
|
Max: 4
|
31
33
|
|
32
34
|
# I prefer more verbose examples, so tend to use more lines than the defaults
|
33
35
|
RSpec/ExampleLength:
|
36
|
+
Severity: info
|
34
37
|
Max: 20
|
35
38
|
|
36
39
|
# For strings I enjoy using %w[], but for symbols the %i[] syntax just does not click.
|
@@ -50,4 +53,4 @@ Layout/FirstHashElementIndentation:
|
|
50
53
|
EnforcedStyle: consistent # default is special_inside_parentheses
|
51
54
|
# Let's enforce this to be consistent
|
52
55
|
Layout/EndOfLine:
|
53
|
-
EnforcedStyle: lf # \n (unix line end) enforced everywhere, default is native
|
56
|
+
EnforcedStyle: lf # \n (unix line end) enforced everywhere, default is native
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
## Unreleased changes
|
2
2
|
|
3
|
+
## Version 0.6.0 -- 2022-05-19
|
4
|
+
|
5
|
+
This release bumps the minimum Ruby version, so it's technically breaking if you're using a currently unsupported Ruby version. It also improves the tokenization of URLs by using their structure and common token boundaries to easily extract tokens.
|
6
|
+
|
7
|
+
- Breaking: drop support for Ruby 2.6, minimum is 2.7 ([#58](https://github.com/Narnach/groupie/pull/58))
|
8
|
+
- Feat: add better tokenization support for URIs ([#42](https://github.com/Narnach/groupie/pull/42), [#44](https://github.com/Narnach/groupie/pull/44))
|
9
|
+
- Dev: Rubocop ignores unsafe cops, hides info severity ([#59](https://github.com/Narnach/groupie/pull/59))
|
10
|
+
- Dev: enforce 100% test coverage ([#60](https://github.com/Narnach/groupie/pull/60))
|
11
|
+
|
3
12
|
## Version 0.5.0 -- 2022-02-16
|
4
13
|
|
5
14
|
This release has breaking changes (deprecation cleanup and internals rework), a new feature (smart weights!) and is officially tested on Ruby 3.1.0 (it's what I use). I've enabled the setting to require MFA to publish this gem, to help protect those who use it.
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,21 +1,22 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
groupie (0.5.0)
|
4
|
+
groupie (0.6.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
10
|
diff-lcs (1.5.0)
|
11
|
-
|
12
|
-
|
11
|
+
docile (1.4.0)
|
12
|
+
parallel (1.22.1)
|
13
|
+
parser (3.1.2.0)
|
13
14
|
ast (~> 2.4.1)
|
14
|
-
psych (4.0.
|
15
|
+
psych (4.0.4)
|
15
16
|
stringio
|
16
17
|
rainbow (3.1.1)
|
17
18
|
rake (13.0.6)
|
18
|
-
regexp_parser (2.
|
19
|
+
regexp_parser (2.4.0)
|
19
20
|
rexml (3.2.5)
|
20
21
|
rspec (3.11.0)
|
21
22
|
rspec-core (~> 3.11.0)
|
@@ -26,30 +27,36 @@ GEM
|
|
26
27
|
rspec-expectations (3.11.0)
|
27
28
|
diff-lcs (>= 1.2.0, < 2.0)
|
28
29
|
rspec-support (~> 3.11.0)
|
29
|
-
rspec-mocks (3.11.
|
30
|
+
rspec-mocks (3.11.1)
|
30
31
|
diff-lcs (>= 1.2.0, < 2.0)
|
31
32
|
rspec-support (~> 3.11.0)
|
32
33
|
rspec-support (3.11.0)
|
33
|
-
rubocop (1.
|
34
|
+
rubocop (1.29.1)
|
34
35
|
parallel (~> 1.10)
|
35
36
|
parser (>= 3.1.0.0)
|
36
37
|
rainbow (>= 2.2.2, < 4.0)
|
37
38
|
regexp_parser (>= 1.8, < 3.0)
|
38
|
-
rexml
|
39
|
-
rubocop-ast (>= 1.
|
39
|
+
rexml (>= 3.2.5, < 4.0)
|
40
|
+
rubocop-ast (>= 1.17.0, < 2.0)
|
40
41
|
ruby-progressbar (~> 1.7)
|
41
42
|
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
-
rubocop-ast (1.
|
43
|
-
parser (>= 3.
|
44
|
-
rubocop-performance (1.13.
|
43
|
+
rubocop-ast (1.18.0)
|
44
|
+
parser (>= 3.1.1.0)
|
45
|
+
rubocop-performance (1.13.3)
|
45
46
|
rubocop (>= 1.7.0, < 2.0)
|
46
47
|
rubocop-ast (>= 0.4.0)
|
47
48
|
rubocop-rake (0.6.0)
|
48
49
|
rubocop (~> 1.0)
|
49
|
-
rubocop-rspec (2.
|
50
|
+
rubocop-rspec (2.10.0)
|
50
51
|
rubocop (~> 1.19)
|
51
52
|
ruby-progressbar (1.11.0)
|
52
|
-
|
53
|
+
simplecov (0.21.2)
|
54
|
+
docile (~> 1.1)
|
55
|
+
simplecov-html (~> 0.11)
|
56
|
+
simplecov_json_formatter (~> 0.1)
|
57
|
+
simplecov-html (0.12.3)
|
58
|
+
simplecov_json_formatter (0.1.4)
|
59
|
+
stringio (3.0.2)
|
53
60
|
unicode-display_width (2.1.0)
|
54
61
|
|
55
62
|
PLATFORMS
|
@@ -66,6 +73,7 @@ DEPENDENCIES
|
|
66
73
|
rubocop-performance (~> 1.11)
|
67
74
|
rubocop-rake (~> 0.6.0)
|
68
75
|
rubocop-rspec (~> 2.4)
|
76
|
+
simplecov (~> 0.21.2)
|
69
77
|
|
70
78
|
BUNDLED WITH
|
71
|
-
2.3.
|
79
|
+
2.3.14
|
data/bin/rubocop
CHANGED
@@ -1,2 +1,15 @@
|
|
1
1
|
#!/bin/bash
|
2
|
-
|
2
|
+
# Run Rubocop with sane defaults
|
3
|
+
# - only recognized file types: allows you to pipe in anything and have Rubocop only check files it thinks it can handle
|
4
|
+
# - force exclusion: even when piping in files, it still ignores what I should not touch
|
5
|
+
# - display style guide: help figure out what a rule is supposed to do
|
6
|
+
# - safe: don't run experimental cops
|
7
|
+
# - display only fail level offenses: hides INFO severity failing cops
|
8
|
+
|
9
|
+
bundle exec rubocop \
|
10
|
+
--only-recognized-file-types \
|
11
|
+
--force-exclusion \
|
12
|
+
--display-style-guide \
|
13
|
+
--safe \
|
14
|
+
--display-only-fail-level-offenses \
|
15
|
+
$*
|
data/groupie.gemspec
CHANGED
@@ -13,7 +13,9 @@ Gem::Specification.new do |spec|
|
|
13
13
|
' of one of the defined groups. Think of bayesian spam filters.'
|
14
14
|
spec.homepage = 'https://github.com/Narnach/groupie'
|
15
15
|
spec.license = 'MIT'
|
16
|
-
|
16
|
+
# Ruby maintains support for the last 3-4 minor versions, so that's what we do as well.
|
17
|
+
# See: https://www.ruby-lang.org/en/downloads/branches/
|
18
|
+
spec.required_ruby_version = '>= 2.7.0'
|
17
19
|
|
18
20
|
spec.metadata['homepage_uri'] = spec.homepage
|
19
21
|
spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
class Groupie
|
6
|
+
# Tokenizer helps turn a String into an Array of Strings that are the
|
7
|
+
# individual tokens (mostly words) from the input.
|
8
|
+
#
|
9
|
+
# Please consider this entire class to be a private API,
|
10
|
+
# and use Groupie.tokenize to tokenize things.
|
11
|
+
class Tokenizer
|
12
|
+
def initialize(input)
|
13
|
+
# Ensure our input is converted to a String and duplicated so we can modify it in-place
|
14
|
+
@raw = input.to_s.dup
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_tokens
|
18
|
+
return @tokens if @tokens
|
19
|
+
|
20
|
+
# In-place modifications to our @raw String
|
21
|
+
downcase!
|
22
|
+
normalize_whitespace!
|
23
|
+
strip_html_tags!
|
24
|
+
tokenize_urls!
|
25
|
+
strip_non_word_characters!
|
26
|
+
|
27
|
+
# Split the resulting string on whitespace and clean up the token candidates
|
28
|
+
@tokens = @raw.split.map { |str| remove_interpunction!(str) }
|
29
|
+
|
30
|
+
@tokens
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
# Ignore case by downcasing everything
|
36
|
+
def downcase!
|
37
|
+
@raw.downcase!
|
38
|
+
end
|
39
|
+
|
40
|
+
# Convert all types of whitespace (space, tab, newline) into regular spaces
|
41
|
+
def normalize_whitespace!
|
42
|
+
@raw.gsub!(/\s+/, ' ')
|
43
|
+
end
|
44
|
+
|
45
|
+
# Strip HTML tags entirely
|
46
|
+
def strip_html_tags!
|
47
|
+
@raw.gsub!(/<[^>]+?>/, ' ')
|
48
|
+
end
|
49
|
+
|
50
|
+
# Intelligently split URLs into their component parts
|
51
|
+
def tokenize_urls!
|
52
|
+
@raw.gsub!(%r{http[\w\-\#:/_.?&=]+}) do |url|
|
53
|
+
maybe_parse_url(url) do |uri|
|
54
|
+
path = uri.path.tap { |str| str.tr!('/_\-', ' ') }
|
55
|
+
query = uri.query.tap { |str| str&.tr!('?=&#_\-', ' ') }
|
56
|
+
fragment = uri.fragment.tap { |str| str&.tr!('#_/\-', ' ') }
|
57
|
+
|
58
|
+
"#{uri.scheme} #{uri.host} #{path} #{query} #{fragment}"
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
# Strip characters not likely to be part of a word or number
|
64
|
+
def strip_non_word_characters!
|
65
|
+
@raw.gsub!(/[^\w\ \-.,]/, ' ')
|
66
|
+
end
|
67
|
+
|
68
|
+
# Remove wrapping quotes and interpunction from individual token candidates
|
69
|
+
def remove_interpunction!(str)
|
70
|
+
str.gsub!(/\A['"]+|[!,."']+\Z/, '')
|
71
|
+
str
|
72
|
+
end
|
73
|
+
|
74
|
+
# Sometimes a String looks like a URL, but it's not.
|
75
|
+
# This method attempts to parse the input string into a URI.
|
76
|
+
# If it's successful, yield it to the block and return its response.
|
77
|
+
# In case of failure, return the original string.
|
78
|
+
def maybe_parse_url(input)
|
79
|
+
uri = URI.parse(input)
|
80
|
+
yield uri
|
81
|
+
rescue URI::InvalidURIError
|
82
|
+
input
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
data/lib/groupie/version.rb
CHANGED
data/lib/groupie.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative 'groupie/version'
|
4
4
|
require_relative 'groupie/group'
|
5
|
+
require_relative 'groupie/tokenizer'
|
5
6
|
require 'set'
|
6
7
|
|
7
8
|
# Groupie is a text grouper and classifier, using naive Bayesian filtering.
|
@@ -24,13 +25,7 @@ class Groupie
|
|
24
25
|
# @param [String, #to_s] object
|
25
26
|
# @return [Array<String>]
|
26
27
|
def self.tokenize(object)
|
27
|
-
object
|
28
|
-
.to_s
|
29
|
-
.downcase
|
30
|
-
.gsub(/\s/, ' ')
|
31
|
-
.gsub(/[$']/, '')
|
32
|
-
.gsub(/<[^>]+?>|[^\w -.,]/, '')
|
33
|
-
.split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
|
28
|
+
Tokenizer.new(object).to_tokens
|
34
29
|
end
|
35
30
|
|
36
31
|
# Access an existing Group or create a new one.
|
@@ -52,7 +47,7 @@ class Groupie
|
|
52
47
|
group_score_sums, hits = calculate_group_scores(words, strategy)
|
53
48
|
|
54
49
|
group_score_sums.each.with_object({}) do |(group, sum), averages|
|
55
|
-
averages[group] =
|
50
|
+
averages[group] = sum / hits
|
56
51
|
end
|
57
52
|
end
|
58
53
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: groupie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Wes Oldenbeuving
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-05-19 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Groupie is a simple way to group texts and classify new texts as being
|
14
14
|
a likely member of one of the defined groups. Think of bayesian spam filters.
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- groupie.gemspec
|
41
41
|
- lib/groupie.rb
|
42
42
|
- lib/groupie/group.rb
|
43
|
+
- lib/groupie/tokenizer.rb
|
43
44
|
- lib/groupie/version.rb
|
44
45
|
homepage: https://github.com/Narnach/groupie
|
45
46
|
licenses:
|
@@ -57,14 +58,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
57
58
|
requirements:
|
58
59
|
- - ">="
|
59
60
|
- !ruby/object:Gem::Version
|
60
|
-
version: 2.
|
61
|
+
version: 2.7.0
|
61
62
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
63
|
requirements:
|
63
64
|
- - ">="
|
64
65
|
- !ruby/object:Gem::Version
|
65
66
|
version: '0'
|
66
67
|
requirements: []
|
67
|
-
rubygems_version: 3.3.
|
68
|
+
rubygems_version: 3.3.14
|
68
69
|
signing_key:
|
69
70
|
specification_version: 4
|
70
71
|
summary: Library to help you group texts and classify new ones
|