groupie 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd7e47aef7d4ed19c206e46d5eb716991562ee2c257c5cf8488341f646801f2f
4
- data.tar.gz: 692108e3c8c2b7d4b26a3c7d702133d0a5e01324d0c42cd9c4143b3a4b601b6c
3
+ metadata.gz: 0afe90008ac3d4c4defc95c8317349195f6e13a9190f0a41ec47aed1bdd31f2b
4
+ data.tar.gz: ad64128eaf725f8a8f22c86456fc3aec97a331e975578da8fa58082f983ed839
5
5
  SHA512:
6
- metadata.gz: dba999961a8c6d7ba2999770d9125dc1c4b2c6468452c115a0eeb3256d8d484ef0cba8d975053e5e4e7d907afaf402063fb0b75c6d1c24b5bd3dc7698a251f35
7
- data.tar.gz: cb5d99d029d237a37354b81b97a4d0a03b6224040ddbbbd0cd96cb3536ea0cfe4ecfb55871e35749b133df0178c97937f2b98523dc3504e4620b3a2473403478
6
+ metadata.gz: 12ef28e989b25de4d85e6eb4f5684907b1934b9ca83aa4efdafecdeffe2e27a9c8f33ff3ebd5a4a3c2efc6a0eeaa4d5d33480f481d5608d3fa586bba415377a5
7
+ data.tar.gz: bf563abf04440751e5c81a7b738313a19f1f7e8feb32fe9263f86635e5fa57c277e29ffbfa0a8bd195f1a3dfc74b1d4aab0460fa04746643162f9451b33fa275
@@ -6,11 +6,11 @@ jobs:
6
6
  build:
7
7
  runs-on: ubuntu-latest
8
8
  steps:
9
- - uses: actions/checkout@v2
9
+ - uses: actions/checkout@v3
10
10
  - name: Set up Ruby
11
11
  uses: ruby/setup-ruby@v1
12
12
  with:
13
- ruby-version: 3.0
13
+ ruby-version: 2.7
14
14
  bundler-cache: true
15
15
  - name: Build the gem
16
16
  run: bundle exec rake build
@@ -3,16 +3,16 @@ name: RSpec
3
3
  on: [ push, pull_request ]
4
4
 
5
5
  jobs:
6
- build:
6
+ rspec:
7
7
  runs-on: ubuntu-latest
8
8
  strategy:
9
9
  matrix:
10
- # Maintained versions: 2.7, 3.0, 3.1
11
- # Security updates only: 2.6 (EOL: 2022-03-31)
10
+ # Maintained versions: 3.0, 3.1
11
+ # Security updates only: 2.7 (expected EOL: 2023-03-31)
12
12
  # Source: https://www.ruby-lang.org/en/downloads/branches/
13
- ruby: [ 2.6, 2.7, 3.0, 3.1 ]
13
+ ruby: [ '2.7', '3.0', '3.1' ] # quoted: a bare 3.0 is a YAML float and would be read as 3
14
14
  steps:
15
- - uses: actions/checkout@v2
15
+ - uses: actions/checkout@v3
16
16
  - name: Set up Ruby
17
17
  uses: ruby/setup-ruby@v1
18
18
  with:
@@ -6,11 +6,11 @@ jobs:
6
6
  runs-on: ubuntu-latest
7
7
  steps:
8
8
  - name: Check out code
9
- uses: actions/checkout@v2
9
+ uses: actions/checkout@v3
10
10
  - name: Install Ruby & Gems
11
- uses: ruby/setup-ruby@v1 # Uses .ruby-version as version input
11
+ uses: ruby/setup-ruby@v1
12
12
  with:
13
- ruby-version: 3.0
13
+ ruby-version: 2.7
14
14
  bundler-cache: true
15
15
  - name: Rubocop
16
16
  # https://github.com/reviewdog/action-rubocop
@@ -21,6 +21,7 @@ jobs:
21
21
  github_token: ${{ secrets.github_token }}
22
22
  reporter: github-pr-review
23
23
  rubocop_version: gemfile
24
+ rubocop_flags: --display-only-fail-level-offenses --safe
24
25
  # Rely on Bundler-installed gems so don't install them again
25
26
  use_bundler: true
26
27
  skip_install: true
data/.rubocop.yml CHANGED
@@ -18,6 +18,7 @@ Lint/Void:
18
18
 
19
19
  # RSpec has a lot of blocks, so ignore this rule there
20
20
  Metrics/BlockLength:
21
+ Severity: info
21
22
  Exclude:
22
23
  - 'spec/**/*_spec.rb'
23
24
 
@@ -27,10 +28,12 @@ RSpec/DescribedClass:
27
28
 
28
29
  # I prefer groups for structure, so the defaults are a little too strict for me
29
30
  RSpec/NestedGroups:
31
+ Severity: info
30
32
  Max: 4
31
33
 
32
34
  # I prefer more verbose examples, so tend to use more lines than the defaults
33
35
  RSpec/ExampleLength:
36
+ Severity: info
34
37
  Max: 20
35
38
 
36
39
  # For strings I enjoy using %w[], but for symbols the %i[] syntax just does not click.
@@ -50,4 +53,4 @@ Layout/FirstHashElementIndentation:
50
53
  EnforcedStyle: consistent # default is special_inside_parentheses
51
54
  # Let's enforce this to be consistent
52
55
  Layout/EndOfLine:
53
- EnforcedStyle: lf # \n (unix line end) enforced everywhere, default is native
56
+ EnforcedStyle: lf # \n (unix line end) enforced everywhere, default is native
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  ## Unreleased changes
2
2
 
3
+ ## Version 0.6.0 -- 2022-05-19
4
+
5
+ This release bumps the minimum Ruby version, so it's technically breaking if you're using a currently unsupported Ruby version. It also improves the tokenization of URLs by using their structure and common token boundaries to easily extract tokens.
6
+
7
+ - Breaking: drop support for Ruby 2.6, minimum is 2.7 ([#58](https://github.com/Narnach/groupie/pull/58))
8
+ - Feat: add better tokenization support for URIs ([#42](https://github.com/Narnach/groupie/pull/42), [#44](https://github.com/Narnach/groupie/pull/44))
9
+ - Dev: Rubocop ignores unsafe cops, hides info severity ([#59](https://github.com/Narnach/groupie/pull/59))
10
+ - Dev: enforce 100% test coverage ([#60](https://github.com/Narnach/groupie/pull/60))
11
+
3
12
  ## Version 0.5.0 -- 2022-02-16
4
13
 
5
14
  This release has breaking changes (deprecation cleanup and internals rework), a new feature (smart weights!) and is officially tested on Ruby 3.1.0 (it's what I use). I've enabled the setting to require MFA to publish this gem, to help protect those who use it.
data/Gemfile CHANGED
@@ -12,3 +12,4 @@ gem 'rubocop', '~> 1.7'
12
12
  gem 'rubocop-performance', '~> 1.11'
13
13
  gem 'rubocop-rake', '~> 0.6.0'
14
14
  gem 'rubocop-rspec', '~> 2.4'
15
+ gem 'simplecov', '~> 0.21.2'
data/Gemfile.lock CHANGED
@@ -1,21 +1,22 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- groupie (0.5.0)
4
+ groupie (0.6.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  ast (2.4.2)
10
10
  diff-lcs (1.5.0)
11
- parallel (1.21.0)
12
- parser (3.1.0.0)
11
+ docile (1.4.0)
12
+ parallel (1.22.1)
13
+ parser (3.1.2.0)
13
14
  ast (~> 2.4.1)
14
- psych (4.0.3)
15
+ psych (4.0.4)
15
16
  stringio
16
17
  rainbow (3.1.1)
17
18
  rake (13.0.6)
18
- regexp_parser (2.2.1)
19
+ regexp_parser (2.4.0)
19
20
  rexml (3.2.5)
20
21
  rspec (3.11.0)
21
22
  rspec-core (~> 3.11.0)
@@ -26,30 +27,36 @@ GEM
26
27
  rspec-expectations (3.11.0)
27
28
  diff-lcs (>= 1.2.0, < 2.0)
28
29
  rspec-support (~> 3.11.0)
29
- rspec-mocks (3.11.0)
30
+ rspec-mocks (3.11.1)
30
31
  diff-lcs (>= 1.2.0, < 2.0)
31
32
  rspec-support (~> 3.11.0)
32
33
  rspec-support (3.11.0)
33
- rubocop (1.25.1)
34
+ rubocop (1.29.1)
34
35
  parallel (~> 1.10)
35
36
  parser (>= 3.1.0.0)
36
37
  rainbow (>= 2.2.2, < 4.0)
37
38
  regexp_parser (>= 1.8, < 3.0)
38
- rexml
39
- rubocop-ast (>= 1.15.1, < 2.0)
39
+ rexml (>= 3.2.5, < 4.0)
40
+ rubocop-ast (>= 1.17.0, < 2.0)
40
41
  ruby-progressbar (~> 1.7)
41
42
  unicode-display_width (>= 1.4.0, < 3.0)
42
- rubocop-ast (1.15.2)
43
- parser (>= 3.0.1.1)
44
- rubocop-performance (1.13.2)
43
+ rubocop-ast (1.18.0)
44
+ parser (>= 3.1.1.0)
45
+ rubocop-performance (1.13.3)
45
46
  rubocop (>= 1.7.0, < 2.0)
46
47
  rubocop-ast (>= 0.4.0)
47
48
  rubocop-rake (0.6.0)
48
49
  rubocop (~> 1.0)
49
- rubocop-rspec (2.8.0)
50
+ rubocop-rspec (2.10.0)
50
51
  rubocop (~> 1.19)
51
52
  ruby-progressbar (1.11.0)
52
- stringio (3.0.1)
53
+ simplecov (0.21.2)
54
+ docile (~> 1.1)
55
+ simplecov-html (~> 0.11)
56
+ simplecov_json_formatter (~> 0.1)
57
+ simplecov-html (0.12.3)
58
+ simplecov_json_formatter (0.1.4)
59
+ stringio (3.0.2)
53
60
  unicode-display_width (2.1.0)
54
61
 
55
62
  PLATFORMS
@@ -66,6 +73,7 @@ DEPENDENCIES
66
73
  rubocop-performance (~> 1.11)
67
74
  rubocop-rake (~> 0.6.0)
68
75
  rubocop-rspec (~> 2.4)
76
+ simplecov (~> 0.21.2)
69
77
 
70
78
  BUNDLED WITH
71
- 2.3.4
79
+ 2.3.14
data/bin/rubocop CHANGED
@@ -1,2 +1,15 @@
1
1
  #!/bin/bash
2
- bundle exec rubocop --force-exclusion $*
2
+ # Run Rubocop with sane defaults
3
+ # - only recognized file types: allows you to pipe in anything and have Rubocop only check files it thinks it can handle
4
+ # - force exclusion: even when piping in files, it still ignores what I should not touch
5
+ # - display style guide: help figure out what a rule is supposed to do
6
+ # - safe: don't run experimental cops
7
+ # - display only fail level offenses: hides INFO severity failing cops
8
+
9
+ bundle exec rubocop \
10
+ --only-recognized-file-types \
11
+ --force-exclusion \
12
+ --display-style-guide \
13
+ --safe \
14
+ --display-only-fail-level-offenses \
15
+ $*
data/groupie.gemspec CHANGED
@@ -13,7 +13,9 @@ Gem::Specification.new do |spec|
13
13
  ' of one of the defined groups. Think of bayesian spam filters.'
14
14
  spec.homepage = 'https://github.com/Narnach/groupie'
15
15
  spec.license = 'MIT'
16
- spec.required_ruby_version = '>= 2.6.0' # EOL for 2.6 is 2022-03-31, so support this as the minimum for now
16
+ # Ruby maintains support for the last 3-4 minor versions, so that's what we do as well.
17
+ # See: https://www.ruby-lang.org/en/downloads/branches/
18
+ spec.required_ruby_version = '>= 2.7.0'
17
19
 
18
20
  spec.metadata['homepage_uri'] = spec.homepage
19
21
  spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'uri'
4
+
5
class Groupie
  # Tokenizer helps turn a String into an Array of Strings that are the
  # individual tokens (mostly words) from the input.
  #
  # Please consider this entire class to be a private API,
  # and use Groupie.tokenize to tokenize things.
  class Tokenizer
    # @param [String, #to_s] input the text to tokenize; converted with #to_s
    def initialize(input)
      # Ensure our input is converted to a String and duplicated so we can modify it in-place
      @raw = input.to_s.dup
    end

    # Turn the input into tokens. The result is memoized, so the in-place
    # clean-up steps run only once per Tokenizer instance.
    #
    # @return [Array<String>] the token candidates found in the input
    def to_tokens
      return @tokens if @tokens

      # In-place modifications to our @raw String.
      # Order matters: HTML tags and URLs must be handled before the generic
      # non-word character stripping removes the characters that identify them.
      downcase!
      normalize_whitespace!
      strip_html_tags!
      tokenize_urls!
      strip_non_word_characters!

      # Split the resulting string on whitespace and clean up the token candidates
      @tokens = @raw.split.map { |str| remove_interpunction!(str) }

      @tokens
    end

    private

    # Ignore case by downcasing everything
    def downcase!
      @raw.downcase!
    end

    # Convert all types of whitespace (space, tab, newline) into regular spaces
    def normalize_whitespace!
      @raw.gsub!(/\s+/, ' ')
    end

    # Strip HTML tags entirely
    def strip_html_tags!
      @raw.gsub!(/<[^>]+?>/, ' ')
    end

    # Intelligently split URLs into their component parts.
    # Anything starting with "http" followed by URL-ish characters is treated
    # as a URL candidate; candidates that fail to parse are left untouched.
    def tokenize_urls!
      @raw.gsub!(%r{http[\w\-\#:/_.?&=]+}) do |url|
        maybe_parse_url(url) do |uri|
          # Opaque URIs such as "http:foo" have a nil path; the part after the
          # scheme is exposed via #opaque instead, so fall back to it rather
          # than calling String methods on nil.
          path = (uri.path || uri.opaque).to_s.tr('/_\-', ' ')
          # Query and fragment are nil when absent; nil.to_s yields '' which
          # interpolates the same way nil would.
          query = uri.query.to_s.tr('?=&#_\-', ' ')
          fragment = uri.fragment.to_s.tr('#_/\-', ' ')

          "#{uri.scheme} #{uri.host} #{path} #{query} #{fragment}"
        end
      end
    end

    # Strip characters not likely to be part of a word or number
    def strip_non_word_characters!
      @raw.gsub!(/[^\w\ \-.,]/, ' ')
    end

    # Remove wrapping quotes and interpunction from individual token candidates.
    # Modifies str in-place and returns it (gsub! alone returns nil when nothing changed).
    def remove_interpunction!(str)
      str.gsub!(/\A['"]+|[!,."']+\Z/, '')
      str
    end

    # Sometimes a String looks like a URL, but it's not.
    # This method attempts to parse the input string into a URI.
    # If it's successful, yield it to the block and return its response.
    # In case of failure, return the original string.
    def maybe_parse_url(input)
      uri = URI.parse(input)
      yield uri
    rescue URI::InvalidURIError
      input
    end
  end
end
@@ -2,7 +2,7 @@
2
2
 
3
3
  # This extends Groupie and adds a version number
4
4
  class Groupie
5
- VERSION = '0.5.0'
5
+ VERSION = '0.6.0'
6
6
 
7
7
  def self.version
8
8
  VERSION
data/lib/groupie.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 'groupie/version'
4
4
  require_relative 'groupie/group'
5
+ require_relative 'groupie/tokenizer'
5
6
  require 'set'
6
7
 
7
8
  # Groupie is a text grouper and classifier, using naive Bayesian filtering.
@@ -24,13 +25,7 @@ class Groupie
24
25
  # @param [String, #to_s] object
25
26
  # @return [Array<String>]
26
27
  def self.tokenize(object)
27
- object
28
- .to_s
29
- .downcase
30
- .gsub(/\s/, ' ')
31
- .gsub(/[$']/, '')
32
- .gsub(/<[^>]+?>|[^\w -.,]/, '')
33
- .split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
28
+ Tokenizer.new(object).to_tokens
34
29
  end
35
30
 
36
31
  # Access an existing Group or create a new one.
@@ -52,7 +47,7 @@ class Groupie
52
47
  group_score_sums, hits = calculate_group_scores(words, strategy)
53
48
 
54
49
  group_score_sums.each.with_object({}) do |(group, sum), averages|
55
- averages[group] = hits.positive? ? sum / hits : 0
50
+ averages[group] = sum / hits
56
51
  end
57
52
  end
58
53
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: groupie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Wes Oldenbeuving
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-02-16 00:00:00.000000000 Z
11
+ date: 2022-05-19 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Groupie is a simple way to group texts and classify new texts as being
14
14
  a likely member of one of the defined groups. Think of bayesian spam filters.
@@ -40,6 +40,7 @@ files:
40
40
  - groupie.gemspec
41
41
  - lib/groupie.rb
42
42
  - lib/groupie/group.rb
43
+ - lib/groupie/tokenizer.rb
43
44
  - lib/groupie/version.rb
44
45
  homepage: https://github.com/Narnach/groupie
45
46
  licenses:
@@ -57,14 +58,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
57
58
  requirements:
58
59
  - - ">="
59
60
  - !ruby/object:Gem::Version
60
- version: 2.6.0
61
+ version: 2.7.0
61
62
  required_rubygems_version: !ruby/object:Gem::Requirement
62
63
  requirements:
63
64
  - - ">="
64
65
  - !ruby/object:Gem::Version
65
66
  version: '0'
66
67
  requirements: []
67
- rubygems_version: 3.3.3
68
+ rubygems_version: 3.3.14
68
69
  signing_key:
69
70
  specification_version: 4
70
71
  summary: Library to help you group texts and classify new ones