groupie 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,53 +1,12 @@
1
- require 'rubygems'
2
- require 'rake'
1
+ # frozen_string_literal: true
3
2
 
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "groupie"
8
- gem.summary = %Q{Group and classify text}
9
- gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
10
- gem.email = "narnach@gmail.com"
11
- gem.homepage = "http://github.com/Narnach/groupie"
12
- gem.authors = ["Wes Oldenbeuving"]
13
- gem.add_development_dependency "testy", ">= 0"
14
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
- end
16
- Jeweler::GemcutterTasks.new
17
- rescue LoadError
18
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
- end
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
20
5
 
21
- require 'rake/testtask'
22
- Rake::TestTask.new(:spec) do |test|
23
- test.libs << 'lib' << 'spec'
24
- test.pattern = 'spec/**/*_spec.rb'
25
- test.verbose = true
26
- end
6
+ RSpec::Core::RakeTask.new(:spec)
27
7
 
28
- begin
29
- require 'rcov/rcovtask'
30
- Rcov::RcovTask.new do |test|
31
- test.libs << 'spec'
32
- test.pattern = 'spec/**/*_spec.rb'
33
- test.verbose = true
34
- end
35
- rescue LoadError
36
- task :rcov do
37
- abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
38
- end
39
- end
8
+ require 'rubocop/rake_task'
40
9
 
41
- task :test => :check_dependencies
10
+ RuboCop::RakeTask.new
42
11
 
43
- task :default => :test
44
-
45
- require 'rake/rdoctask'
46
- Rake::RDocTask.new do |rdoc|
47
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
-
49
- rdoc.rdoc_dir = 'rdoc'
50
- rdoc.title = "groupie #{version}"
51
- rdoc.rdoc_files.include('readme*')
52
- rdoc.rdoc_files.include('lib/**/*.rb')
53
- end
12
+ task default: [:spec, :rubocop]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'groupie'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
data/bin/rubocop ADDED
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ bundle exec rubocop --force-exclusion $*
data/bin/setup ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -x
5
+
6
+ bundle install
7
+ bundle clean
8
+
9
+ # Do any other automated setup that you need to do here
data/groupie.gemspec ADDED
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/groupie/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'groupie'
7
+ spec.version = Groupie::VERSION
8
+ spec.authors = ['Wes Oldenbeuving']
9
+ spec.email = ['wes@narnach.com']
10
+
11
+ spec.summary = 'Library to help you group texts and classify new ones'
12
+ spec.description = 'Groupie is a simple way to group texts and classify new texts as being a likely member' \
13
+ ' of one of the defined groups. Think of bayesian spam filters.'
14
+ spec.homepage = 'https://github.com/Narnach/groupie'
15
+ spec.license = 'MIT'
16
+ spec.required_ruby_version = '>= 2.6.0' # EOL for 2.6 is 2022-03-31, so support this as the minimum for now
17
+
18
+ spec.metadata['homepage_uri'] = spec.homepage
19
+ spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
20
+ spec.metadata['changelog_uri'] = 'https://github.com/Narnach/groupie/blob/stable/Changelog.md'
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # `git ls-files -z` lists the files tracked by git, so only those are packaged into the gem.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
26
+ end
27
+ # spec.bindir = "exe"
28
+ # spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ['lib']
30
+
31
+ # Uncomment to register a new dependency of your gem
32
+ # spec.add_dependency "example-gem", "~> 1.0"
33
+
34
+ # For more information and examples about making a new gem, check out our
35
+ # guide at: https://bundler.io/guides/creating_gem.html
36
+ end
@@ -1,12 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Groupie
2
4
  module CoreExt
5
+ # This module monkey patches String to respond to #tokenize
3
6
  module String
4
7
  def tokenize
5
- downcase.
6
- gsub(/\s/," ").
7
- gsub(/[$']/,'').
8
- gsub(/<[^>]+?>|[^\w -.,]/,'').
9
- split(" ").map {|str| str.gsub(/\A['"]+|[!,."']+\Z/,'')}
8
+ warn "Please use Groupie.tokenize instead of String#tokenize (from #{caller(1..1).first})"
9
+ Groupie.tokenize(self)
10
10
  end
11
11
  end
12
12
  end
@@ -14,4 +14,4 @@ end
14
14
 
15
15
  class String
16
16
  include Groupie::CoreExt::String
17
- end
17
+ end
data/lib/groupie/group.rb CHANGED
@@ -1,6 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Groupie
4
+ # Group represents a group or category that words can be classified into.
2
5
  class Group
3
6
  attr_reader :word_counts
7
+
4
8
  def initialize(name)
5
9
  @name = name
6
10
  @word_counts = {}
@@ -17,6 +21,7 @@ class Groupie
17
21
  end
18
22
  nil
19
23
  end
24
+
20
25
  alias << add
21
26
 
22
27
  # Return the count for a specific +word+.
@@ -24,11 +29,12 @@ class Groupie
24
29
  @word_counts[word] || 0
25
30
  end
26
31
 
32
+ private
33
+
27
34
  # Add a single word and count it.
28
35
  def add_word(word)
29
36
  @word_counts[word] ||= 0
30
37
  @word_counts[word] += 1
31
38
  end
32
- private :add_word
33
39
  end
34
- end
40
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This extends Groupie and adds a version number
4
+ class Groupie
5
+ VERSION = '0.4.0'
6
+
7
+ def self.version
8
+ VERSION
9
+ end
10
+ end
data/lib/groupie.rb CHANGED
@@ -1,86 +1,133 @@
1
- lib_dir = File.expand_path(File.dirname(__FILE__))
2
- $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
3
- require 'groupie/group'
4
- require 'groupie/core_ext/string'
1
+ # frozen_string_literal: true
5
2
 
3
+ require_relative 'groupie/version'
4
+ require_relative 'groupie/group'
5
+ require_relative 'groupie/core_ext/string'
6
+
7
+ # Groupie is a text grouper and classifier, using naive Bayesian filtering.
6
8
  class Groupie
9
+ # Wrap all errors we raise in this so our own errors are recognizable.
10
+ class Error < StandardError; end
11
+
7
12
  def initialize
8
13
  @groups = {}
9
14
  end
10
15
 
16
+ # Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
17
+ # This attempts to remove most common punctuation marks and types of whitespace.
18
+ #
19
+ # @param [String, #to_s] object
20
+ # @return [Array<String>]
21
+ def self.tokenize(object)
22
+ object
23
+ .to_s
24
+ .downcase
25
+ .gsub(/\s/, ' ')
26
+ .gsub(/[$']/, '')
27
+ .gsub(/<[^>]+?>|[^\w -.,]/, '')
28
+ .split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
29
+ end
30
+
31
+ # Access an existing Group or create a new one.
32
+ #
33
+ # @param [Object] group The name of the group to access.
34
+ # @return [Groupie::Group] An existing or new group identified by +group+.
11
35
  def [](group)
12
36
  @groups[group] ||= Group.new(group)
13
37
  end
14
38
 
15
- def unique_words
16
- @unique_words ||= (
17
- total_count = @groups.values.map {|group| group.word_counts}.inject{|total, counts| total.merge(counts){|key,o,n| o+n}}
18
- median_index = [total_count.values.size * 3 / 4 - 1, 1].max
19
- median_frequency = total_count.values.sort[median_index]
20
- total_count.select{|word, count| count <= median_frequency}.map(&:first)
21
- )
39
+ # Classify a text by taking the average of all word classifications.
40
+ #
41
+ # @param [Array<String>] words List of words to be classified
42
+ # @param [Symbol] strategy
43
+ # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
44
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
45
+ def classify_text(words, strategy = :sum)
46
+ words &= unique_words if strategy == :unique
47
+ group_score_sums, hits = calculate_group_scores(words, strategy)
48
+
49
+ group_score_sums.each.with_object({}) do |(group, sum), averages|
50
+ averages[group] = hits.positive? ? sum / hits : 0
51
+ end
22
52
  end
23
53
 
24
- def classify(entry, strategy=:sum)
54
+ # Classify a single word against all groups.
55
+ #
56
+ # @param [String] entry A word to be classified
57
+ # @param [Symbol] strategy
58
+ # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
59
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
60
+ def classify(entry, strategy = :sum)
25
61
  results = {}
26
- total_count = @groups.inject(0) do |sum, name_group|
27
- group = name_group.last
28
- count = group.count(entry)
29
- case strategy
30
- when :sum
31
- sum += count
32
- when :sqrt, :unique
33
- sum += Math::sqrt(count)
34
- when :log
35
- sum += Math::log10(count) if count > 0
36
- else
37
- raise "Invalid strategy: #{strategy}"
38
- end
39
- next sum
62
+ total_count = @groups.values.inject(0) do |sum, group|
63
+ sum + apply_count_strategy(group.count(entry), strategy)
40
64
  end
41
- return results if 0 == total_count
65
+ return results if total_count.zero?
42
66
 
43
67
  @groups.each do |name, group|
44
- count = group.count(entry)
45
- case strategy
46
- when :sum
47
- # keep count
48
- when :sqrt, :unique
49
- count = Math::sqrt(count)
50
- when :log
51
- count = Math::log10(count) if count > 0
52
- else
53
- raise "Invalid strategy: #{strategy}"
54
- end
55
- results[name] = count > 0 ? count.to_f / total_count : 0.0
68
+ count = apply_count_strategy(group.count(entry), strategy)
69
+ results[name] = count.positive? ? count.to_f / total_count : 0.0
56
70
  end
57
- return results
71
+
72
+ results
58
73
  end
59
74
 
60
- # Classify a text by taking the average of all word classifications.
61
- def classify_text(words, strategy=:sum)
62
- hits = 0
63
- if strategy==:unique
64
- words = words & unique_words
75
+ # Return the list of known words, excluding the 4th quartile of most popular words.
76
+ # Why do this? So the most common (and thus meaningless) words are ignored
77
+ # and less common words gain more predictive power.
78
+ #
79
+ # This is used by the :unique strategy of the classifier.
80
+ #
81
+ # @return [Array<String>]
82
+ def unique_words
83
+ # Iterate over all Groups and merge their <word, count> dictionaries into one
84
+ total_count = @groups.inject({}) do |total, (_name, group)|
85
+ total.merge!(group.word_counts) { |_key, o, n| o + n }
65
86
  end
66
- group_score_sums = words.inject({}) do |results, word|
87
+ # Extract the word count at the 75th percentile (the boundary of the top quartile)
88
+ top_quartile_index = [total_count.size * 3 / 4 - 1, 1].max
89
+ top_quartile_frequency = total_count.values.sort[top_quartile_index]
90
+ # Throw out all words which have a count that's above this frequency
91
+ total_count.reject! { |_word, count| count > top_quartile_frequency }
92
+ total_count.keys
93
+ end
94
+
95
+ private
96
+
97
+ # Calculate grouped scores
98
+ #
99
+ # @param [Array<String>] words
100
+ # @param [Symbol] strategy
101
+ # @return [Array(Hash<Object, Float>, Integer)] a Hash with <group, score> pairs and an integer with the number of hits
102
+ def calculate_group_scores(words, strategy)
103
+ hits = 0
104
+ group_score_sums = words.each.with_object({}) do |word, results|
67
105
  word_results = classify(word, strategy)
68
106
  next results if word_results.empty?
69
- hits += 1
70
- results.merge(word_results) do |key, old, new|
71
- old + new
72
- end
73
- end
74
107
 
75
- averages={}
76
- group_score_sums.each do |group, sum|
77
- averages[group] = hits > 0 ? sum / hits : 0
108
+ hits += 1
109
+ results.merge!(word_results) { |_key, old, new| old + new }
78
110
  end
79
111
 
80
- averages
112
+ [group_score_sums, hits]
81
113
  end
82
114
 
83
- def self.version
84
- File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
115
+ # Helper function to reduce a raw word count to a strategy-modified weight.
116
+ # @param [Integer] count
117
+ # @param [Symbol] strategy
118
+ # @return [Integer, Float]
119
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
120
+ def apply_count_strategy(count, strategy)
121
+ case strategy
122
+ when :sum
123
+ # keep count
124
+ when :sqrt, :unique
125
+ count = Math.sqrt(count)
126
+ when :log
127
+ count = Math.log10(count) if count.positive?
128
+ else
129
+ raise Error, "Invalid strategy: #{strategy}"
130
+ end
131
+ count
85
132
  end
86
- end
133
+ end
metadata CHANGED
@@ -1,100 +1,68 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: groupie
3
- version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 3
9
- - 0
10
- version: 0.3.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Wes Oldenbeuving
14
- autorequire:
8
+ autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2010-07-29 00:00:00 +02:00
19
- default_executable:
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
22
- name: testy
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ">="
28
- - !ruby/object:Gem::Version
29
- hash: 3
30
- segments:
31
- - 0
32
- version: "0"
33
- type: :development
34
- version_requirements: *id001
35
- description: Group and classify text based on likelyhood of being included in a text of a specific category
36
- email: narnach@gmail.com
11
+ date: 2021-09-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Groupie is a simple way to group texts and classify new texts as being
14
+ a likely member of one of the defined groups. Think of bayesian spam filters.
15
+ email:
16
+ - wes@narnach.com
37
17
  executables: []
38
-
39
18
  extensions: []
40
-
41
- extra_rdoc_files:
42
- - LICENSE
43
- files:
44
- - .document
45
- - .gitignore
46
- - LICENSE
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".github/dependabot.yml"
22
+ - ".github/workflows/gem.yml"
23
+ - ".github/workflows/rspec.yml"
24
+ - ".github/workflows/rubocop.yml"
25
+ - ".gitignore"
26
+ - ".rspec"
27
+ - ".rubocop.yml"
28
+ - CHANGELOG.md
29
+ - Gemfile
30
+ - Gemfile.lock
31
+ - LICENSE.txt
32
+ - README.md
47
33
  - Rakefile
48
- - VERSION
34
+ - bin/console
35
+ - bin/rubocop
36
+ - bin/setup
37
+ - groupie.gemspec
49
38
  - lib/groupie.rb
50
39
  - lib/groupie/core_ext/string.rb
51
40
  - lib/groupie/group.rb
52
- - readme.rdoc
53
- - spec/fixtures/ham/email_ham1.txt
54
- - spec/fixtures/ham/spam.la-44116217.txt
55
- - spec/fixtures/spam/email_spam1.txt
56
- - spec/fixtures/spam/email_spam2.txt
57
- - spec/fixtures/spam/spam.la-44118014.txt
58
- - spec/groupie/core_ext/string_spec.rb
59
- - spec/groupie/group_spec.rb
60
- - spec/groupie_spec.rb
61
- - spec/spec_helper.rb
62
- has_rdoc: true
63
- homepage: http://github.com/Narnach/groupie
64
- licenses: []
65
-
66
- post_install_message:
67
- rdoc_options:
68
- - --charset=UTF-8
69
- require_paths:
41
+ - lib/groupie/version.rb
42
+ homepage: https://github.com/Narnach/groupie
43
+ licenses:
44
+ - MIT
45
+ metadata:
46
+ homepage_uri: https://github.com/Narnach/groupie
47
+ source_code_uri: https://github.com/Narnach/groupie
48
+ changelog_uri: https://github.com/Narnach/groupie/blob/stable/Changelog.md
49
+ post_install_message:
50
+ rdoc_options: []
51
+ require_paths:
70
52
  - lib
71
- required_ruby_version: !ruby/object:Gem::Requirement
72
- none: false
73
- requirements:
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
74
55
  - - ">="
75
- - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
- version: "0"
80
- required_rubygems_version: !ruby/object:Gem::Requirement
81
- none: false
82
- requirements:
56
+ - !ruby/object:Gem::Version
57
+ version: 2.6.0
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
83
60
  - - ">="
84
- - !ruby/object:Gem::Version
85
- hash: 3
86
- segments:
87
- - 0
88
- version: "0"
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
89
63
  requirements: []
90
-
91
- rubyforge_project:
92
- rubygems_version: 1.3.7
93
- signing_key:
94
- specification_version: 3
95
- summary: Group and classify text
96
- test_files:
97
- - spec/groupie/core_ext/string_spec.rb
98
- - spec/groupie/group_spec.rb
99
- - spec/groupie_spec.rb
100
- - spec/spec_helper.rb
64
+ rubygems_version: 3.2.24
65
+ signing_key:
66
+ specification_version: 4
67
+ summary: Library to help you group texts and classify new ones
68
+ test_files: []