groupie 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,53 +1,12 @@
1
- require 'rubygems'
2
- require 'rake'
1
+ # frozen_string_literal: true
3
2
 
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "groupie"
8
- gem.summary = %Q{Group and classify text}
9
- gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
10
- gem.email = "narnach@gmail.com"
11
- gem.homepage = "http://github.com/Narnach/groupie"
12
- gem.authors = ["Wes Oldenbeuving"]
13
- gem.add_development_dependency "testy", ">= 0"
14
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
- end
16
- Jeweler::GemcutterTasks.new
17
- rescue LoadError
18
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
- end
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
20
5
 
21
- require 'rake/testtask'
22
- Rake::TestTask.new(:spec) do |test|
23
- test.libs << 'lib' << 'spec'
24
- test.pattern = 'spec/**/*_spec.rb'
25
- test.verbose = true
26
- end
6
+ RSpec::Core::RakeTask.new(:spec)
27
7
 
28
- begin
29
- require 'rcov/rcovtask'
30
- Rcov::RcovTask.new do |test|
31
- test.libs << 'spec'
32
- test.pattern = 'spec/**/*_spec.rb'
33
- test.verbose = true
34
- end
35
- rescue LoadError
36
- task :rcov do
37
- abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
38
- end
39
- end
8
+ require 'rubocop/rake_task'
40
9
 
41
- task :test => :check_dependencies
10
+ RuboCop::RakeTask.new
42
11
 
43
- task :default => :test
44
-
45
- require 'rake/rdoctask'
46
- Rake::RDocTask.new do |rdoc|
47
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
-
49
- rdoc.rdoc_dir = 'rdoc'
50
- rdoc.title = "groupie #{version}"
51
- rdoc.rdoc_files.include('readme*')
52
- rdoc.rdoc_files.include('lib/**/*.rb')
53
- end
12
+ task default: [:spec, :rubocop]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'groupie'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
data/bin/rubocop ADDED
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ bundle exec rubocop --force-exclusion $*
data/bin/setup ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -x
5
+
6
+ bundle install
7
+ bundle clean
8
+
9
+ # Do any other automated setup that you need to do here
data/groupie.gemspec ADDED
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/groupie/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'groupie'
7
+ spec.version = Groupie::VERSION
8
+ spec.authors = ['Wes Oldenbeuving']
9
+ spec.email = ['wes@narnach.com']
10
+
11
+ spec.summary = 'Library to help you group texts and classify new ones'
12
+ spec.description = 'Groupie is a simple way to group texts and classify new texts as being a likely member' \
13
+ ' of one of the defined groups. Think of bayesian spam filters.'
14
+ spec.homepage = 'https://github.com/Narnach/groupie'
15
+ spec.license = 'MIT'
16
+ spec.required_ruby_version = '>= 2.6.0' # EOL for 2.6 is 2022-03-31, so support this as the minimum for now
17
+
18
+ spec.metadata['homepage_uri'] = spec.homepage
19
+ spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
20
+ spec.metadata['changelog_uri'] = 'https://github.com/Narnach/groupie/blob/stable/Changelog.md'
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # `git ls-files -z` lists the files tracked by git; those (minus test/spec/features directories) are packaged into the gem.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
26
+ end
27
+ # spec.bindir = "exe"
28
+ # spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ['lib']
30
+
31
+ # Uncomment to register a new dependency of your gem
32
+ # spec.add_dependency "example-gem", "~> 1.0"
33
+
34
+ # For more information and examples about making a new gem, check out our
35
+ # guide at: https://bundler.io/guides/creating_gem.html
36
+ end
@@ -1,12 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Groupie
2
4
  module CoreExt
5
+ # This module monkey patches String to respond to #tokenize
3
6
  module String
4
7
  def tokenize
5
- downcase.
6
- gsub(/\s/," ").
7
- gsub(/[$']/,'').
8
- gsub(/<[^>]+?>|[^\w -.,]/,'').
9
- split(" ").map {|str| str.gsub(/\A['"]+|[!,."']+\Z/,'')}
8
+ warn "Please use Groupie.tokenize instead of String#tokenize (from #{caller(1..1).first})"
9
+ Groupie.tokenize(self)
10
10
  end
11
11
  end
12
12
  end
@@ -14,4 +14,4 @@ end
14
14
 
15
15
  class String
16
16
  include Groupie::CoreExt::String
17
- end
17
+ end
data/lib/groupie/group.rb CHANGED
@@ -1,6 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Groupie
4
+ # Group represents a group or category that words can be classified into.
2
5
  class Group
3
6
  attr_reader :word_counts
7
+
4
8
  def initialize(name)
5
9
  @name = name
6
10
  @word_counts = {}
@@ -17,6 +21,7 @@ class Groupie
17
21
  end
18
22
  nil
19
23
  end
24
+
20
25
  alias << add
21
26
 
22
27
  # Return the count for a specific +word+.
@@ -24,11 +29,12 @@ class Groupie
24
29
  @word_counts[word] || 0
25
30
  end
26
31
 
32
+ private
33
+
27
34
  # Add a single word and count it.
28
35
  def add_word(word)
29
36
  @word_counts[word] ||= 0
30
37
  @word_counts[word] += 1
31
38
  end
32
- private :add_word
33
39
  end
34
- end
40
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This extends Groupie and adds a version number
4
+ class Groupie
5
+ VERSION = '0.4.0'
6
+
7
+ def self.version
8
+ VERSION
9
+ end
10
+ end
data/lib/groupie.rb CHANGED
@@ -1,86 +1,133 @@
1
- lib_dir = File.expand_path(File.dirname(__FILE__))
2
- $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
3
- require 'groupie/group'
4
- require 'groupie/core_ext/string'
1
+ # frozen_string_literal: true
5
2
 
3
+ require_relative 'groupie/version'
4
+ require_relative 'groupie/group'
5
+ require_relative 'groupie/core_ext/string'
6
+
7
+ # Groupie is a text grouper and classifier, using naive Bayesian filtering.
6
8
  class Groupie
9
+ # Wrap all errors we raise in this so our own errors are recognizable.
10
+ class Error < StandardError; end
11
+
7
12
  def initialize
8
13
  @groups = {}
9
14
  end
10
15
 
16
+ # Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
17
+ # This attempts to remove most common punctuation marks and types of whitespace.
18
+ #
19
+ # @param [String, #to_s] object
20
+ # @return [Array<String>]
21
+ def self.tokenize(object)
22
+ object
23
+ .to_s
24
+ .downcase
25
+ .gsub(/\s/, ' ')
26
+ .gsub(/[$']/, '')
27
+ .gsub(/<[^>]+?>|[^\w -.,]/, '')
28
+ .split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
29
+ end
30
+
31
+ # Access an existing Group or create a new one.
32
+ #
33
+ # @param [Object] group The name of the group to access.
34
+ # @return [Groupie::Group] An existing or new group identified by +group+.
11
35
  def [](group)
12
36
  @groups[group] ||= Group.new(group)
13
37
  end
14
38
 
15
- def unique_words
16
- @unique_words ||= (
17
- total_count = @groups.values.map {|group| group.word_counts}.inject{|total, counts| total.merge(counts){|key,o,n| o+n}}
18
- median_index = [total_count.values.size * 3 / 4 - 1, 1].max
19
- median_frequency = total_count.values.sort[median_index]
20
- total_count.select{|word, count| count <= median_frequency}.map(&:first)
21
- )
39
+ # Classify a text by taking the average of all word classifications.
40
+ #
41
+ # @param [Array<String>] words List of words to be classified
42
+ # @param [Symbol] strategy One of :sum, :sqrt, :unique or :log (:unique also limits +words+ to #unique_words)
43
+ # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
44
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
45
+ def classify_text(words, strategy = :sum)
46
+ words &= unique_words if strategy == :unique
47
+ group_score_sums, hits = calculate_group_scores(words, strategy)
48
+
49
+ group_score_sums.each.with_object({}) do |(group, sum), averages|
50
+ averages[group] = hits.positive? ? sum / hits : 0
51
+ end
22
52
  end
23
53
 
24
- def classify(entry, strategy=:sum)
54
+ # Classify a single word against all groups.
55
+ #
56
+ # @param [String] entry A word to be classified
57
+ # @param [Symbol] strategy One of :sum, :sqrt, :unique or :log
58
+ # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
59
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
60
+ def classify(entry, strategy = :sum)
25
61
  results = {}
26
- total_count = @groups.inject(0) do |sum, name_group|
27
- group = name_group.last
28
- count = group.count(entry)
29
- case strategy
30
- when :sum
31
- sum += count
32
- when :sqrt, :unique
33
- sum += Math::sqrt(count)
34
- when :log
35
- sum += Math::log10(count) if count > 0
36
- else
37
- raise "Invalid strategy: #{strategy}"
38
- end
39
- next sum
62
+ total_count = @groups.values.inject(0) do |sum, group|
63
+ sum + apply_count_strategy(group.count(entry), strategy)
40
64
  end
41
- return results if 0 == total_count
65
+ return results if total_count.zero?
42
66
 
43
67
  @groups.each do |name, group|
44
- count = group.count(entry)
45
- case strategy
46
- when :sum
47
- # keep count
48
- when :sqrt, :unique
49
- count = Math::sqrt(count)
50
- when :log
51
- count = Math::log10(count) if count > 0
52
- else
53
- raise "Invalid strategy: #{strategy}"
54
- end
55
- results[name] = count > 0 ? count.to_f / total_count : 0.0
68
+ count = apply_count_strategy(group.count(entry), strategy)
69
+ results[name] = count.positive? ? count.to_f / total_count : 0.0
56
70
  end
57
- return results
71
+
72
+ results
58
73
  end
59
74
 
60
- # Classify a text by taking the average of all word classifications.
61
- def classify_text(words, strategy=:sum)
62
- hits = 0
63
- if strategy==:unique
64
- words = words & unique_words
75
+ # Return the words seen across all groups, excluding the 4th quartile of most popular words.
76
+ # Why do this? So the most common (and thus meaningless) words are ignored
77
+ # and less common words gain more predictive power.
78
+ #
79
+ # This is used by the :unique strategy of the classifier.
80
+ #
81
+ # @return [Array<String>]
82
+ def unique_words
83
+ # Iterate over all Groups and merge their <word, count> dictionaries into one
84
+ total_count = @groups.inject({}) do |total, (_name, group)|
85
+ total.merge!(group.word_counts) { |_key, o, n| o + n }
65
86
  end
66
- group_score_sums = words.inject({}) do |results, word|
87
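+ # Extract the word count at the 75th percentile. NOTE(review): with fewer than two distinct words this index is out of range and the lookup below returns nil.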
88
+ top_quartile_index = [total_count.size * 3 / 4 - 1, 1].max
89
+ top_quartile_frequency = total_count.values.sort[top_quartile_index]
90
+ # Throw out all words which have a count that's above this frequency
91
+ total_count.reject! { |_word, count| count > top_quartile_frequency }
92
+ total_count.keys
93
+ end
94
+
95
+ private
96
+
97
+ # Calculate grouped scores
98
+ #
99
+ # @param [Array<String>] words
100
+ # @param [Symbol] strategy
101
+ # @return [Array(Hash<Object, Float>, Integer)] a Hash with <group, summed score> pairs and an Integer with the number of hits
102
+ def calculate_group_scores(words, strategy)
103
+ hits = 0
104
+ group_score_sums = words.each.with_object({}) do |word, results|
67
105
  word_results = classify(word, strategy)
68
106
  next results if word_results.empty?
69
- hits += 1
70
- results.merge(word_results) do |key, old, new|
71
- old + new
72
- end
73
- end
74
107
 
75
- averages={}
76
- group_score_sums.each do |group, sum|
77
- averages[group] = hits > 0 ? sum / hits : 0
108
+ hits += 1
109
+ results.merge!(word_results) { |_key, old, new| old + new }
78
110
  end
79
111
 
80
- averages
112
+ [group_score_sums, hits]
81
113
  end
82
114
 
83
- def self.version
84
- File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
115
+ # Helper function to reduce a raw word count to a strategy-modified weight.
116
+ # @param [Integer] count
117
+ # @param [Symbol] strategy One of :sum, :sqrt, :unique or :log
118
+ # @return [Integer, Float]
119
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
120
+ def apply_count_strategy(count, strategy)
121
+ case strategy
122
+ when :sum
123
+ # keep count
124
+ when :sqrt, :unique
125
+ count = Math.sqrt(count)
126
+ when :log
127
+ count = Math.log10(count) if count.positive?
128
+ else
129
+ raise Error, "Invalid strategy: #{strategy}"
130
+ end
131
+ count
85
132
  end
86
- end
133
+ end
metadata CHANGED
@@ -1,100 +1,68 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: groupie
3
- version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 3
9
- - 0
10
- version: 0.3.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Wes Oldenbeuving
14
- autorequire:
8
+ autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2010-07-29 00:00:00 +02:00
19
- default_executable:
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
22
- name: testy
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ">="
28
- - !ruby/object:Gem::Version
29
- hash: 3
30
- segments:
31
- - 0
32
- version: "0"
33
- type: :development
34
- version_requirements: *id001
35
- description: Group and classify text based on likelyhood of being included in a text of a specific category
36
- email: narnach@gmail.com
11
+ date: 2021-09-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Groupie is a simple way to group texts and classify new texts as being
14
+ a likely member of one of the defined groups. Think of bayesian spam filters.
15
+ email:
16
+ - wes@narnach.com
37
17
  executables: []
38
-
39
18
  extensions: []
40
-
41
- extra_rdoc_files:
42
- - LICENSE
43
- files:
44
- - .document
45
- - .gitignore
46
- - LICENSE
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".github/dependabot.yml"
22
+ - ".github/workflows/gem.yml"
23
+ - ".github/workflows/rspec.yml"
24
+ - ".github/workflows/rubocop.yml"
25
+ - ".gitignore"
26
+ - ".rspec"
27
+ - ".rubocop.yml"
28
+ - CHANGELOG.md
29
+ - Gemfile
30
+ - Gemfile.lock
31
+ - LICENSE.txt
32
+ - README.md
47
33
  - Rakefile
48
- - VERSION
34
+ - bin/console
35
+ - bin/rubocop
36
+ - bin/setup
37
+ - groupie.gemspec
49
38
  - lib/groupie.rb
50
39
  - lib/groupie/core_ext/string.rb
51
40
  - lib/groupie/group.rb
52
- - readme.rdoc
53
- - spec/fixtures/ham/email_ham1.txt
54
- - spec/fixtures/ham/spam.la-44116217.txt
55
- - spec/fixtures/spam/email_spam1.txt
56
- - spec/fixtures/spam/email_spam2.txt
57
- - spec/fixtures/spam/spam.la-44118014.txt
58
- - spec/groupie/core_ext/string_spec.rb
59
- - spec/groupie/group_spec.rb
60
- - spec/groupie_spec.rb
61
- - spec/spec_helper.rb
62
- has_rdoc: true
63
- homepage: http://github.com/Narnach/groupie
64
- licenses: []
65
-
66
- post_install_message:
67
- rdoc_options:
68
- - --charset=UTF-8
69
- require_paths:
41
+ - lib/groupie/version.rb
42
+ homepage: https://github.com/Narnach/groupie
43
+ licenses:
44
+ - MIT
45
+ metadata:
46
+ homepage_uri: https://github.com/Narnach/groupie
47
+ source_code_uri: https://github.com/Narnach/groupie
48
+ changelog_uri: https://github.com/Narnach/groupie/blob/stable/Changelog.md
49
+ post_install_message:
50
+ rdoc_options: []
51
+ require_paths:
70
52
  - lib
71
- required_ruby_version: !ruby/object:Gem::Requirement
72
- none: false
73
- requirements:
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
74
55
  - - ">="
75
- - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
- version: "0"
80
- required_rubygems_version: !ruby/object:Gem::Requirement
81
- none: false
82
- requirements:
56
+ - !ruby/object:Gem::Version
57
+ version: 2.6.0
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
83
60
  - - ">="
84
- - !ruby/object:Gem::Version
85
- hash: 3
86
- segments:
87
- - 0
88
- version: "0"
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
89
63
  requirements: []
90
-
91
- rubyforge_project:
92
- rubygems_version: 1.3.7
93
- signing_key:
94
- specification_version: 3
95
- summary: Group and classify text
96
- test_files:
97
- - spec/groupie/core_ext/string_spec.rb
98
- - spec/groupie/group_spec.rb
99
- - spec/groupie_spec.rb
100
- - spec/spec_helper.rb
64
+ rubygems_version: 3.2.24
65
+ signing_key:
66
+ specification_version: 4
67
+ summary: Library to help you group texts and classify new ones
68
+ test_files: []