groupie 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,53 +1,12 @@
1
- require 'rubygems'
2
- require 'rake'
1
+ # frozen_string_literal: true
3
2
 
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "groupie"
8
- gem.summary = %Q{Group and classify text}
9
- gem.description = %Q{Group and classify text based on likelyhood of being included in a text of a specific category}
10
- gem.email = "narnach@gmail.com"
11
- gem.homepage = "http://github.com/Narnach/groupie"
12
- gem.authors = ["Wes Oldenbeuving"]
13
- gem.add_development_dependency "testy", ">= 0"
14
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
- end
16
- Jeweler::GemcutterTasks.new
17
- rescue LoadError
18
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
- end
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
20
5
 
21
- require 'rake/testtask'
22
- Rake::TestTask.new(:test) do |test|
23
- test.libs << 'lib' << 'test'
24
- test.pattern = 'test/**/*_test.rb'
25
- test.verbose = true
26
- end
6
+ RSpec::Core::RakeTask.new(:spec)
27
7
 
28
- begin
29
- require 'rcov/rcovtask'
30
- Rcov::RcovTask.new do |test|
31
- test.libs << 'test'
32
- test.pattern = 'test/**/*_test.rb'
33
- test.verbose = true
34
- end
35
- rescue LoadError
36
- task :rcov do
37
- abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
38
- end
39
- end
8
+ require 'rubocop/rake_task'
40
9
 
41
- task :test => :check_dependencies
10
+ RuboCop::RakeTask.new
42
11
 
43
- task :default => :test
44
-
45
- require 'rake/rdoctask'
46
- Rake::RDocTask.new do |rdoc|
47
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
-
49
- rdoc.rdoc_dir = 'rdoc'
50
- rdoc.title = "groupie #{version}"
51
- rdoc.rdoc_files.include('readme*')
52
- rdoc.rdoc_files.include('lib/**/*.rb')
53
- end
12
+ task default: [:spec, :rubocop]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'groupie'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
data/bin/rubocop ADDED
@@ -0,0 +1,2 @@
1
#!/bin/bash
# Run RuboCop through Bundler, forwarding every command-line argument unchanged.
# "$@" keeps each argument intact; an unquoted $* would re-split arguments that
# contain whitespace (e.g. paths with spaces) and glob-expand them.
bundle exec rubocop --force-exclusion "$@"
data/bin/setup ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -x
5
+
6
+ bundle install
7
+ bundle clean
8
+
9
+ # Do any other automated setup that you need to do here
data/groupie.gemspec CHANGED
@@ -1,60 +1,36 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
- # -*- encoding: utf-8 -*-
1
+ # frozen_string_literal: true
5
2
 
6
- Gem::Specification.new do |s|
7
- s.name = %q{groupie}
8
- s.version = "0.1.0"
3
+ require_relative 'lib/groupie/version'
9
4
 
10
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Wes Oldenbeuving"]
12
- s.date = %q{2010-07-25}
13
- s.description = %q{Group and classify text based on likelyhood of being included in a text of a specific category}
14
- s.email = %q{narnach@gmail.com}
15
- s.extra_rdoc_files = [
16
- "LICENSE"
17
- ]
18
- s.files = [
19
- ".document",
20
- "LICENSE",
21
- "Rakefile",
22
- "VERSION",
23
- "groupie.gemspec",
24
- "lib/groupie.rb",
25
- "lib/groupie/core_ext/string.rb",
26
- "lib/groupie/group.rb",
27
- "readme.rdoc",
28
- "test/fixtures/ham/spam.la-44116217.txt",
29
- "test/fixtures/spam/spam.la-44118014.txt",
30
- "test/groupie/core_ext/string_test.rb",
31
- "test/groupie/group_test.rb",
32
- "test/groupie_test.rb",
33
- "test/test_helper.rb"
34
- ]
35
- s.homepage = %q{http://github.com/Narnach/groupie}
36
- s.rdoc_options = ["--charset=UTF-8"]
37
- s.require_paths = ["lib"]
38
- s.rubygems_version = %q{1.3.7}
39
- s.summary = %q{Group and classify text}
40
- s.test_files = [
41
- "test/groupie/core_ext/string_test.rb",
42
- "test/groupie/group_test.rb",
43
- "test/groupie_test.rb",
44
- "test/test_helper.rb"
45
- ]
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'groupie'
7
+ spec.version = Groupie::VERSION
8
+ spec.authors = ['Wes Oldenbeuving']
9
+ spec.email = ['wes@narnach.com']
46
10
 
47
- if s.respond_to? :specification_version then
48
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
49
- s.specification_version = 3
11
+ spec.summary = 'Library to help you group texts and classify new ones'
12
+ spec.description = 'Groupie is a simple way to group texts and classify new texts as being a likely member' \
13
+ ' of one of the defined groups. Think of bayesian spam filters.'
14
+ spec.homepage = 'https://github.com/Narnach/groupie'
15
+ spec.license = 'MIT'
16
+ spec.required_ruby_version = '>= 2.6.0' # EOL for 2.6 is 2022-03-31, so support this as the minimum for now
50
17
 
51
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52
- s.add_development_dependency(%q<testy>, [">= 0"])
53
- else
54
- s.add_dependency(%q<testy>, [">= 0"])
55
- end
56
- else
57
- s.add_dependency(%q<testy>, [">= 0"])
18
# Extra links shown on the gem's registry page.
spec.metadata['homepage_uri'] = spec.homepage
spec.metadata['source_code_uri'] = 'https://github.com/Narnach/groupie'
# The file name must match the repository's file exactly (CHANGELOG.md);
# GitHub blob URLs are case-sensitive, so "Changelog.md" does not resolve.
spec.metadata['changelog_uri'] = 'https://github.com/Narnach/groupie/blob/stable/CHANGELOG.md'
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
58
26
  end
59
- end
27
+ # spec.bindir = "exe"
28
+ # spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ['lib']
60
30
 
31
+ # Uncomment to register a new dependency of your gem
32
+ # spec.add_dependency "example-gem", "~> 1.0"
33
+
34
+ # For more information and examples about making a new gem, check out our
35
+ # guide at: https://bundler.io/guides/creating_gem.html
36
+ end
@@ -1,12 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Groupie
2
4
  module CoreExt
5
+ # This module monkey patches String to respond to #tokenize
3
6
  module String
4
7
  def tokenize
5
- downcase.
6
- gsub(/\s/," ").
7
- gsub(/[$']/,'').
8
- gsub(/<[^>]+?>|[^\w -.,]/,'').
9
- split(" ").map {|str| str.gsub(/[,.]+\Z/,'')}
8
+ warn "Please use Groupie.tokenize instead of String#tokenize (from #{caller(1..1).first})"
9
+ Groupie.tokenize(self)
10
10
  end
11
11
  end
12
12
  end
@@ -14,4 +14,4 @@ end
14
14
 
15
15
  class String
16
16
  include Groupie::CoreExt::String
17
- end
17
+ end
data/lib/groupie/group.rb CHANGED
@@ -1,10 +1,15 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Groupie
4
+ # Group represents a group or category that words can be classified into.
2
5
  class Group
6
+ attr_reader :word_counts
7
+
3
8
  def initialize(name)
4
9
  @name = name
5
10
  @word_counts = {}
6
11
  end
7
-
12
+
8
13
  def words
9
14
  @word_counts.keys
10
15
  end
@@ -17,16 +22,19 @@ class Groupie
17
22
  nil
18
23
  end
19
24
 
25
+ alias << add
26
+
20
27
  # Return the count for a specific +word+.
21
28
  def count(word)
22
29
  @word_counts[word] || 0
23
30
  end
24
31
 
32
+ private
33
+
25
34
  # Add a single word and count it.
26
35
  def add_word(word)
27
36
  @word_counts[word] ||= 0
28
37
  @word_counts[word] += 1
29
38
  end
30
- private :add_word
31
39
  end
32
- end
40
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This extends Groupie and adds a version number
4
+ class Groupie
5
+ VERSION = '0.4.0'
6
+
7
+ def self.version
8
+ VERSION
9
+ end
10
+ end
data/lib/groupie.rb CHANGED
@@ -1,51 +1,133 @@
1
- lib_dir = File.expand_path(File.dirname(__FILE__))
2
- $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
3
- require 'groupie/group'
4
- require 'groupie/core_ext/string'
1
+ # frozen_string_literal: true
5
2
 
3
+ require_relative 'groupie/version'
4
+ require_relative 'groupie/group'
5
+ require_relative 'groupie/core_ext/string'
6
+
7
+ # Groupie is a text grouper and classifier, using naive Bayesian filtering.
6
8
  class Groupie
9
+ # Wrap all errors we raise in this so our own errors are recognizable.
10
+ class Error < StandardError; end
11
+
7
12
  def initialize
8
13
  @groups = {}
9
14
  end
10
15
 
16
# Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
#
# Steps, in order: downcase; turn every whitespace character into a plain space;
# drop "$" and "'"; drop anything shaped like an HTML/XML tag; drop every character
# outside the kept set; split on whitespace; trim quotes from the start and any of
# ! , . " ' from the end of each token.
#
# The kept set is word characters (\w, which is ASCII-only in Ruby) plus the ASCII
# range from space (0x20) through period (0x2E). That range is what the shorthand
# " -." denotes inside a character class; it is spelled out here because the
# shorthand reads like three literal characters. As a consequence punctuation such
# as ( ) # % & * + is kept inside tokens.
# NOTE(review): if only space, dash, period and comma were meant to be kept, the
# dash should be escaped instead -- that changes the tokens produced, so confirm
# against existing expectations before doing it.
#
# @param [String, #to_s] object
# @return [Array<String>]
def self.tokenize(object)
  object
    .to_s
    .downcase
    .gsub(/\s/, ' ')
    .gsub(/[$']/, '')
    .gsub(/<[^>]+?>|[^\w\x20-\x2E]/, '')
    .split
    .map { |token| token.gsub(/\A['"]+|[!,."']+\Z/, '') }
end
30
+
31
+ # Access an existing Group or create a new one.
32
+ #
33
+ # @param [Object] group The name of the group to access.
34
+ # @return [Groupie::Group] An existing or new group identified by +group+.
11
35
  def [](group)
12
36
  @groups[group] ||= Group.new(group)
13
37
  end
14
38
 
15
- def classify(entry)
39
# Classify a text by averaging the classifications of its individual words.
#
# Only words that produce a classification count towards the average; words that
# no group has seen are ignored rather than averaged in as zero. With the :unique
# strategy the words are first intersected with #unique_words, which also removes
# duplicate words from the input.
#
# @param [Array<String>] words List of words to be classified
# @param [Symbol] strategy One of :sum, :sqrt, :log or :unique
# @return [Hash<Object, Float>] Hash with <group, score> pairings; empty when no word matched any group
# @raise [Groupie::Error] Raised while classifying a word when the strategy is not recognised
def classify_text(words, strategy = :sum)
  words &= unique_words if strategy == :unique
  group_score_sums, hits = calculate_group_scores(words, strategy)
  # No hits means no sums were collected; return early so the division below
  # can never see a zero divisor.
  return {} unless hits.positive?

  group_score_sums.transform_values { |sum| sum / hits }
end
53
+
54
# Classify a single word against all groups.
#
# Each group's raw count for the word is turned into a weight by the chosen
# strategy; a group's score is its share of the total weight.
#
# @param [String] entry A word to be classified
# @param [Symbol] strategy One of :sum, :sqrt, :log or :unique
# @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0.
#   Empty when the total weight is zero (no groups, or no group knows the word).
# @raise [Groupie::Error] Raise when an invalid strategy is provided
def classify(entry, strategy = :sum)
  # Compute every group's weight once and reuse it for both the total and the shares.
  weights = @groups.transform_values do |group|
    apply_count_strategy(group.count(entry), strategy)
  end
  total_weight = weights.values.inject(0) { |sum, weight| sum + weight }
  return {} if total_weight.zero?

  weights.transform_values do |weight|
    weight.positive? ? weight.to_f / total_weight : 0.0
  end
end
29
74
 
30
- # Classify a text by taking the average of all word classifications.
31
- def classify_text(words)
32
- group_score_sums = words.inject({}) do |results, word|
33
- word_results = classify(word)
34
- results.merge(word_results) do |key, old, new|
35
- old + new
36
- end
75
# Return the known words, minus the most frequently seen ones.
# Why do this? So the most common (and thus least meaningful) words are ignored
# and less common words gain more predictive power.
#
# Counts are added up across all groups. The cutoff is the count found at roughly
# the 75% position of the sorted counts; every word counted more often than that
# is dropped.
#
# This is used by the :unique strategy of the classifier.
#
# @return [Array<String>] words whose combined count does not exceed the cutoff
def unique_words
  # Merge every group's <word, count> dictionary into one, adding up shared words.
  totals = @groups.each_value.with_object({}) do |group, merged|
    merged.merge!(group.word_counts) { |_word, left, right| left + right }
  end
  return [] if totals.empty?

  sorted_counts = totals.values.sort
  # Position of the cutoff count: about three quarters in, but at least index 1.
  # It is capped at the last index, because with a single known word index 1 does
  # not exist and comparing against nil would raise.
  cutoff_index = [[totals.size * 3 / 4 - 1, 1].max, sorted_counts.size - 1].min
  cutoff = sorted_counts[cutoff_index]

  totals.select { |_word, count| count <= cutoff }.keys
end
38
94
 
39
- words_count = words.size.to_f
40
- averages={}
41
- group_score_sums.each do |group, sum|
42
- averages[group] = sum / words_count
95
+ private
96
+
97
# Add up the per-group scores of every word that produces a classification.
#
# Words for which #classify returns an empty result are skipped and do not count
# as a hit.
#
# @param [Array<String>] words
# @param [Symbol] strategy
# @return [Array(Hash<Object, Float>, Integer)] the summed score per group, and
#   the number of words that contributed to those sums
def calculate_group_scores(words, strategy)
  hits = 0
  group_score_sums = words.each_with_object({}) do |word, sums|
    word_scores = classify(word, strategy)
    next if word_scores.empty?

    hits += 1
    sums.merge!(word_scores) { |_group, total, score| total + score }
  end

  [group_score_sums, hits]
end
47
-
48
- def self.version
49
- File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
114
+
115
# Helper function to reduce a raw word count to a strategy-modified weight.
#
# * :sum           - the count itself
# * :sqrt, :unique - the square root of the count
# * :log           - the base-10 logarithm of the count; a count that is not
#                    positive is returned unchanged, since its logarithm is undefined
#
# NOTE: log10(1) is 0, so under :log a count of 1 yields a weight of 0.
#
# @param [Integer] count
# @param [Symbol] strategy
# @return [Integer, Float]
# @raise [Groupie::Error] Raise when an invalid strategy is provided
def apply_count_strategy(count, strategy)
  case strategy
  when :sum then count
  when :sqrt, :unique then Math.sqrt(count)
  when :log then count.positive? ? Math.log10(count) : count
  else raise Error, "Invalid strategy: #{strategy}"
  end
end
51
- end
133
+ end
metadata CHANGED
@@ -1,97 +1,68 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: groupie
3
- version: !ruby/object:Gem::Version
4
- hash: 27
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 1
9
- - 0
10
- version: 0.1.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Wes Oldenbeuving
14
- autorequire:
8
+ autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2010-07-25 00:00:00 +02:00
19
- default_executable:
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
22
- name: testy
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ">="
28
- - !ruby/object:Gem::Version
29
- hash: 3
30
- segments:
31
- - 0
32
- version: "0"
33
- type: :development
34
- version_requirements: *id001
35
- description: Group and classify text based on likelyhood of being included in a text of a specific category
36
- email: narnach@gmail.com
11
+ date: 2021-09-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Groupie is a simple way to group texts and classify new texts as being
14
+ a likely member of one of the defined groups. Think of bayesian spam filters.
15
+ email:
16
+ - wes@narnach.com
37
17
  executables: []
38
-
39
18
  extensions: []
40
-
41
- extra_rdoc_files:
42
- - LICENSE
43
- files:
44
- - .document
45
- - LICENSE
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".github/dependabot.yml"
22
+ - ".github/workflows/gem.yml"
23
+ - ".github/workflows/rspec.yml"
24
+ - ".github/workflows/rubocop.yml"
25
+ - ".gitignore"
26
+ - ".rspec"
27
+ - ".rubocop.yml"
28
+ - CHANGELOG.md
29
+ - Gemfile
30
+ - Gemfile.lock
31
+ - LICENSE.txt
32
+ - README.md
46
33
  - Rakefile
47
- - VERSION
34
+ - bin/console
35
+ - bin/rubocop
36
+ - bin/setup
48
37
  - groupie.gemspec
49
38
  - lib/groupie.rb
50
39
  - lib/groupie/core_ext/string.rb
51
40
  - lib/groupie/group.rb
52
- - readme.rdoc
53
- - test/fixtures/ham/spam.la-44116217.txt
54
- - test/fixtures/spam/spam.la-44118014.txt
55
- - test/groupie/core_ext/string_test.rb
56
- - test/groupie/group_test.rb
57
- - test/groupie_test.rb
58
- - test/test_helper.rb
59
- has_rdoc: true
60
- homepage: http://github.com/Narnach/groupie
61
- licenses: []
62
-
63
- post_install_message:
64
- rdoc_options:
65
- - --charset=UTF-8
66
- require_paths:
41
+ - lib/groupie/version.rb
42
+ homepage: https://github.com/Narnach/groupie
43
+ licenses:
44
+ - MIT
45
+ metadata:
46
+ homepage_uri: https://github.com/Narnach/groupie
47
+ source_code_uri: https://github.com/Narnach/groupie
48
+ changelog_uri: https://github.com/Narnach/groupie/blob/stable/Changelog.md
49
+ post_install_message:
50
+ rdoc_options: []
51
+ require_paths:
67
52
  - lib
68
- required_ruby_version: !ruby/object:Gem::Requirement
69
- none: false
70
- requirements:
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
71
55
  - - ">="
72
- - !ruby/object:Gem::Version
73
- hash: 3
74
- segments:
75
- - 0
76
- version: "0"
77
- required_rubygems_version: !ruby/object:Gem::Requirement
78
- none: false
79
- requirements:
56
+ - !ruby/object:Gem::Version
57
+ version: 2.6.0
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
80
60
  - - ">="
81
- - !ruby/object:Gem::Version
82
- hash: 3
83
- segments:
84
- - 0
85
- version: "0"
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
86
63
  requirements: []
87
-
88
- rubyforge_project:
89
- rubygems_version: 1.3.7
90
- signing_key:
91
- specification_version: 3
92
- summary: Group and classify text
93
- test_files:
94
- - test/groupie/core_ext/string_test.rb
95
- - test/groupie/group_test.rb
96
- - test/groupie_test.rb
97
- - test/test_helper.rb
64
+ rubygems_version: 3.2.24
65
+ signing_key:
66
+ specification_version: 4
67
+ summary: Library to help you group texts and classify new ones
68
+ test_files: []