RubyGems - words_counted - Versions diffs - 0.1.5 → 1.0.3 - Mend

words_counted 0.1.5 → 1.0.3

Files changed (21) hide show

checksums.yaml +5 -5
data/.gitignore +1 -0
data/.hound.yml +2 -0
data/.ruby-style.yml +2 -0
data/.ruby-version +1 -0
data/.travis.yml +9 -0
data/.yardopts +3 -2
data/CHANGELOG.md +29 -0
data/README.md +146 -189
data/lib/refinements/hash_refinements.rb +14 -0
data/lib/words_counted/counter.rb +113 -72
data/lib/words_counted/deprecated.rb +78 -0
data/lib/words_counted/tokeniser.rb +163 -0
data/lib/words_counted/version.rb +1 -1
data/lib/words_counted.rb +31 -4
data/spec/words_counted/counter_spec.rb +49 -204
data/spec/words_counted/deprecated_spec.rb +99 -0
data/spec/words_counted/tokeniser_spec.rb +133 -0
data/spec/words_counted_spec.rb +34 -0
data/words_counted.gemspec +2 -2
metadata +25 -12

data/spec/words_counted/counter_spec.rb CHANGED Viewed

@@ -3,240 +3,85 @@ require_relative "../spec_helper"
 module WordsCounted
   describe Counter do
-    let(:counter) { Counter.new("We are all in the gutter, but some of us are looking at the stars.") }
-    describe "initialize" do
-      it "sets @options" do
-        expect(counter.instance_variables).to include(:@options)
-      end
-      it "sets @char_count" do
-        expect(counter.instance_variables).to include(:@char_count)
-      end
-      it "sets @words" do
-        expect(counter.instance_variables).to include(:@words)
-      end
-      it "sets @word_occurrences" do
-        expect(counter.instance_variables).to include(:@word_occurrences)
-      end
-      it "sets @word_lengths" do
-        expect(counter.instance_variables).to include(:@word_lengths)
-      end
+    let(:counter) do
+      tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+      Counter.new(tokens)
     end
-    describe "words" do
-      it "returns an array" do
-        expect(counter.words).to be_a(Array)
-      end
-      it "splits words" do
-        expect(counter.words).to eq(%w[we are all in the gutter but some of us are looking at the stars])
-      end
-      it "removes special characters" do
-        counter = Counter.new("Hello! # $ % 12345 * & % How do you do?")
-        expect(counter.words).to eq(%w[hello how do you do])
-      end
-      it "counts hyphenated words as one" do
-        counter = Counter.new("I am twenty-two.")
-        expect(counter.words).to eq(%w[i am twenty-two])
-      end
-      it "does not split words on apostrophe" do
-        counter = Counter.new("Bust 'em! Them be Jim's bastards'.")
-        expect(counter.words).to eq(%w[bust 'em them be jim's bastards'])
-      end
-      it "does not split on unicode chars" do
-        counter = Counter.new("São Paulo")
-        expect(counter.words).to eq(%w[são paulo])
-      end
-      it "it accepts a string filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-      it "it accepts a string filter with multiple words" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
-        expect(counter.words).to eq(%w[that trevor])
-      end
-      it "filters words in uppercase when using a string filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-      it "accepts a regexp filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-      it "accepts an array filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
-        expect(counter.words).to eq(%w[magnificent trevor])
-      end
-      it "accepts a lambda filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) { w == 'that' })
-        expect(counter.words).to eq(%w[was magnificent trevor])
-      end
-      it "accepts a custom regexp" do
-        counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
-        expect(counter.words).to eq(["i", "am", "007"])
-      end
-      it "char_count should be calculated after the filter is applied" do
-        counter = Counter.new("I am Legend.", exclude: "I am")
-        expect(counter.char_count).to eq(6)
-      end
-    end
-    describe "word_count" do
-      it "returns the correct word count" do
-        expect(counter.word_count).to eq(15)
+    describe "initialize" do
+      it "sets @tokens" do
+        expect(counter.instance_variables).to include(:@tokens)
       end
     end
-    describe "word_occurrences" do
-      it "returns a hash" do
-        expect(counter.word_occurrences).to be_a(Hash)
-      end
-      it "treats capitalized words as the same word" do
-        counter = Counter.new("Bad, bad, piggy!")
-        expect(counter.word_occurrences).to eq({ "bad" => 2, "piggy" => 1 })
+    describe "#token_count" do
+      it "returns the correct number of tokens" do
+        expect(counter.token_count).to eq(6)
       end
     end
-    describe "sorted_word_occurrences" do
-      it "returns an array" do
-        expect(counter.sorted_word_occurrences).to be_a(Array)
-      end
-      it "returns a two dimensional array sorted by descending word occurrence" do
-        counter = Counter.new("Blue, green, green, green, orange, green, orange, red, orange, red")
-        expect(counter.sorted_word_occurrences).to eq([ ["green", 4], ["orange", 3], ["red", 2], ["blue", 1] ])
+    describe "#uniq_token_count" do
+      it "returns the number of unique tokens" do
+        expect(counter.uniq_token_count).to eq(3)
       end
     end
-    describe "most_occurring_words" do
-      it "returns an array" do
-        expect(counter.most_occurring_words).to be_a(Array)
-      end
-      it "returns highest occuring words" do
-        counter = Counter.new("Orange orange Apple apple banana")
-        expect(counter.most_occurring_words).to eq([["orange", 2],["apple", 2]])
+    describe "#char_count" do
+      it "returns the correct number of chars" do
+        expect(counter.char_count).to eq(26)
       end
     end
-    describe 'word_lengths' do
-      it "returns a hash" do
-        expect(counter.word_lengths).to be_a(Hash)
-      end
-      it "returns a hash of word lengths" do
-        counter = Counter.new("One two three.")
-        expect(counter.word_lengths).to eq({ "one" => 3, "two" => 3, "three" => 5 })
+    describe "#token_frequency" do
+      it "returns a two-dimensional array where each member array is a token and its frequency in descending order" do
+        expected = [
+          ['three', 3], ['woot', 2], ['one', 1]
+        ]
+        expect(counter.token_frequency).to eq(expected)
       end
     end
-    describe "sorted_word_lengths" do
-      it "returns an array" do
-        expect(counter.sorted_word_lengths).to be_a(Array)
-      end
-      it "returns a two dimensional array sorted by descending word length" do
-        counter = Counter.new("I am not certain of that")
-        expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["i", 1] ])
+    describe "#token_lengths" do
+      it "returns a two-dimensional array where each member array is a token and its length in descending order" do
+        expected = [
+          ['three', 5], ['woot', 4], ['one', 3]
+        ]
+        expect(counter.token_lengths).to eq(expected)
       end
     end
-    describe "longest_words" do
-      it "returns an array" do
-        expect(counter.longest_words).to be_a(Array)
-      end
-      it "returns the longest words" do
-        counter = Counter.new("Those whom the gods love grow young.")
-        expect(counter.longest_words).to eq([["those", 5],["young", 5]])
-      end
-    end
-    describe "word_density" do
-      it "returns an array" do
-        expect(counter.word_density).to be_a(Array)
-      end
-      it "returns words and their density in percent" do
-        counter = Counter.new("His name was Major, major Major Major.")
-        expect(counter.word_density).to eq([["major", 57.14], ["was", 14.29], ["name", 14.29], ["his", 14.29]])
+    describe "#token_density" do
+      it "returns a two-dimensional array where each member array is a token and its density in descending order" do
+        expected = [
+          ['three', 0.5], ['woot', 0.33], ['one', 0.17]
+        ]
+        expect(counter.token_density).to eq(expected)
       end
       it "accepts a precision" do
-        counter = Counter.new("His name was Major, major Major Major.")
-        expect(counter.word_density(4)).to eq([["major", 57.1429], ["was", 14.2857], ["name", 14.2857], ["his", 14.2857]])
+        expected = [
+          ['three', 0.5], ['woot', 0.3333], ['one', 0.1667]
+        ]
+        expect(counter.token_density(precision: 4)).to eq(expected)
       end
     end
-    describe "char_count" do
-      it "returns the number of chars in the passed in string" do
-        counter = Counter.new("His name was Major, major Major Major.")
-        expect(counter.char_count).to eq(30)
-      end
-      it "returns the number of chars in the passed in string after the filter is applied" do
-        counter = Counter.new("His name was Major, major Major Major.", exclude: "Major")
-        expect(counter.char_count).to eq(10)
-      end
-    end
-    describe "average_chars_per_word" do
-      it "returns the average number of chars per word" do
-        counter = Counter.new("His name was major, Major Major Major.")
-        expect(counter.average_chars_per_word).to eq(4.29)
-      end
-      it "returns the average number of chars per word after the filter is applied" do
-        counter = Counter.new("His name was Major, Major Major Major.", exclude: "Major")
-        expect(counter.average_chars_per_word).to eq(3.33)
-      end
-      it "accepts precision" do
-        counter = Counter.new("This line should have 39 characters minus spaces.")
-        expect(counter.average_chars_per_word(4)).to eq(5.5714)
+    describe "#most_frequent_tokens" do
+      it "returns a hash of the tokens with the highest frequency, where each key is a token, and each value is its frequency" do
+        expected = {
+          'three' => 3
+        }
+        expect(counter.most_frequent_tokens).to eq(expected)
       end
     end
-    describe "unique_word_count" do
-      it "returns the number of unique words" do
-        expect(counter.unique_word_count).to eq(13)
-      end
-      it "is case insensitive" do
-        counter = Counter.new("Up down. Down up.")
-        expect(counter.unique_word_count).to eq(2)
+    describe "#longest_tokens" do
+      it "returns a hash of the tokens with the highest length, where each key is a token, and each value is its length" do
+        expected = {
+          'three' => 5
+        }
+        expect(counter.longest_tokens).to eq(expected)
       end
     end
   end
-  describe "count" do
-    it "returns count for a single word" do
-      counter = Counter.new("I am so clever that sometimes I don't understand a single word of what I am saying.")
-      expect(counter.count("i")).to eq(3)
-    end
-  end
-  describe "from_file" do
-    it "opens and reads a text file" do
-      counter = WordsCounted.from_file('spec/support/the_hart_and_the_hunter.txt')
-      expect(counter.word_count).to eq(139)
-    end
-  end
 end

data/spec/words_counted/deprecated_spec.rb ADDED Viewed

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+require_relative "../spec_helper"
+module WordsCounted
+  warn "Methods being tested are deprecated"
+  describe Counter do
+    let(:counter) do
+      tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+      Counter.new(tokens)
+    end
+    describe "#word_density" do
+      it "returns words and their density in percent" do
+        expected = [
+          ['three', 50.0], ['woot', 33.33], ['one', 16.67]
+        ]
+        expect(counter.word_density).to eq(expected)
+      end
+      it "accepts a precision" do
+        expected = [
+          ['three', 50.0], ['woot', 33.3333], ['one', 16.6667]
+        ]
+        expect(counter.word_density(4)).to eq(expected)
+      end
+    end
+    describe "#word_occurrences" do
+      it "returns a hash of words and their number of occurrences" do
+        expected = {
+          'three' => 3, 'woot' => 2, 'one' => 1
+        }
+        expect(counter.word_occurrences).to eq(expected)
+      end
+    end
+    describe "#sorted_word_occurrences" do
+      it "returns a two dimensional array sorted by descending word occurrence" do
+        expected = [
+          ['three', 3], ['woot', 2], ['one', 1]
+        ]
+        expect(counter.sorted_word_occurrences).to eq(expected)
+      end
+    end
+    describe "#word_lengths" do
+      it "returns a hash of words and their length sorted descending by length" do
+        expected = {
+          'three' => 5, 'woot' => 4, 'one' => 3
+        }
+        expect(counter.word_lengths).to eq(expected)
+      end
+    end
+    describe "#sorted_word_lengths" do
+      it "returns a two dimensional array sorted by descending word length" do
+        expected = [
+          ['three', 5], ['woot', 4], ['one', 3]
+        ]
+        expect(counter.sorted_word_lengths).to eq(expected)
+      end
+    end
+    describe "#longest_words" do
+      it "returns a two-dimensional array of the longest words and their lengths" do
+        expected = [
+          ['three', 5]
+        ]
+        expect(counter.longest_words).to eq(expected)
+      end
+    end
+    describe "#most_occurring_words" do
+      it "returns a two-dimensional array of words with the highest frequency and their frequencies" do
+        expected = [
+          ['three', 3]
+        ]
+        expect(counter.most_occurring_words).to eq(expected)
+      end
+    end
+    describe "#average_chars_per_word" do
+      it "returns the average number of chars per word" do
+        expect(counter.average_chars_per_word).to eq(4.33)
+      end
+      it "accepts precision" do
+        expect(counter.average_chars_per_word(4)).to eq(4.3333)
+      end
+    end
+    describe "#count" do
+      it "returns count for a single word" do
+        expect(counter.count('one')).to eq(1)
+      end
+    end
+  end
+end

data/spec/words_counted/tokeniser_spec.rb ADDED Viewed

@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+require_relative "../spec_helper"
+module WordsCounted
+  describe Tokeniser do
+    describe "initialize" do
+      it "sets @input" do
+        tokeniser = Tokeniser.new("Hello World!")
+        expect(tokeniser.instance_variables).to include(:@input)
+      end
+    end
+    describe "#tokenise" do
+      it "normalises tokens and returns an array" do
+        tokens = Tokeniser.new("Hello HELLO").tokenise
+        expect(tokens).to eq(%w[hello hello])
+      end
+      context "without arguments" do
+        it "removes non-alphabetic chars" do
+          tokens = Tokeniser.new("Hello world! # $ % 12345 * & % ?").tokenise
+          expect(tokens).to eq(%w[hello world])
+        end
+        it "does not split on hyphens" do
+          tokens = Tokeniser.new("I am twenty-two.").tokenise
+          expect(tokens).to eq(%w[i am twenty-two])
+        end
+        it "does not split on apostrophe" do
+          tokens = Tokeniser.new("Bust 'em! It's Jim's gang.").tokenise
+          expect(tokens).to eq(%w[bust 'em it's jim's gang])
+        end
+        it "does not split on unicode chars" do
+          tokens = Tokeniser.new("Bayrūt").tokenise
+          expect(tokens).to eq(%w[bayrūt])
+        end
+      end
+      context "with `pattern` options" do
+        it "accepts a custom pattern" do
+          tokens = Tokeniser.new("We-Are-ALL").tokenise(pattern: /[^-]+/)
+          expect(tokens).to eq(%w[we are all])
+        end
+      end
+      context "with `exclude` option" do
+        context "as a string" do
+          let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+          it "accepts a string filter" do
+            tokens = tokeniser.tokenise(exclude: "magnificent")
+            expect(tokens).to eq(%w[that was trevor])
+          end
+          it "accepts a string filter with multiple space-delimited tokens" do
+            tokens = tokeniser.tokenise(exclude: "was magnificent")
+            expect(tokens).to eq(%w[that trevor])
+          end
+          it "normalises string filter" do
+            tokens = tokeniser.tokenise(exclude: "MAGNIFICENT")
+            expect(tokens).to eq(%w[that was trevor])
+          end
+        end
+        context "as a regular expression" do
+          it "filters on match" do
+            tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+            tokens = tokeniser.tokenise(exclude: /magnificent/i)
+            expect(tokens).to eq(%w[that was trevor])
+          end
+        end
+        context "as a lambda" do
+          it "calls lambda" do
+            tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+            tokens = tokeniser.tokenise(exclude: ->(token) { token.length < 5 })
+            expect(tokens).to eq(%w[magnificent trevor])
+          end
+          it "accepts a symbol for shorthand notation" do
+            tokeniser = Tokeniser.new("That was magnificent, محمد.}")
+            tokens = tokeniser.tokenise(exclude: :ascii_only?)
+            expect(tokens).to eq(%w[محمد])
+          end
+        end
+        context "as an array" do
+          let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+          it "accepts an array of strings" do
+            tokens = tokeniser.tokenise(exclude: ["magnificent"])
+            expect(tokens).to eq(%w[that was trevor])
+          end
+          it "accepts an array of regular expressions" do
+            tokens = tokeniser.tokenise(exclude: [/that/, /was/])
+            expect(tokens).to eq(%w[magnificent trevor])
+          end
+          it "accepts an array of lambdas" do
+            filters = [
+              ->(token) { token.length < 4 },
+              ->(token) { token.length > 6 }
+            ]
+            tokens = tokeniser.tokenise(exclude: filters)
+            expect(tokens).to eq(%w[that trevor])
+          end
+          it "accepts a mixed array" do
+            filters = [
+              "that",
+              ->(token) { token.length < 4 },
+              /magnificent/
+            ]
+            tokens = tokeniser.tokenise(exclude: filters)
+            expect(tokens).to eq(["trevor"])
+          end
+        end
+        context "with an invalid filter" do
+          it "raises an `ArgumentError`" do
+            expect {
+              Tokeniser.new("Hello world!").tokenise(exclude: 1)
+            }.to raise_error(ArgumentError)
+          end
+        end
+      end
+    end
+  end
+end

data/spec/words_counted_spec.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+require_relative "spec_helper"
+describe WordsCounted do
+  describe ".from_file" do
+    let(:file_path) { "spec/support/the_hart_and_the_hunter.txt" }
+    it "opens and reads a text file" do
+      counter = WordsCounted.from_file(file_path)
+      expect(counter.token_count).to eq(139)
+    end
+    it "opens and reads a text file with options" do
+      counter = WordsCounted.from_file(file_path, exclude: "hunter")
+      expect(counter.token_count).to eq(135)
+    end
+  end
+  describe ".count" do
+    let(:string) do
+      "We are all in the gutter, but some of us are looking at the stars."
+    end
+    it "returns a counter instance with given input as tokens" do
+      counter = WordsCounted.count(string)
+      expect(counter.token_count).to eq(15)
+    end
+    it "returns a counter instance with given input and options" do
+      counter = WordsCounted.count(string, exclude: "the gutter")
+      expect(counter.token_count).to eq(12)
+    end
+  end
+end

data/words_counted.gemspec CHANGED Viewed

@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
   spec.version       = WordsCounted::VERSION
   spec.authors       = ["Mohamad El-Husseini"]
   spec.email         = ["husseini.mel@gmail.com"]
-  spec.description   = %q{A Ruby word counter and string analyser with helpful utility methods.}
+  spec.description   = %q{A Ruby natural language processor to extract stats from text, such as word count and more.}
   spec.summary       = %q{See README.}
   spec.homepage      = "https://github.com/abitdodgy/words_counted"
   spec.license       = "MIT"
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
-  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "bundler"
   spec.add_development_dependency "rake"
   spec.add_development_dependency "rspec"
   spec.add_development_dependency "pry"

metadata CHANGED Viewed

@@ -1,29 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: words_counted
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 1.0.3
 platform: ruby
 authors:
 - Mohamad El-Husseini
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-12-02 00:00:00.000000000 Z
+date: 2021-10-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -66,7 +66,8 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description: A Ruby word counter and string analyser with helpful utility methods.
+description: A Ruby natural language processor to extract stats from text, such as
+  word count and more.
 email:
 - husseini.mel@gmail.com
 executables: []
@@ -74,25 +75,35 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".hound.yml"
 - ".rspec"
+- ".ruby-style.yml"
+- ".ruby-version"
+- ".travis.yml"
 - ".yardopts"
 - CHANGELOG.md
 - Gemfile
 - LICENSE.txt
 - README.md
 - Rakefile
+- lib/refinements/hash_refinements.rb
 - lib/words_counted.rb
 - lib/words_counted/counter.rb
+- lib/words_counted/deprecated.rb
+- lib/words_counted/tokeniser.rb
 - lib/words_counted/version.rb
 - spec/spec_helper.rb
 - spec/support/the_hart_and_the_hunter.txt
 - spec/words_counted/counter_spec.rb
+- spec/words_counted/deprecated_spec.rb
+- spec/words_counted/tokeniser_spec.rb
+- spec/words_counted_spec.rb
 - words_counted.gemspec
 homepage: https://github.com/abitdodgy/words_counted
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -107,12 +118,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.2.2
-signing_key:
+rubygems_version: 3.2.15
+signing_key:
 specification_version: 4
 summary: See README.
 test_files:
 - spec/spec_helper.rb
 - spec/support/the_hart_and_the_hunter.txt
 - spec/words_counted/counter_spec.rb
+- spec/words_counted/deprecated_spec.rb
+- spec/words_counted/tokeniser_spec.rb
+- spec/words_counted_spec.rb