words_counted 0.1.5 → 1.0.3

data/lib/words_counted/counter.rb CHANGED
@@ -1,96 +1,137 @@
  # -*- encoding : utf-8 -*-
  module WordsCounted
-   class Counter
-     attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-     WORD_REGEXP = /[\p{Alpha}\-']+/
-
-     def self.from_file(path, options = {})
-       File.open(path) do |file|
-         new file.read, options
-       end
-     end
-
-     def initialize(string, options = {})
-       @options = options
-       exclude = filter_proc(options[:exclude])
-       @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-       @char_count = words.join.size
-       @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-       @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-     end
-
-     def word_count
-       words.size
-     end
-
-     def unique_word_count
-       words.uniq.size
-     end
+   using Refinements::HashRefinements

-     def average_chars_per_word(precision = 2)
-       (char_count / word_count.to_f).round(precision)
-     end
-
-     def most_occurring_words
-       highest_ranking word_occurrences
+   class Counter
+     # This class contains several methods to extract useful statistics
+     # from any array of tokens, such as density, frequency, and more.
+     #
+     # @example
+     #   WordsCounted::Counter.new(["hello", "world"]).token_count
+     #   # => 2
+
+     include Deprecated
+
+     # @return [Array<String>] an array of tokens.
+     attr_reader :tokens
+
+     # Initialises state with an array of tokens.
+     #
+     # @param [Array] tokens An array of tokens to perform operations on
+     def initialize(tokens)
+       @tokens = tokens
      end

-     def longest_words
-       highest_ranking word_lengths
+     # Returns the number of tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_count
+     #   # => 6
+     #
+     # @return [Integer] The number of tokens
+     def token_count
+       tokens.size
      end

-     def word_density(precision = 2)
-       word_densities = word_occurrences.each_with_object({}) do |(word, occ), hash|
-         hash[word] = (occ / word_count.to_f * 100).round(precision)
-       end
-       sort_by_descending_value word_densities
+     # Returns the number of unique tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).uniq_token_count
+     #   # => 3
+     #
+     # @return [Integer] The number of unique tokens
+     def uniq_token_count
+       tokens.uniq.size
      end

-     def sorted_word_occurrences
-       sort_by_descending_value word_occurrences
+     # Returns the character count of all tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two]).char_count
+     #   # => 6
+     #
+     # @return [Integer] The total char count of tokens
+     def char_count
+       tokens.join.size
      end

-     def sorted_word_lengths
-       sort_by_descending_value word_lengths
+     # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+     # The array is sorted by frequency in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_frequency
+     #   # => [ ['three', 3], ['two', 2], ['one', 1] ]
+     #
+     # @return [Array<Array<String, Integer>>] An array of tokens and their frequencies
+     def token_frequency
+       tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
      end

-     def count(match)
-       words.select { |word| word == match.downcase }.size
+     # Returns a sorted two-dimensional array where each member array is a token and its length.
+     # The array is sorted by length in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two three four five]).token_lengths
+     #   # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+     #
+     # @return [Array<Array<String, Integer>>] An array of tokens and their lengths
+     def token_lengths
+       tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
      end

-     private
-
-     def highest_ranking(entries)
-       entries.group_by { |_, value| value }.sort.last.last
+     # Returns a sorted two-dimensional array where each member array is a token and its density
+     # as a float, rounded to a precision of two decimal places. It accepts a `precision`
+     # argument which defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[Maj. Major Major Major]).token_density
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @example With `precision`
+     #   Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @param [Integer] precision The number of decimal places to round density to
+     # @return [Array<Array<String, Float>>] An array of tokens and their densities
+     def token_density(precision: 2)
+       token_frequency.each_with_object({}) { |(token, freq), hash|
+         hash[token] = (freq / token_count.to_f).round(precision)
+       }.sort_by_value_desc
      end

-     def sort_by_descending_value(entries)
-       entries.sort_by { |_, value| value }.reverse
+     # Returns a hash of tokens and their frequencies for the tokens with the highest frequency.
+     #
+     # @example
+     #   Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+     #   # => { 'two' => 2, 'twice' => 2 }
+     #
+     # @return [Hash{String => Integer}] A hash of tokens and their frequencies
+     def most_frequent_tokens
+       token_frequency.group_by(&:last).max.last.to_h
      end

-     def regexp
-       @options[:regexp] || WORD_REGEXP
+     # Returns a hash of tokens and their lengths for the tokens with the highest length.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).longest_tokens
+     #   # => { 'three' => 5, 'seven' => 5 }
+     #
+     # @return [Hash{String => Integer}] A hash of tokens and their lengths
+     def longest_tokens
+       token_lengths.group_by(&:last).max.last.to_h
      end

-     def filter_proc(filter)
-       if filter.respond_to?(:to_a)
-         filter_procs = Array(filter).map(&method(:filter_proc))
-         ->(word) {
-           filter_procs.any? { |p| p.call(word) }
-         }
-       elsif filter.respond_to?(:to_str)
-         exclusion_list = filter.split.collect(&:downcase)
-         ->(word) {
-           exclusion_list.include?(word)
-         }
-       elsif regexp_filter = Regexp.try_convert(filter)
-         Proc.new { |word| word =~ regexp_filter }
-       elsif filter.respond_to?(:to_proc)
-         filter.to_proc
-       else
-         raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-       end
+     # Returns the average char count per token rounded to a precision of two decimal places.
+     # Accepts a `precision` argument which defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).average_chars_per_token
+     #   # => 4.25
+     #
+     # @param [Integer] precision The number of decimal places to round average char count to
+     # @return [Float] The average char count per token
+     def average_chars_per_token(precision: 2)
+       (char_count / token_count.to_f).round(precision)
      end
    end
  end
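
The new `Counter` is a thin statistics layer over a pre-tokenised array, where the old class also did the tokenising itself. A minimal usage sketch, assuming the 1.0.3 gem is installed; the outputs follow from the documented behaviour above:

    require "words_counted"

    # Counter no longer tokenises input; it expects an array of tokens.
    counter = WordsCounted::Counter.new(%w[one two two three three three])

    counter.token_count          # => 6
    counter.uniq_token_count     # => 3
    counter.token_frequency      # => [["three", 3], ["two", 2], ["one", 1]]
    counter.most_frequent_tokens # => { "three" => 3 }
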
data/lib/words_counted/deprecated.rb ADDED
@@ -0,0 +1,78 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   module Deprecated
+     # The following methods are deprecated and will be removed in version 1.1.0.

+     # @deprecated use `Counter#token_count`
+     def word_count
+       warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+       token_count
+     end

+     # @deprecated use `Counter#uniq_token_count`
+     def unique_word_count
+       warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+       uniq_token_count
+     end

+     # @deprecated use `Counter#token_frequency`
+     def word_occurrences
+       warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for the old behaviour"
+       token_frequency.to_h
+     end

+     # @deprecated use `Counter#token_lengths`
+     def word_lengths
+       warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+       warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for the old behaviour"
+       token_lengths.to_h
+     end

+     # @deprecated use `Counter#token_density`
+     def word_density(precision = 2)
+       warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+       warn "`Counter#token_density` returns density as a decimal, not a percent"

+       token_density(precision: precision + 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+     end

+     # @deprecated use `Counter#token_frequency`
+     def sorted_word_occurrences
+       warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       token_frequency
+     end

+     # @deprecated use `Counter#token_lengths`
+     def sorted_word_lengths
+       warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+       token_lengths
+     end

+     # @deprecated use `Counter#most_frequent_tokens`
+     def most_occurring_words
+       warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+       warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for the old behaviour."
+       most_frequent_tokens.to_a
+     end

+     # @deprecated use `Counter#longest_tokens`
+     def longest_words
+       warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+       warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for the old behaviour."
+       longest_tokens.to_a
+     end

+     # @deprecated use `Counter#average_chars_per_token`
+     def average_chars_per_word(precision = 2)
+       warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+       average_chars_per_token(precision: precision)
+     end

+     # @deprecated use `Array#count`
+     def count(token)
+       warn "`Counter#count` is deprecated, please use `Array#count`"
+       tokens.count(token.downcase)
+     end
+   end
+ end
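
The shims above keep the 0.1.x API callable while nudging callers to the new names: each prints a deprecation notice via `Kernel#warn` (to `$stderr`) and then delegates, converting between the old and new return shapes where they differ. A sketch of the observable behaviour, reusing the counter from the earlier example:

    counter.word_count
    # prints the deprecation warning to $stderr, then => 6

    # The old hash-returning API is rebuilt from the new array of pairs.
    counter.word_occurrences
    # => { "three" => 3, "two" => 2, "one" => 1 }
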
data/lib/words_counted/tokeniser.rb ADDED
@@ -0,0 +1,163 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   class Tokeniser
+     # Takes a string and breaks it into an array of tokens.
+     # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+     #
+     # @example
+     #   tokeniser = WordsCounted::Tokeniser.new(
+     #     "We are all in the gutter, but some of us are looking at the stars."
+     #   )
+     #   tokeniser.tokenise(exclude: "We are all in the gutter")
+     #   # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']

+     # Default tokenisation strategy
+     TOKEN_REGEXP = /[\p{Alpha}\-']+/

+     # Initialises state with the string to be tokenised.
+     #
+     # @param [String] input The string to tokenise
+     def initialize(input)
+       @input = input
+     end

+     # Converts a string into an array of tokens using a regular expression.
+     # If a pattern is not provided, the default `Tokeniser::TOKEN_REGEXP` is used.
+     #
+     # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+     # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+     # This allows for powerful and flexible tokenisation strategies.
+     #
+     # If a symbol is passed, it must name a predicate method.
+     #
+     # @example
+     #   WordsCounted::Tokeniser.new("Hello World").tokenise
+     #   # => ['hello', 'world']
+     #
+     # @example With `pattern`
+     #   WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+     #   # => ['hello', 'mohamad']
+     #
+     # @example With `exclude` as a string
+     #   WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a regexp
+     #   WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+     #   # => ['dani']
+     #
+     # @example With `exclude` as a lambda
+     #   WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(
+     #     exclude: ->(token) { token.length > 6 }
+     #   )
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a symbol
+     #   WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+     #   # => ['محمد']
+     #
+     # @example With `exclude` as an array of strings
+     #   WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(
+     #     exclude: ["goodbye hello"]
+     #   )
+     #   # => ['sami', 'and', 'dani']
+     #
+     # @example With `exclude` as an array of regular expressions
+     #   WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(
+     #     exclude: [/goodbye/i, /and/i]
+     #   )
+     #   # => ['hello', 'dani']
+     #
+     # @example With `exclude` as an array of lambdas
+     #   t = WordsCounted::Tokeniser.new("Special Agent 007")
+     #   t.tokenise(
+     #     exclude: [
+     #       ->(t) { t.to_i.odd? },
+     #       ->(t) { t.length > 5 }
+     #     ]
+     #   )
+     #   # => ['agent']
+     #
+     # @example With `exclude` as a mixed array
+     #   t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+     #   t.tokenise(
+     #     exclude: [
+     #       :ascii_only?,
+     #       /محمد/,
+     #       ->(t) { t.length > 6 },
+     #       "و"
+     #     ]
+     #   )
+     #   # => ["هي", "سامي", "وداني"]
+     #
+     # @param [Regexp] pattern The regular expression used to extract tokens from the input
+     # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply
+     # @return [Array<String>] The array of filtered tokens
+     def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+       filter_proc = filter_to_proc(exclude)
+       @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+     end

+     private

+     # The following methods convert the `exclude` argument into a callable object. The return
+     # value of that callable is then used to determine whether a token should be excluded from
+     # the final list.
+     #
+     # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+     # of any combination of those types.
+     #
+     # If `filter` is a string, it converts the string into an array, and returns a lambda
+     # that returns true if the token is included in the resulting array.
+     #
+     # @see {Tokeniser#filter_proc_from_string}.
+     #
+     # If `filter` is an array, it creates a new array where each element of the original is
+     # converted to a lambda, and returns a lambda that calls each lambda in the resulting array.
+     # If any lambda returns true the token is excluded from the final list.
+     #
+     # @see {Tokeniser#filter_procs_from_array}.
+     #
+     # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a lambda
+     # is returned that checks the token for a match.
+     #
+     # If a symbol is passed, it is converted to a proc. The symbol must name a predicate method.
+     #
+     # This method depends on `nil` responding to `to_a` with an empty array, which
+     # avoids having to check whether `exclude` was passed.

+     # @api private
+     def filter_to_proc(filter)
+       if filter.respond_to?(:to_a)
+         filter_procs_from_array(filter)
+       elsif filter.respond_to?(:to_str)
+         filter_proc_from_string(filter)
+       elsif regexp_filter = Regexp.try_convert(filter)
+         ->(token) {
+           token =~ regexp_filter
+         }
+       elsif filter.respond_to?(:to_proc)
+         filter.to_proc
+       else
+         raise ArgumentError,
+           "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+       end
+     end

+     # @api private
+     def filter_procs_from_array(filter)
+       filter_procs = Array(filter).map(&method(:filter_to_proc))
+       ->(token) {
+         filter_procs.any? { |pro| pro.call(token) }
+       }
+     end

+     # @api private
+     def filter_proc_from_string(filter)
+       normalized_exclusion_list = filter.split.map(&:downcase)
+       ->(token) {
+         normalized_exclusion_list.include?(token)
+       }
+     end
+   end
+ end
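
The tokeniser is now usable on its own, independent of `Counter`. A short sketch combining the default pattern with a string `exclude`, which the private helpers above split into a lowercased stop list:

    tokeniser = WordsCounted::Tokeniser.new("Goodbye cruel world, hello Dani!")

    # The default pattern keeps alphabetic characters, hyphens, and apostrophes.
    tokeniser.tokenise
    # => ["goodbye", "cruel", "world", "hello", "dani"]

    tokeniser.tokenise(exclude: "goodbye hello")
    # => ["cruel", "world", "dani"]
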
data/lib/words_counted/version.rb CHANGED
@@ -1,4 +1,4 @@
  # -*- encoding : utf-8 -*-
  module WordsCounted
-   VERSION = "0.1.5"
+   VERSION = "1.0.3"
  end
data/lib/words_counted.rb CHANGED
@@ -1,6 +1,11 @@
  # -*- encoding : utf-8 -*-
- require "words_counted/version"
+ require "refinements/hash_refinements"
+
+ require "words_counted/deprecated"
+
+ require "words_counted/tokeniser"
  require "words_counted/counter"
+ require "words_counted/version"

  begin
    require "pry"
@@ -8,11 +13,33 @@ rescue LoadError
  end

  module WordsCounted
-   def self.count(string, options = {})
-     Counter.new(string, options)
+   # Takes a string, tokenises it, and returns an instance of Counter
+   # with the resulting tokens.
+   #
+   # @see Tokeniser#tokenise
+   # @see Counter#initialize
+   #
+   # @param [String] input The input to be tokenised
+   # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+   # @return [WordsCounted::Counter] An instance of Counter
+   def self.count(input, options = {})
+     tokens = Tokeniser.new(input).tokenise(**options)
+     Counter.new(tokens)
    end

+   # Takes a file path, reads the file and tokenises its contents,
+   # and returns an instance of Counter with the resulting tokens.
+   #
+   # @see Tokeniser#tokenise
+   # @see Counter#initialize
+   #
+   # @param [String] path The file to be read and tokenised
+   # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+   # @return [WordsCounted::Counter] An instance of Counter
    def self.from_file(path, options = {})
-     Counter.from_file(path, options)
+     tokens = File.open(path) do |file|
+       Tokeniser.new(file.read).tokenise(**options)
+     end
+     Counter.new(tokens)
    end
  end
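
With this release the top-level helpers are plain glue: tokenise first, then count. A final sketch of the public entry points; the file path is purely illustrative:

    require "words_counted"

    counter = WordsCounted.count("We are all in the gutter", exclude: "the")
    counter.tokens # => ["we", "are", "all", "in", "gutter"]

    # Options are forwarded to Tokeniser#tokenise; "speech.txt" is a hypothetical file.
    counter = WordsCounted.from_file("speech.txt", exclude: "the")
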