words_counted 0.1.5 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
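
At a glance, 1.0.3 splits the old `Counter` into a `Tokeniser` (string in, token array out) and a token-based `Counter`, and renames the `word_*` methods to `token_*` equivalents; the old names remain as deprecated shims. A minimal usage sketch based on the method names in the diff below; the input string is illustrative only:

    require "words_counted"

    # 0.1.5-style call sites still work in 1.0.3, but print a deprecation warning
    counter = WordsCounted.count("up and down and around")
    counter.word_count            # => 5 (warns)

    # 1.0.3-style equivalents
    counter.token_count           # => 5
    counter.most_frequent_tokens  # => { "and" => 2 }
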
data/lib/words_counted/counter.rb CHANGED
@@ -1,96 +1,137 @@
 # -*- encoding : utf-8 -*-
 module WordsCounted
-  class Counter
-    attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-    WORD_REGEXP = /[\p{Alpha}\-']+/
-
-    def self.from_file(path, options = {})
-      File.open(path) do |file|
-        new file.read, options
-      end
-    end
-
-    def initialize(string, options = {})
-      @options = options
-      exclude = filter_proc(options[:exclude])
-      @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-      @char_count = words.join.size
-      @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-      @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-    end
-
-    def word_count
-      words.size
-    end
-
-    def unique_word_count
-      words.uniq.size
-    end
+  using Refinements::HashRefinements
 
-    def average_chars_per_word(precision = 2)
-      (char_count / word_count.to_f).round(precision)
-    end
-
-    def most_occurring_words
-      highest_ranking word_occurrences
+  class Counter
+    # This class contains several methods to extract useful statistics
+    # from any array of tokens, such as density, frequency, and more.
+    #
+    # @example
+    #   WordsCounted::Counter.new(["hello", "world"]).token_count
+    #   # => 2
+
+    include Deprecated
+
+    # @return [Array<String>] an array of tokens.
+    attr_reader :tokens
+
+    # Initializes state with an array of tokens.
+    #
+    # @param [Array] tokens An array of tokens to perform operations on
+    def initialize(tokens)
+      @tokens = tokens
     end
 
-    def longest_words
-      highest_ranking word_lengths
+    # Returns the number of tokens.
+    #
+    # @example
+    #   Counter.new(%w[one two two three three three]).token_count
+    #   # => 6
+    #
+    # @return [Integer] The number of tokens
+    def token_count
+      tokens.size
     end
 
-    def word_density(precision = 2)
-      word_densities = word_occurrences.each_with_object({}) do |(word, occ), hash|
-        hash[word] = (occ / word_count.to_f * 100).round(precision)
-      end
-      sort_by_descending_value word_densities
+    # Returns the number of unique tokens.
+    #
+    # @example
+    #   Counter.new(%w[one two two three three three]).uniq_token_count
+    #   # => 3
+    #
+    # @return [Integer] The number of unique tokens
+    def uniq_token_count
+      tokens.uniq.size
     end
 
-    def sorted_word_occurrences
-      sort_by_descending_value word_occurrences
+    # Returns the character count of all tokens.
+    #
+    # @example
+    #   Counter.new(%w[one two]).char_count
+    #   # => 6
+    #
+    # @return [Integer] The total char count of tokens
+    def char_count
+      tokens.join.size
     end
 
-    def sorted_word_lengths
-      sort_by_descending_value word_lengths
+    # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+    # The array is sorted by frequency in descending order.
+    #
+    # @example
+    #   Counter.new(%w[one two two three three three]).token_frequency
+    #   # => [ ['three', 3], ['two', 2], ['one', 1] ]
+    #
+    # @return [Array<Array<String, Integer>>] An array of tokens and their frequencies
+    def token_frequency
+      tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
     end
 
-    def count(match)
-      words.select { |word| word == match.downcase }.size
+    # Returns a sorted two-dimensional array where each member array is a token and its length.
+    # The array is sorted by length in descending order.
+    #
+    # @example
+    #   Counter.new(%w[one two three four five]).token_lengths
+    #   # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+    #
+    # @return [Array<Array<String, Integer>>] An array of tokens and their lengths
+    def token_lengths
+      tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
    end
 
-    private
-
-    def highest_ranking(entries)
-      entries.group_by { |_, value| value }.sort.last.last
+    # Returns a sorted two-dimensional array where each member array is a token and its density
+    # as a float, rounded to a precision of two decimal places. It accepts a precision argument
+    # which defaults to `2`.
+    #
+    # @example
+    #   Counter.new(%w[Maj. Major Major Major]).token_density
+    #   # => [ ['major', .75], ['maj', .25] ]
+    #
+    # @example with `precision`
+    #   Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+    #   # => [ ['major', .7500], ['maj', .2500] ]
+    #
+    # @param [Integer] precision The number of decimal places to round density to
+    # @return [Array<Array<String, Float>>] An array of tokens and their densities
+    def token_density(precision: 2)
+      token_frequency.each_with_object({}) { |(token, freq), hash|
+        hash[token] = (freq / token_count.to_f).round(precision)
+      }.sort_by_value_desc
     end
 
-    def sort_by_descending_value(entries)
-      entries.sort_by { |_, value| value }.reverse
+    # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+    #
+    # @example
+    #   Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+    #   # => { 'two' => 2, 'twice' => 2 }
+    #
+    # @return [Hash{String => Integer}] A hash of tokens and their frequencies
+    def most_frequent_tokens
+      token_frequency.group_by(&:last).max.last.to_h
    end
 
-    def regexp
-      @options[:regexp] || WORD_REGEXP
+    # Returns a hash of tokens and their lengths for tokens with the highest length.
+    #
+    # @example
+    #   Counter.new(%w[one three five seven]).longest_tokens
+    #   # => { 'three' => 5, 'seven' => 5 }
+    #
+    # @return [Hash{String => Integer}] A hash of tokens and their lengths
+    def longest_tokens
+      token_lengths.group_by(&:last).max.last.to_h
    end
 
-    def filter_proc(filter)
-      if filter.respond_to?(:to_a)
-        filter_procs = Array(filter).map(&method(:filter_proc))
-        ->(word) {
-          filter_procs.any? { |p| p.call(word) }
-        }
-      elsif filter.respond_to?(:to_str)
-        exclusion_list = filter.split.collect(&:downcase)
-        ->(word) {
-          exclusion_list.include?(word)
-        }
-      elsif regexp_filter = Regexp.try_convert(filter)
-        Proc.new { |word| word =~ regexp_filter }
-      elsif filter.respond_to?(:to_proc)
-        filter.to_proc
-      else
-        raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-      end
+    # Returns the average char count per token rounded to a precision of two decimal places.
+    # Accepts a `precision` argument.
+    #
+    # @example
+    #   Counter.new(%w[one three five seven]).average_chars_per_token
+    #   # => 4.25
+    #
+    # @param [Integer] precision The number of decimal places to round average char count to
+    # @return [Float] The average char count per token
+    def average_chars_per_token(precision: 2)
+      (char_count / token_count.to_f).round(precision)
    end
  end
 end
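
Taken together, the new `Counter` operates on a plain token array. A short sketch assembled from the `@example` tags above (the array literal is illustrative):

    counter = WordsCounted::Counter.new(%w[one two two three three three])
    counter.token_count           # => 6
    counter.uniq_token_count      # => 3
    counter.token_frequency       # => [["three", 3], ["two", 2], ["one", 1]]
    counter.most_frequent_tokens  # => { "three" => 3 }
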
data/lib/words_counted/deprecated.rb ADDED
@@ -0,0 +1,78 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  module Deprecated
+    # The following methods are deprecated and will be removed in version 1.1.0.
+
+    # @deprecated use `Counter#token_count`
+    def word_count
+      warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+      token_count
+    end
+
+    # @deprecated use `Counter#uniq_token_count`
+    def unique_word_count
+      warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+      uniq_token_count
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def word_occurrences
+      warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour"
+      token_frequency.to_h
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def word_lengths
+      warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+      warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for old behaviour"
+      token_lengths.to_h
+    end
+
+    # @deprecated use `Counter#token_density`
+    def word_density(precision = 2)
+      warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+      warn "`Counter#token_density` returns density as decimal and not percent"
+
+      token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def sorted_word_occurrences
+      warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      token_frequency
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def sorted_word_lengths
+      warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+      token_lengths
+    end
+
+    # @deprecated use `Counter#most_frequent_tokens`
+    def most_occurring_words
+      warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+      warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_h` for old behaviour."
+      most_frequent_tokens.to_a
+    end
+
+    # @deprecated use `Counter#longest_tokens`
+    def longest_words
+      warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+      warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_h` for old behaviour."
+      longest_tokens.to_a
+    end
+
+    # @deprecated use `Counter#average_chars_per_token`
+    def average_chars_per_word(precision = 2)
+      warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+      average_chars_per_token(precision: precision)
+    end
+
+    # @deprecated use `Array#count`
+    def count(token)
+      warn "`Counter#count` is deprecated, please use `Array#count`"
+      tokens.count(token.downcase)
+    end
+  end
+end
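
The deprecated shims keep 0.1.5 call sites working while warning via `Kernel#warn`. A minimal sketch of what a caller sees (the token array is illustrative):

    counter = WordsCounted::Counter.new(%w[one two two])
    counter.word_occurrences
    # prints to $stderr:
    #   `Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`
    #   `Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour
    # => { "two" => 2, "one" => 1 }
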
data/lib/words_counted/tokeniser.rb ADDED
@@ -0,0 +1,163 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  class Tokeniser
+    # Takes a string and breaks it into an array of tokens.
+    # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+    #
+    # @example
+    #   tokeniser
+    #     = WordsCounted::Tokeniser.new(
+    #         "We are all in the gutter, but some of us are looking at the stars."
+    #       )
+    #   tokeniser.tokenise(exclude: "We are all in the gutter")
+    #   # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']
+
+    # Default tokenisation strategy
+    TOKEN_REGEXP = /[\p{Alpha}\-']+/
+
+    # Initialises state with the string to be tokenised.
+    #
+    # @param [String] input The string to tokenise
+    def initialize(input)
+      @input = input
+    end
+
+    # Converts a string into an array of tokens using a regular expression.
+    # If a regexp is not provided a default one is used. See `Tokeniser::TOKEN_REGEXP`.
+    #
+    # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+    # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+    # This allows for powerful and flexible tokenisation strategies.
+    #
+    # If a symbol is passed, it must name a predicate method.
+    #
+    # @example
+    #   WordsCounted::Tokeniser.new("Hello World").tokenise
+    #   # => ['hello', 'world']
+    #
+    # @example With `pattern`
+    #   WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+    #   # => ['hello', 'mohamad']
+    #
+    # @example With `exclude` as a string
+    #   WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+    #   # => ['sami']
+    #
+    # @example With `exclude` as a regexp
+    #   WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+    #   # => ['dani']
+    #
+    # @example With `exclude` as a lambda
+    #   WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(
+    #     exclude: ->(token) { token.length > 6 }
+    #   )
+    #   # => ['sami']
+    #
+    # @example With `exclude` as a symbol
+    #   WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+    #   # => ['محمد']
+    #
+    # @example With `exclude` as an array of strings
+    #   WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(
+    #     exclude: ["goodbye hello"]
+    #   )
+    #   # => ['sami', 'and', 'dani']
+    #
+    # @example With `exclude` as an array of regular expressions
+    #   WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(
+    #     exclude: [/goodbye/i, /and/i]
+    #   )
+    #   # => ['hello', 'dani']
+    #
+    # @example With `exclude` as an array of lambdas
+    #   t = WordsCounted::Tokeniser.new("Special Agent 007")
+    #   t.tokenise(
+    #     exclude: [
+    #       ->(t) { t.to_i.odd? },
+    #       ->(t) { t.length > 5 }
+    #     ]
+    #   )
+    #   # => ['agent']
+    #
+    # @example With `exclude` as a mixed array
+    #   t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+    #   t.tokenise(
+    #     exclude: [
+    #       :ascii_only?,
+    #       /محمد/,
+    #       ->(t) { t.length > 6 },
+    #       "و"
+    #     ]
+    #   )
+    #   # => ["هي", "سامي", "وداني"]
+    #
+    # @param [Regexp] pattern The regular expression used to split the string into tokens
+    # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply
+    # @return [Array] The array of filtered tokens
+    def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+      filter_proc = filter_to_proc(exclude)
+      @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+    end
+
+    private
+
+    # The following methods convert any arguments into a callable object. The return value of this
+    # lambda is then used to determine whether a token should be excluded from the final list.
+    #
+    # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+    # of any combination of those types.
+    #
+    # If `filter` is a string, it converts the string into an array, and returns a lambda
+    # that returns true if the token is included in the resulting array.
+    #
+    # @see {Tokeniser#filter_proc_from_string}.
+    #
+    # If `filter` is an array, it creates a new array where each element of the original is
+    # converted to a lambda, and returns a lambda that calls each lambda in the resulting array.
+    # If any lambda returns true the token is excluded from the final list.
+    #
+    # @see {Tokeniser#filter_procs_from_array}.
+    #
+    # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a `lambda`
+    # is returned that checks the token for a match.
+    #
+    # If a symbol is passed, it is converted to a proc. The symbol must name a predicate method.
+    #
+    # This method depends on `nil` responding to `to_a` with an empty array, which
+    # avoids having to check if `exclude` was passed.

+    # @api private
+    def filter_to_proc(filter)
+      if filter.respond_to?(:to_a)
+        filter_procs_from_array(filter)
+      elsif filter.respond_to?(:to_str)
+        filter_proc_from_string(filter)
+      elsif regexp_filter = Regexp.try_convert(filter)
+        ->(token) {
+          token =~ regexp_filter
+        }
+      elsif filter.respond_to?(:to_proc)
+        filter.to_proc
+      else
+        raise ArgumentError,
+          "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+      end
+    end
+
+    # @api private
+    def filter_procs_from_array(filter)
+      filter_procs = Array(filter).map &method(:filter_to_proc)
+      ->(token) {
+        filter_procs.any? { |pro| pro.call(token) }
+      }
+    end
+
+    # @api private
+    def filter_proc_from_string(filter)
+      normalized_exclusion_list = filter.split.map(&:downcase)
+      ->(token) {
+        normalized_exclusion_list.include?(token)
+      }
+    end
+  end
+end
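
The `Tokeniser` can also be used on its own when only the token list is needed. A brief sketch with illustrative inputs, using the `exclude` option documented above:

    tokeniser = WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani")
    tokeniser.tokenise                                   # => ["goodbye", "sami", "and", "hello", "dani"]
    tokeniser.tokenise(exclude: "goodbye hello")         # => ["sami", "and", "dani"]
    tokeniser.tokenise(exclude: ->(t) { t.length > 4 })  # => ["sami", "and", "dani"]
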
data/lib/words_counted/version.rb CHANGED
@@ -1,4 +1,4 @@
 # -*- encoding : utf-8 -*-
 module WordsCounted
-  VERSION = "0.1.5"
+  VERSION = "1.0.3"
 end
data/lib/words_counted.rb CHANGED
@@ -1,6 +1,11 @@
 # -*- encoding : utf-8 -*-
-require "words_counted/version"
+require "refinements/hash_refinements"
+
+require "words_counted/deprecated"
+
+require "words_counted/tokeniser"
 require "words_counted/counter"
+require "words_counted/version"
 
 begin
   require "pry"
@@ -8,11 +13,33 @@ rescue LoadError
 end
 
 module WordsCounted
-  def self.count(string, options = {})
-    Counter.new(string, options)
+  # Takes a string, tokenises it, and returns an instance of Counter
+  # with the resulting tokens.
+  #
+  # @see Tokeniser.tokenise
+  # @see Counter.initialize
+  #
+  # @param [String] input The input to be tokenised
+  # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+  # @return [WordsCounted::Counter] An instance of Counter
+  def self.count(input, options = {})
+    tokens = Tokeniser.new(input).tokenise(**options)
+    Counter.new(tokens)
  end
 
+  # Takes a file path, reads the file and tokenises its contents,
+  # and returns an instance of Counter with the resulting tokens.
+  #
+  # @see Tokeniser.tokenise
+  # @see Counter.initialize
+  #
+  # @param [String] path The file to be read and tokenised
+  # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+  # @return [WordsCounted::Counter] An instance of Counter
  def self.from_file(path, options = {})
-    Counter.from_file(path, options)
+    tokens = File.open(path) do |file|
+      Tokeniser.new(file.read).tokenise(**options)
+    end
+    Counter.new(tokens)
  end
 end
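
With the pieces above wired together, the module-level helpers remain the main entry points, and any keyword accepted by `Tokeniser#tokenise` passes straight through. A sketch with an illustrative string and a hypothetical file path:

    counter = WordsCounted.count("We are all in the gutter", exclude: "the")
    counter.token_count   # => 5
    counter.tokens        # => ["we", "are", "all", "in", "gutter"]

    # from_file works the same way; "gutter.txt" is a hypothetical path.
    WordsCounted.from_file("gutter.txt", exclude: "the")
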