words_counted 0.1.5 → 1.0.3

data/lib/words_counted/counter.rb CHANGED
@@ -1,96 +1,137 @@
  # -*- encoding : utf-8 -*-
  module WordsCounted
-   class Counter
-     attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-     WORD_REGEXP = /[\p{Alpha}\-']+/
-
-     def self.from_file(path, options = {})
-       File.open(path) do |file|
-         new file.read, options
-       end
-     end
-
-     def initialize(string, options = {})
-       @options = options
-       exclude = filter_proc(options[:exclude])
-       @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-       @char_count = words.join.size
-       @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-       @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-     end
-
-     def word_count
-       words.size
-     end
-
-     def unique_word_count
-       words.uniq.size
-     end
+   using Refinements::HashRefinements

-     def average_chars_per_word(precision = 2)
-       (char_count / word_count.to_f).round(precision)
-     end
-
-     def most_occurring_words
-       highest_ranking word_occurrences
+   class Counter
+     # This class contains several methods to extract useful statistics
+     # from any array of tokens, such as density, frequency, and more.
+     #
+     # @example
+     #   WordsCounted::Counter.new(["hello", "world"]).token_count
+     #   # => 2
+
+     include Deprecated
+
+     # @return [Array<String>] an array of tokens.
+     attr_reader :tokens
+
+     # Initialises state with an array of tokens.
+     #
+     # @param [Array] tokens An array of tokens to perform operations on
+     def initialize(tokens)
+       @tokens = tokens
      end

-     def longest_words
-       highest_ranking word_lengths
+     # Returns the number of tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_count
+     #   # => 6
+     #
+     # @return [Integer] The number of tokens
+     def token_count
+       tokens.size
      end

-     def word_density(precision = 2)
-       word_densities = word_occurrences.each_with_object({}) do |(word, occ), hash|
-         hash[word] = (occ / word_count.to_f * 100).round(precision)
-       end
-       sort_by_descending_value word_densities
+     # Returns the number of unique tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).uniq_token_count
+     #   # => 3
+     #
+     # @return [Integer] The number of unique tokens
+     def uniq_token_count
+       tokens.uniq.size
      end

-     def sorted_word_occurrences
-       sort_by_descending_value word_occurrences
+     # Returns the character count of all tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two]).char_count
+     #   # => 6
+     #
+     # @return [Integer] The total char count of tokens
+     def char_count
+       tokens.join.size
      end

-     def sorted_word_lengths
-       sort_by_descending_value word_lengths
+     # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+     # The array is sorted by frequency in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_frequency
+     #   # => [ ['three', 3], ['two', 2], ['one', 1] ]
+     #
+     # @return [Array<Array<String, Integer>>] An array of tokens and their frequencies
+     def token_frequency
+       tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
      end

-     def count(match)
-       words.select { |word| word == match.downcase }.size
+     # Returns a sorted two-dimensional array where each member array is a token and its length.
+     # The array is sorted by length in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two three four five]).token_lengths
+     #   # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+     #
+     # @return [Array<Array<String, Integer>>] An array of tokens and their lengths
+     def token_lengths
+       tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
      end

-     private
-
-     def highest_ranking(entries)
-       entries.group_by { |_, value| value }.sort.last.last
+     # Returns a sorted two-dimensional array where each member array is a token and its density
+     # as a float, rounded to a precision of two decimal places. It accepts a `precision`
+     # argument which defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[Maj. Major Major Major]).token_density
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @example With `precision`
+     #   Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @param [Integer] precision The number of decimal places to round density to
+     # @return [Array<Array<String, Float>>] An array of tokens and their densities
+     def token_density(precision: 2)
+       token_frequency.each_with_object({}) { |(token, freq), hash|
+         hash[token] = (freq / token_count.to_f).round(precision)
+       }.sort_by_value_desc
      end

-     def sort_by_descending_value(entries)
-       entries.sort_by { |_, value| value }.reverse
+     # Returns a hash of tokens and their frequencies for the tokens with the highest frequency.
+     #
+     # @example
+     #   Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+     #   # => { 'two' => 2, 'twice' => 2 }
+     #
+     # @return [Hash{String => Integer}] A hash of tokens and their frequencies
+     def most_frequent_tokens
+       token_frequency.group_by(&:last).max.last.to_h
      end

-     def regexp
-       @options[:regexp] || WORD_REGEXP
+     # Returns a hash of tokens and their lengths for the tokens with the highest length.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).longest_tokens
+     #   # => { 'three' => 5, 'seven' => 5 }
+     #
+     # @return [Hash{String => Integer}] A hash of tokens and their lengths
+     def longest_tokens
+       token_lengths.group_by(&:last).max.last.to_h
      end

-     def filter_proc(filter)
-       if filter.respond_to?(:to_a)
-         filter_procs = Array(filter).map(&method(:filter_proc))
-         ->(word) {
-           filter_procs.any? { |p| p.call(word) }
-         }
-       elsif filter.respond_to?(:to_str)
-         exclusion_list = filter.split.collect(&:downcase)
-         ->(word) {
-           exclusion_list.include?(word)
-         }
-       elsif regexp_filter = Regexp.try_convert(filter)
-         Proc.new { |word| word =~ regexp_filter }
-       elsif filter.respond_to?(:to_proc)
-         filter.to_proc
-       else
-         raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-       end
+     # Returns the average char count per token rounded to a precision of two decimal places.
+     # Accepts a `precision` argument which defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).average_chars_per_token
+     #   # => 4.25
+     #
+     # @param [Integer] precision The number of decimal places to round average char count to
+     # @return [Float] The average char count per token
+     def average_chars_per_token(precision: 2)
+       (char_count / token_count.to_f).round(precision)
      end
    end
  end
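
The new `Counter` is a thin statistics layer over a pre-tokenised array, where the old class also did the tokenising itself. A minimal usage sketch, assuming the 1.0.3 gem is installed; the outputs follow from the documented behaviour above:

    require "words_counted"

    # Counter no longer tokenises input; it expects an array of tokens.
    counter = WordsCounted::Counter.new(%w[one two two three three three])

    counter.token_count          # => 6
    counter.uniq_token_count     # => 3
    counter.token_frequency      # => [["three", 3], ["two", 2], ["one", 1]]
    counter.most_frequent_tokens # => { "three" => 3 }
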
data/lib/words_counted/deprecated.rb ADDED
@@ -0,0 +1,78 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   module Deprecated
+     # The following methods are deprecated and will be removed in version 1.1.0.

+     # @deprecated use `Counter#token_count`
+     def word_count
+       warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+       token_count
+     end

+     # @deprecated use `Counter#uniq_token_count`
+     def unique_word_count
+       warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+       uniq_token_count
+     end

+     # @deprecated use `Counter#token_frequency`
+     def word_occurrences
+       warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for the old behaviour"
+       token_frequency.to_h
+     end

+     # @deprecated use `Counter#token_lengths`
+     def word_lengths
+       warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+       warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for the old behaviour"
+       token_lengths.to_h
+     end

+     # @deprecated use `Counter#token_density`
+     def word_density(precision = 2)
+       warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+       warn "`Counter#token_density` returns density as a decimal, not a percent"

+       token_density(precision: precision + 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+     end

+     # @deprecated use `Counter#token_frequency`
+     def sorted_word_occurrences
+       warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       token_frequency
+     end

+     # @deprecated use `Counter#token_lengths`
+     def sorted_word_lengths
+       warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+       token_lengths
+     end

+     # @deprecated use `Counter#most_frequent_tokens`
+     def most_occurring_words
+       warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+       warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for the old behaviour."
+       most_frequent_tokens.to_a
+     end

+     # @deprecated use `Counter#longest_tokens`
+     def longest_words
+       warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+       warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for the old behaviour."
+       longest_tokens.to_a
+     end

+     # @deprecated use `Counter#average_chars_per_token`
+     def average_chars_per_word(precision = 2)
+       warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+       average_chars_per_token(precision: precision)
+     end

+     # @deprecated use `Array#count`
+     def count(token)
+       warn "`Counter#count` is deprecated, please use `Array#count`"
+       tokens.count(token.downcase)
+     end
+   end
+ end
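
The shims above keep the 0.1.x API callable while nudging callers to the new names: each prints a deprecation notice via `Kernel#warn` (to `$stderr`) and then delegates, converting between the old and new return shapes where they differ. A sketch of the observable behaviour, reusing the counter from the earlier example:

    counter.word_count
    # prints the deprecation warning to $stderr, then => 6

    # The old hash-returning API is rebuilt from the new array of pairs.
    counter.word_occurrences
    # => { "three" => 3, "two" => 2, "one" => 1 }
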
data/lib/words_counted/tokeniser.rb ADDED
@@ -0,0 +1,163 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   class Tokeniser
+     # Takes a string and breaks it into an array of tokens.
+     # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+     #
+     # @example
+     #   tokeniser = WordsCounted::Tokeniser.new(
+     #     "We are all in the gutter, but some of us are looking at the stars."
+     #   )
+     #   tokeniser.tokenise(exclude: "We are all in the gutter")
+     #   # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']

+     # Default tokenisation strategy
+     TOKEN_REGEXP = /[\p{Alpha}\-']+/

+     # Initialises state with the string to be tokenised.
+     #
+     # @param [String] input The string to tokenise
+     def initialize(input)
+       @input = input
+     end

+     # Converts a string into an array of tokens using a regular expression.
+     # If a pattern is not provided, the default `Tokeniser::TOKEN_REGEXP` is used.
+     #
+     # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+     # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+     # This allows for powerful and flexible tokenisation strategies.
+     #
+     # If a symbol is passed, it must name a predicate method.
+     #
+     # @example
+     #   WordsCounted::Tokeniser.new("Hello World").tokenise
+     #   # => ['hello', 'world']
+     #
+     # @example With `pattern`
+     #   WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+     #   # => ['hello', 'mohamad']
+     #
+     # @example With `exclude` as a string
+     #   WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a regexp
+     #   WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+     #   # => ['dani']
+     #
+     # @example With `exclude` as a lambda
+     #   WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(
+     #     exclude: ->(token) { token.length > 6 }
+     #   )
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a symbol
+     #   WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+     #   # => ['محمد']
+     #
+     # @example With `exclude` as an array of strings
+     #   WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(
+     #     exclude: ["goodbye hello"]
+     #   )
+     #   # => ['sami', 'and', 'dani']
+     #
+     # @example With `exclude` as an array of regular expressions
+     #   WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(
+     #     exclude: [/goodbye/i, /and/i]
+     #   )
+     #   # => ['hello', 'dani']
+     #
+     # @example With `exclude` as an array of lambdas
+     #   t = WordsCounted::Tokeniser.new("Special Agent 007")
+     #   t.tokenise(
+     #     exclude: [
+     #       ->(t) { t.to_i.odd? },
+     #       ->(t) { t.length > 5 }
+     #     ]
+     #   )
+     #   # => ['agent']
+     #
+     # @example With `exclude` as a mixed array
+     #   t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+     #   t.tokenise(
+     #     exclude: [
+     #       :ascii_only?,
+     #       /محمد/,
+     #       ->(t) { t.length > 6 },
+     #       "و"
+     #     ]
+     #   )
+     #   # => ["هي", "سامي", "وداني"]
+     #
+     # @param [Regexp] pattern The regular expression used to extract tokens from the input
+     # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply
+     # @return [Array<String>] The array of filtered tokens
+     def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+       filter_proc = filter_to_proc(exclude)
+       @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+     end

+     private

+     # The following methods convert the `exclude` argument into a callable object. The return
+     # value of that callable is then used to determine whether a token should be excluded from
+     # the final list.
+     #
+     # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+     # of any combination of those types.
+     #
+     # If `filter` is a string, it converts the string into an array, and returns a lambda
+     # that returns true if the token is included in the resulting array.
+     #
+     # @see {Tokeniser#filter_proc_from_string}.
+     #
+     # If `filter` is an array, it creates a new array where each element of the original is
+     # converted to a lambda, and returns a lambda that calls each lambda in the resulting array.
+     # If any lambda returns true the token is excluded from the final list.
+     #
+     # @see {Tokeniser#filter_procs_from_array}.
+     #
+     # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a lambda
+     # is returned that checks the token for a match.
+     #
+     # If a symbol is passed, it is converted to a proc. The symbol must name a predicate method.
+     #
+     # This method depends on `nil` responding to `to_a` with an empty array, which
+     # avoids having to check whether `exclude` was passed.

+     # @api private
+     def filter_to_proc(filter)
+       if filter.respond_to?(:to_a)
+         filter_procs_from_array(filter)
+       elsif filter.respond_to?(:to_str)
+         filter_proc_from_string(filter)
+       elsif regexp_filter = Regexp.try_convert(filter)
+         ->(token) {
+           token =~ regexp_filter
+         }
+       elsif filter.respond_to?(:to_proc)
+         filter.to_proc
+       else
+         raise ArgumentError,
+           "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+       end
+     end

+     # @api private
+     def filter_procs_from_array(filter)
+       filter_procs = Array(filter).map(&method(:filter_to_proc))
+       ->(token) {
+         filter_procs.any? { |pro| pro.call(token) }
+       }
+     end

+     # @api private
+     def filter_proc_from_string(filter)
+       normalized_exclusion_list = filter.split.map(&:downcase)
+       ->(token) {
+         normalized_exclusion_list.include?(token)
+       }
+     end
+   end
+ end
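
The tokeniser is now usable on its own, independent of `Counter`. A short sketch combining the default pattern with a string `exclude`, which the private helpers above split into a lowercased stop list:

    tokeniser = WordsCounted::Tokeniser.new("Goodbye cruel world, hello Dani!")

    # The default pattern keeps alphabetic characters, hyphens, and apostrophes.
    tokeniser.tokenise
    # => ["goodbye", "cruel", "world", "hello", "dani"]

    tokeniser.tokenise(exclude: "goodbye hello")
    # => ["cruel", "world", "dani"]
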
data/lib/words_counted/version.rb CHANGED
@@ -1,4 +1,4 @@
  # -*- encoding : utf-8 -*-
  module WordsCounted
-   VERSION = "0.1.5"
+   VERSION = "1.0.3"
  end
data/lib/words_counted.rb CHANGED
@@ -1,6 +1,11 @@
  # -*- encoding : utf-8 -*-
- require "words_counted/version"
+ require "refinements/hash_refinements"
+
+ require "words_counted/deprecated"
+
+ require "words_counted/tokeniser"
  require "words_counted/counter"
+ require "words_counted/version"

  begin
    require "pry"
@@ -8,11 +13,33 @@ rescue LoadError
  end

  module WordsCounted
-   def self.count(string, options = {})
-     Counter.new(string, options)
+   # Takes a string, tokenises it, and returns an instance of Counter
+   # with the resulting tokens.
+   #
+   # @see Tokeniser#tokenise
+   # @see Counter#initialize
+   #
+   # @param [String] input The input to be tokenised
+   # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+   # @return [WordsCounted::Counter] An instance of Counter
+   def self.count(input, options = {})
+     tokens = Tokeniser.new(input).tokenise(**options)
+     Counter.new(tokens)
    end

+   # Takes a file path, reads the file and tokenises its contents,
+   # and returns an instance of Counter with the resulting tokens.
+   #
+   # @see Tokeniser#tokenise
+   # @see Counter#initialize
+   #
+   # @param [String] path The file to be read and tokenised
+   # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+   # @return [WordsCounted::Counter] An instance of Counter
    def self.from_file(path, options = {})
-     Counter.from_file(path, options)
+     tokens = File.open(path) do |file|
+       Tokeniser.new(file.read).tokenise(**options)
+     end
+     Counter.new(tokens)
    end
  end
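
With this release the top-level helpers are plain glue: tokenise first, then count. A final sketch of the public entry points; the file path is purely illustrative:

    require "words_counted"

    counter = WordsCounted.count("We are all in the gutter", exclude: "the")
    counter.tokens # => ["we", "are", "all", "in", "gutter"]

    # Options are forwarded to Tokeniser#tokenise; "speech.txt" is a hypothetical file.
    counter = WordsCounted.from_file("speech.txt", exclude: "the")
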