words_counted 0.1.5 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.hound.yml +2 -0
- data/.ruby-style.yml +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/.yardopts +3 -2
- data/CHANGELOG.md +29 -0
- data/README.md +146 -189
- data/lib/refinements/hash_refinements.rb +14 -0
- data/lib/words_counted/counter.rb +113 -72
- data/lib/words_counted/deprecated.rb +78 -0
- data/lib/words_counted/tokeniser.rb +163 -0
- data/lib/words_counted/version.rb +1 -1
- data/lib/words_counted.rb +31 -4
- data/spec/words_counted/counter_spec.rb +49 -204
- data/spec/words_counted/deprecated_spec.rb +99 -0
- data/spec/words_counted/tokeniser_spec.rb +133 -0
- data/spec/words_counted_spec.rb +34 -0
- data/words_counted.gemspec +2 -2
- metadata +25 -12
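
At a glance, 1.0 splits the string-eating `Counter` of 0.1.5 into two collaborating objects: a `Tokeniser` that turns a string into tokens, and a `Counter` that computes statistics over those tokens. A minimal before/after sketch, inferred from the hunks below rather than copied from the package:

    # 0.1.5
    WordsCounted::Counter.new("hello world").word_count
    # => 2

    # 1.0.3
    tokens = WordsCounted::Tokeniser.new("hello world").tokenise
    WordsCounted::Counter.new(tokens).token_count
    # => 2
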
data/lib/words_counted/counter.rb CHANGED
@@ -1,96 +1,137 @@
 # -*- encoding : utf-8 -*-
 module WordsCounted
-
-    attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-    WORD_REGEXP = /[\p{Alpha}\-']+/
-
-    def self.from_file(path, options = {})
-      File.open(path) do |file|
-        new file.read, options
-      end
-    end
-
-    def initialize(string, options = {})
-      @options = options
-      exclude = filter_proc(options[:exclude])
-      @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-      @char_count = words.join.size
-      @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-      @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-    end
-
-    def word_count
-      words.size
-    end
-
-    def unique_word_count
-      words.uniq.size
-    end
+  using Refinements::HashRefinements
 
-
-
-
-
-
-
+  class Counter
+    # This class provides several methods to extract useful statistics
+    # from any array of tokens, such as density, frequency, and more.
+    #
+    # @example
+    #  WordsCounted::Counter.new(["hello", "world"]).token_count
+    #  # => 2
+
+    include Deprecated
+
+    # @return [Array<String>] an array of tokens.
+    attr_reader :tokens
+
+    # Initialises state with an array of tokens.
+    #
+    # @param [Array] tokens An array of tokens to perform operations on
+    def initialize(tokens)
+      @tokens = tokens
     end
 
-
-
+    # Returns the number of tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_count
+    #  # => 6
+    #
+    # @return [Integer] The number of tokens
+    def token_count
+      tokens.size
     end
 
-
-
-
-
-
+    # Returns the number of unique tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).uniq_token_count
+    #  # => 3
+    #
+    # @return [Integer] The number of unique tokens
+    def uniq_token_count
+      tokens.uniq.size
     end
 
-
-
+    # Returns the character count of all tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two]).char_count
+    #  # => 6
+    #
+    # @return [Integer] The total char count of tokens
+    def char_count
+      tokens.join.size
    end
 
-
-
+    # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+    # The array is sorted by frequency in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_frequency
+    #  # => [ ['three', 3], ['two', 2], ['one', 1] ]
+    #
+    # @return [Array<Array<String, Integer>>] An array of tokens and their frequencies
+    def token_frequency
+      tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
     end
 
-
-
+    # Returns a sorted two-dimensional array where each member array is a token and its length.
+    # The array is sorted by length in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two three four five]).token_lengths
+    #  # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+    #
+    # @return [Array<Array<String, Integer>>] An array of tokens and their lengths
+    def token_lengths
+      tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
     end
 
-
-
-
-
+    # Returns a sorted two-dimensional array where each member array is a token and its density
+    # as a float, rounded to a precision of two decimal places. It accepts a precision argument
+    # which defaults to `2`.
+    #
+    # @example
+    #  Counter.new(%w[Maj. Major Major Major]).token_density
+    #  # => [ ['major', .75], ['maj', .25] ]
+    #
+    # @example with `precision`
+    #  Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+    #  # => [ ['major', .7500], ['maj', .2500] ]
+    #
+    # @param [Integer] precision The number of decimal places to round density to
+    # @return [Array<Array<String, Float>>] An array of tokens and their densities
+    def token_density(precision: 2)
+      token_frequency.each_with_object({}) { |(token, freq), hash|
+        hash[token] = (freq / token_count.to_f).round(precision)
+      }.sort_by_value_desc
     end
 
-
-
+    # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+    #
+    # @example
+    #  Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+    #  # => { 'two' => 2, 'twice' => 2 }
+    #
+    # @return [Hash{String => Integer}] A hash of tokens and their frequencies
+    def most_frequent_tokens
+      token_frequency.group_by(&:last).max.last.to_h
     end
 
-
-
+    # Returns a hash of tokens and their lengths for tokens with the highest length.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).longest_tokens
+    #  # => { 'three' => 5, 'seven' => 5 }
+    #
+    # @return [Hash{String => Integer}] A hash of tokens and their lengths
+    def longest_tokens
+      token_lengths.group_by(&:last).max.last.to_h
     end
 
-
-
-
-
-
-
-
-
-
-
-
-      elsif regexp_filter = Regexp.try_convert(filter)
-        Proc.new { |word| word =~ regexp_filter }
-      elsif filter.respond_to?(:to_proc)
-        filter.to_proc
-      else
-        raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-      end
+    # Returns the average char count per token rounded to a precision of two decimal places.
+    # Accepts a `precision` argument.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).average_chars_per_token
+    #  # => 4.25
+    #
+    # @param [Integer] precision The number of decimal places to round average char count to
+    # @return [Float] The average char count per token
+    def average_chars_per_token(precision: 2)
+      (char_count / token_count.to_f).round(precision)
     end
   end
 end
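
The rewritten `Counter` no longer tokenises anything itself; it only aggregates over whatever token array it is given. A short usage sketch composed from the doc examples above (the token list is illustrative):

    counter = WordsCounted::Counter.new(%w[one two two three three three])
    counter.token_count       # => 6
    counter.uniq_token_count  # => 3
    counter.token_frequency   # => [["three", 3], ["two", 2], ["one", 1]]
    counter.token_density     # => [["three", 0.5], ["two", 0.33], ["one", 0.17]]
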
data/lib/words_counted/deprecated.rb ADDED
@@ -0,0 +1,78 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  module Deprecated
+    # The following methods are deprecated and will be removed in version 1.1.0.
+
+    # @deprecated use `Counter#token_count`
+    def word_count
+      warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+      token_count
+    end
+
+    # @deprecated use `Counter#uniq_token_count`
+    def unique_word_count
+      warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+      uniq_token_count
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def word_occurrences
+      warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour"
+      token_frequency.to_h
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def word_lengths
+      warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+      warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for old behaviour"
+      token_lengths.to_h
+    end
+
+    # @deprecated use `Counter#token_density`
+    def word_density(precision = 2)
+      warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+      warn "`Counter#token_density` returns density as decimal and not percent"
+
+      token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def sorted_word_occurrences
+      warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      token_frequency
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def sorted_word_lengths
+      warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+      token_lengths
+    end
+
+    # @deprecated use `Counter#most_frequent_tokens`
+    def most_occurring_words
+      warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+      warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for old behaviour."
+      most_frequent_tokens.to_a
+    end
+
+    # @deprecated use `Counter#longest_tokens`
+    def longest_words
+      warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+      warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for old behaviour."
+      longest_tokens.to_a
+    end
+
+    # @deprecated use `Counter#average_chars_per_token`
+    def average_chars_per_word(precision = 2)
+      warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+      average_chars_per_token(precision: precision)
+    end
+
+    # @deprecated use `Array#count`
+    def count(token)
+      warn "`Counter#count` is deprecated, please use `Array#count`"
+      tokens.count(token.downcase)
+    end
+  end
+end
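
Because `Counter` includes this module, code written against the 0.1.5 API keeps working in 1.0.3 but emits a warning on `$stderr` through `Kernel#warn` each time an old name is called. For example (the input string is illustrative):

    counter = WordsCounted.count("hello hello world")
    counter.word_count
    # stderr: `Counter#word_count` is deprecated, please use `Counter#token_count`
    # => 3
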
data/lib/words_counted/tokeniser.rb ADDED
@@ -0,0 +1,163 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  class Tokeniser
+    # Takes a string and breaks it into an array of tokens.
+    # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+    #
+    # @example
+    #  tokeniser
+    #    = WordsCounted::Tokeniser.new(
+    #        "We are all in the gutter, but some of us are looking at the stars."
+    #      )
+    #  tokeniser.tokenise(exclude: "We are all in the gutter")
+    #  # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']
+
+    # Default tokenisation strategy
+    TOKEN_REGEXP = /[\p{Alpha}\-']+/
+
+    # Initialises state with the string to be tokenised.
+    #
+    # @param [String] input The string to tokenise
+    def initialize(input)
+      @input = input
+    end
+
+    # Converts a string into an array of tokens using a regular expression.
+    # If a regexp is not provided a default one is used. See `Tokeniser::TOKEN_REGEXP`.
+    #
+    # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+    # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+    # This allows for powerful and flexible tokenisation strategies.
+    #
+    # If a symbol is passed, it must name a predicate method.
+    #
+    # @example
+    #  WordsCounted::Tokeniser.new("Hello World").tokenise
+    #  # => ['hello', 'world']
+    #
+    # @example With `pattern`
+    #  WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+    #  # => ['hello', 'mohamad']
+    #
+    # @example With `exclude` as a string
+    #  WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+    #  # => ['sami']
+    #
+    # @example With `exclude` as a regexp
+    #  WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+    #  # => ['dani']
+    #
+    # @example With `exclude` as a lambda
+    #  WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(
+    #    exclude: ->(token) { token.length > 6 }
+    #  )
+    #  # => ['sami']
+    #
+    # @example With `exclude` as a symbol
+    #  WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+    #  # => ['محمد']
+    #
+    # @example With `exclude` as an array of strings
+    #  WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(
+    #    exclude: ["goodbye hello"]
+    #  )
+    #  # => ['sami', 'and', 'dani']
+    #
+    # @example With `exclude` as an array of regular expressions
+    #  WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(
+    #    exclude: [/goodbye/i, /and/i]
+    #  )
+    #  # => ['hello', 'dani']
+    #
+    # @example With `exclude` as an array of lambdas
+    #  t = WordsCounted::Tokeniser.new("Special Agent 007")
+    #  t.tokenise(
+    #    exclude: [
+    #      ->(t) { t.to_i.odd? },
+    #      ->(t) { t.length > 5 }
+    #    ]
+    #  )
+    #  # => ['agent']
+    #
+    # @example With `exclude` as a mixed array
+    #  t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+    #  t.tokenise(
+    #    exclude: [
+    #      :ascii_only?,
+    #      /محمد/,
+    #      ->(t) { t.length > 6 },
+    #      "و"
+    #    ]
+    #  )
+    #  # => ["هي", "سامي", "وداني"]
+    #
+    # @param [Regexp] pattern The regular expression used to split the input into tokens
+    # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply
+    # @return [Array] The array of filtered tokens
+    def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+      filter_proc = filter_to_proc(exclude)
+      @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+    end
+
+    private
+
+    # The following methods convert any arguments into a callable object. The return value of this
+    # lambda is then used to determine whether a token should be excluded from the final list.
+    #
+    # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+    # of any combination of those types.
+    #
+    # If `filter` is a string, it converts the string into an array, and returns a lambda
+    # that returns true if the token is included in the resulting array.
+    #
+    # @see {Tokeniser#filter_proc_from_string}.
+    #
+    # If `filter` is an array, it creates a new array where each element of the original is
+    # converted to a lambda, and returns a lambda that calls each lambda in the resulting array.
+    # If any lambda returns true the token is excluded from the final list.
+    #
+    # @see {Tokeniser#filter_procs_from_array}.
+    #
+    # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a `lambda`
+    # is returned that checks the token for a match.
+    #
+    # If a symbol is passed, it is converted to a proc. The symbol must name a predicate method.
+    #
+    # This method depends on `nil` responding to `to_a` with an empty array, which
+    # avoids having to check if `exclude` was passed.

+    # @api private
+    def filter_to_proc(filter)
+      if filter.respond_to?(:to_a)
+        filter_procs_from_array(filter)
+      elsif filter.respond_to?(:to_str)
+        filter_proc_from_string(filter)
+      elsif regexp_filter = Regexp.try_convert(filter)
+        ->(token) {
+          token =~ regexp_filter
+        }
+      elsif filter.respond_to?(:to_proc)
+        filter.to_proc
+      else
+        raise ArgumentError,
+          "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+      end
+    end
+
+    # @api private
+    def filter_procs_from_array(filter)
+      filter_procs = Array(filter).map(&method(:filter_to_proc))
+      ->(token) {
+        filter_procs.any? { |pro| pro.call(token) }
+      }
+    end
+
+    # @api private
+    def filter_proc_from_string(filter)
+      normalized_exclusion_list = filter.split.map(&:downcase)
+      ->(token) {
+        normalized_exclusion_list.include?(token)
+      }
+    end
+  end
+end
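
The private `filter_to_proc` dispatch above normalises every `exclude` value into a single predicate lambda; arrays are handled recursively, and a token is dropped if any member filter matches it. A small combination built from the documented examples (the sentence is illustrative):

    tokeniser = WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani")
    tokeniser.tokenise(exclude: ["goodbye hello", /and/i])
    # => ['sami', 'dani']
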
data/lib/words_counted.rb CHANGED
@@ -1,6 +1,11 @@
 # -*- encoding : utf-8 -*-
-require "
+require "refinements/hash_refinements"
+
+require "words_counted/deprecated"
+
+require "words_counted/tokeniser"
 require "words_counted/counter"
+require "words_counted/version"
 
 begin
   require "pry"
@@ -8,11 +13,33 @@ rescue LoadError
 end
 
 module WordsCounted
-
-
+  # Takes a string, tokenises it, and returns an instance of Counter
+  # with the resulting tokens.
+  #
+  # @see Tokeniser.tokenise
+  # @see Counter.initialize
+  #
+  # @param [String] input The input to be tokenised
+  # @param [Hash] options The options to pass onto `Counter`
+  # @return [WordsCounted::Counter] An instance of Counter
+  def self.count(input, options = {})
+    tokens = Tokeniser.new(input).tokenise(**options)
+    Counter.new(tokens)
   end
 
+  # Takes a file path, reads the file and tokenises its contents,
+  # and returns an instance of Counter with the resulting tokens.
+  #
+  # @see Tokeniser.tokenise
+  # @see Counter.initialize
+  #
+  # @param [String] path The file to be read and tokenised
+  # @param [Hash] options The options to pass onto `Counter`
+  # @return [WordsCounted::Counter] An instance of Counter
   def self.from_file(path, options = {})
-
+    tokens = File.open(path) do |file|
+      Tokeniser.new(file.read).tokenise(**options)
+    end
+    Counter.new(tokens)
   end
 end
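
With these module-level helpers, keyword options flow from `WordsCounted.count` (or `.from_file`) straight through to `Tokeniser#tokenise`, so `pattern:` and `exclude:` work at the top level too. A brief sketch; the file path is hypothetical:

    counter = WordsCounted.count("Hello Beirut", exclude: "hello")
    counter.tokens  # => ['beirut']

    WordsCounted.from_file("path/to/file.txt", exclude: "hello").token_count
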