words_counted 0.1.5 → 1.0.3
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.hound.yml +2 -0
- data/.ruby-style.yml +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/.yardopts +3 -2
- data/CHANGELOG.md +29 -0
- data/README.md +146 -189
- data/lib/refinements/hash_refinements.rb +14 -0
- data/lib/words_counted/counter.rb +113 -72
- data/lib/words_counted/deprecated.rb +78 -0
- data/lib/words_counted/tokeniser.rb +163 -0
- data/lib/words_counted/version.rb +1 -1
- data/lib/words_counted.rb +31 -4
- data/spec/words_counted/counter_spec.rb +49 -204
- data/spec/words_counted/deprecated_spec.rb +99 -0
- data/spec/words_counted/tokeniser_spec.rb +133 -0
- data/spec/words_counted_spec.rb +34 -0
- data/words_counted.gemspec +2 -2
- metadata +25 -12
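
The new Counter (shown below) sorts hashes with `sort_by_value_desc`, which comes from the added data/lib/refinements/hash_refinements.rb (+14 lines); that file's body is not rendered on this page. A minimal sketch of what the refinement presumably provides follows. Only the file name and the call sites appear in the diff; the implementation here is an assumption:

module Refinements
  module HashRefinements
    refine Hash do
      # Assumed helper: sorts key/value pairs by value, highest value
      # first, returning an array of [key, value] pairs.
      def sort_by_value_desc
        sort_by { |_key, value| value }.reverse
      end
    end
  end
end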
data/lib/words_counted/counter.rb
CHANGED
@@ -1,96 +1,137 @@
 # -*- encoding : utf-8 -*-
 module WordsCounted
-  class Counter
-    attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-    WORD_REGEXP = /[\p{Alpha}\-']+/
-
-    def self.from_file(path, options = {})
-      File.open(path) do |file|
-        new file.read, options
-      end
-    end
-
-    def initialize(string, options = {})
-      @options = options
-      exclude = filter_proc(options[:exclude])
-      @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-      @char_count = words.join.size
-      @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-      @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-    end
-
-    def word_count
-      words.size
-    end
-
-    def unique_word_count
-      words.uniq.size
-    end
+  using Refinements::HashRefinements

-    [old lines 31-36 not captured in this render]
+  class Counter
+    # This class contains several methods to extract useful statistics
+    # from any array of tokens, such as density, frequency, and more.
+    #
+    # @example
+    #  WordsCounted::Counter.new(["hello", "world"]).token_count
+    #  # => 2
+
+    include Deprecated
+
+    # @return [Array<String>] an array of tokens.
+    attr_reader :tokens
+
+    # Initialises state with an array of tokens.
+    #
+    # @param [Array] tokens An array of tokens to perform operations on
+    def initialize(tokens)
+      @tokens = tokens
     end

-    [old lines 39-40 not captured in this render]
+    # Returns the number of tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_count
+    #  # => 6
+    #
+    # @return [Integer] The number of tokens
+    def token_count
+      tokens.size
     end

-    [old lines 43-47 not captured in this render]
+    # Returns the number of unique tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).uniq_token_count
+    #  # => 3
+    #
+    # @return [Integer] The number of unique tokens
+    def uniq_token_count
+      tokens.uniq.size
     end

-    [old lines 50-51 not captured in this render]
+    # Returns the character count of all tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two]).char_count
+    #  # => 6
+    #
+    # @return [Integer] The total char count of tokens
+    def char_count
+      tokens.join.size
     end

-    [old lines 54-55 not captured in this render]
+    # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+    # The array is sorted by frequency in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_frequency
+    #  # => [ ['three', 3], ['two', 2], ['one', 1] ]
+    #
+    # @return [Array<Array<String, Integer>>] An array of tokens and their frequencies
+    def token_frequency
+      tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
     end

-    [old lines 58-59 not captured in this render]
+    # Returns a sorted two-dimensional array where each member array is a token and its length.
+    # The array is sorted by length in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two three four five]).token_lengths
+    #  # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+    #
+    # @return [Array<Array<String, Integer>>] An array of tokens and their lengths
+    def token_lengths
+      tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
     end

-    [old lines 62-65 not captured in this render]
+    # Returns a sorted two-dimensional array where each member array is a token and its density
+    # as a float, rounded to a precision of two decimal places. It accepts a `precision`
+    # argument which defaults to `2`.
+    #
+    # @example
+    #  Counter.new(%w[Maj. Major Major Major]).token_density
+    #  # => [ ['major', .75], ['maj', .25] ]
+    #
+    # @example with `precision`
+    #  Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+    #  # => [ ['major', .7500], ['maj', .2500] ]
+    #
+    # @param [Integer] precision The number of decimal places to round density to
+    # @return [Array<Array<String, Float>>] An array of tokens and their densities
+    def token_density(precision: 2)
+      token_frequency.each_with_object({}) { |(token, freq), hash|
+        hash[token] = (freq / token_count.to_f).round(precision)
+      }.sort_by_value_desc
     end

-    [old lines 68-69 not captured in this render]
+    # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+    #
+    # @example
+    #  Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+    #  # => { 'two' => 2, 'twice' => 2 }
+    #
+    # @return [Hash{String => Integer}] A hash of tokens and their frequencies
+    def most_frequent_tokens
+      token_frequency.group_by(&:last).max.last.to_h
     end

-    [old lines 72-73 not captured in this render]
+    # Returns a hash of tokens and their lengths for tokens with the highest length.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).longest_tokens
+    #  # => { 'three' => 5, 'seven' => 5 }
+    #
+    # @return [Hash{String => Integer}] A hash of tokens and their lengths
+    def longest_tokens
+      token_lengths.group_by(&:last).max.last.to_h
     end

-    [old lines 76-86 not captured in this render]
-      elsif regexp_filter = Regexp.try_convert(filter)
-        Proc.new { |word| word =~ regexp_filter }
-      elsif filter.respond_to?(:to_proc)
-        filter.to_proc
-      else
-        raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-      end
+    # Returns the average char count per token rounded to a precision of two decimal places.
+    # Accepts a `precision` argument.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).average_chars_per_token
+    #  # => 4.25
+    #
+    # @param [Integer] precision The number of decimal places to round average char count to
+    # @return [Float] The average char count per token
+    def average_chars_per_token(precision: 2)
+      (char_count / token_count.to_f).round(precision)
     end
   end
 end
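
The rewritten Counter no longer tokenises; it only computes statistics over a token array handed to it. A quick sketch of the new API, with results derived from the doc comments above:

require "words_counted"

counter = WordsCounted::Counter.new(%w[one two two three three three])

counter.token_count          # => 6
counter.uniq_token_count     # => 3
counter.token_frequency      # => [["three", 3], ["two", 2], ["one", 1]]
counter.most_frequent_tokens # => { "three" => 3 }
counter.token_density        # => [["three", 0.5], ["two", 0.33], ["one", 0.17]]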
data/lib/words_counted/deprecated.rb
ADDED
@@ -0,0 +1,78 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  module Deprecated
+    # The following methods are deprecated and will be removed in version 1.1.0.
+
+    # @deprecated use `Counter#token_count`
+    def word_count
+      warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+      token_count
+    end
+
+    # @deprecated use `Counter#uniq_token_count`
+    def unique_word_count
+      warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+      uniq_token_count
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def word_occurrences
+      warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour"
+      token_frequency.to_h
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def word_lengths
+      warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+      warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for old behaviour"
+      token_lengths.to_h
+    end
+
+    # @deprecated use `Counter#token_density`
+    def word_density(precision = 2)
+      warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+      warn "`Counter#token_density` returns density as a decimal, not a percent"
+
+      token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def sorted_word_occurrences
+      warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      token_frequency
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def sorted_word_lengths
+      warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+      token_lengths
+    end
+
+    # @deprecated use `Counter#most_frequent_tokens`
+    def most_occurring_words
+      warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+      warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for old behaviour."
+      most_frequent_tokens.to_a
+    end
+
+    # @deprecated use `Counter#longest_tokens`
+    def longest_words
+      warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+      warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for old behaviour."
+      longest_tokens.to_a
+    end
+
+    # @deprecated use `Counter#average_chars_per_token`
+    def average_chars_per_word(precision = 2)
+      warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+      average_chars_per_token(precision: precision)
+    end
+
+    # @deprecated use `Array#count`
+    def count(token)
+      warn "`Counter#count` is deprecated, please use `Array#count`"
+      tokens.count(token.downcase)
+    end
+  end
+end
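
Each shim forwards to the new method, warns on stderr, and converts the new return shape back to the 0.1.x one. For example, following the mappings above:

counter = WordsCounted::Counter.new(%w[one two two])

counter.word_occurrences # warns, then => { "two" => 2, "one" => 1 }
counter.token_frequency  # => [["two", 2], ["one", 1]]

# Densities are decimals now; the shim multiplies back to percentages:
counter.word_density     # warns, then => [["two", 66.67], ["one", 33.33]]
counter.token_density    # => [["two", 0.67], ["one", 0.33]]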
data/lib/words_counted/tokeniser.rb
ADDED
@@ -0,0 +1,163 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  class Tokeniser
+    # Takes a string and breaks it into an array of tokens.
+    # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+    #
+    # @example
+    #  tokeniser =
+    #    WordsCounted::Tokeniser.new(
+    #      "We are all in the gutter, but some of us are looking at the stars."
+    #    )
+    #  tokeniser.tokenise(exclude: "We are all in the gutter")
+    #  # => ['but', 'some', 'of', 'us', 'looking', 'at', 'stars']
+
+    # Default tokenisation strategy
+    TOKEN_REGEXP = /[\p{Alpha}\-']+/
+
+    # Initialises state with the string to be tokenised.
+    #
+    # @param [String] input The string to tokenise
+    def initialize(input)
+      @input = input
+    end
+
+    # Converts a string into an array of tokens using a regular expression.
+    # If a regexp is not provided, a default one is used. See `Tokeniser::TOKEN_REGEXP`.
+    #
+    # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+    # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+    # This allows for powerful and flexible tokenisation strategies.
+    #
+    # If a symbol is passed, it must name a predicate method.
+    #
+    # @example
+    #  WordsCounted::Tokeniser.new("Hello World").tokenise
+    #  # => ['hello', 'world']
+    #
+    # @example With `pattern`
+    #  WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+    #  # => ['hello', 'mohamad']
+    #
+    # @example With `exclude` as a string
+    #  WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+    #  # => ['sami']
+    #
+    # @example With `exclude` as a regexp
+    #  WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+    #  # => ['dani']
+    #
+    # @example With `exclude` as a lambda
+    #  WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(
+    #    exclude: ->(token) { token.length > 6 }
+    #  )
+    #  # => ['sami']
+    #
+    # @example With `exclude` as a symbol
+    #  WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+    #  # => ['محمد']
+    #
+    # @example With `exclude` as an array of strings
+    #  WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(
+    #    exclude: ["goodbye hello"]
+    #  )
+    #  # => ['sami', 'and', 'dani']
+    #
+    # @example With `exclude` as an array of regular expressions
+    #  WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(
+    #    exclude: [/goodbye/i, /and/i]
+    #  )
+    #  # => ['hello', 'dani']
+    #
+    # @example With `exclude` as an array of lambdas
+    #  t = WordsCounted::Tokeniser.new("Special Agent 007")
+    #  t.tokenise(
+    #    exclude: [
+    #      ->(t) { t.to_i.odd? },
+    #      ->(t) { t.length > 5 }
+    #    ]
+    #  )
+    #  # => ['agent']
+    #
+    # @example With `exclude` as a mixed array
+    #  t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+    #  t.tokenise(
+    #    exclude: [
+    #      :ascii_only?,
+    #      /محمد/,
+    #      ->(t) { t.length > 6 },
+    #      "و"
+    #    ]
+    #  )
+    #  # => ["هي", "سامي", "وداني"]
+    #
+    # @param [Regexp] pattern The pattern used to match tokens
+    # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply
+    # @return [Array] The array of filtered tokens
+    def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+      filter_proc = filter_to_proc(exclude)
+      @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+    end
+
+    private
+
+    # The following methods convert any argument into a callable object. The return value of
+    # this lambda is then used to determine whether a token should be excluded from the final list.
+    #
+    # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+    # of any combination of those types.
+    #
+    # If `filter` is a string, it converts the string into an array, and returns a lambda
+    # that returns true if the token is included in the resulting array.
+    #
+    # @see {Tokeniser#filter_proc_from_string}.
+    #
+    # If `filter` is an array, it creates a new array where each element of the original is
+    # converted to a lambda, and returns a lambda that calls each lambda in the resulting array.
+    # If any lambda returns true, the token is excluded from the final list.
+    #
+    # @see {Tokeniser#filter_procs_from_array}.
+    #
+    # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a lambda
+    # is returned that checks the token for a match.
+    #
+    # If a symbol is passed, it is converted to a proc. The symbol must name a predicate method.
+    #
+    # This method depends on `nil` responding to `to_a` with an empty array, which
+    # avoids having to check whether `exclude` was passed.
+
+    # @api private
+    def filter_to_proc(filter)
+      if filter.respond_to?(:to_a)
+        filter_procs_from_array(filter)
+      elsif filter.respond_to?(:to_str)
+        filter_proc_from_string(filter)
+      elsif regexp_filter = Regexp.try_convert(filter)
+        ->(token) {
+          token =~ regexp_filter
+        }
+      elsif filter.respond_to?(:to_proc)
+        filter.to_proc
+      else
+        raise ArgumentError,
+          "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+      end
+    end
+
+    # @api private
+    def filter_procs_from_array(filter)
+      filter_procs = Array(filter).map(&method(:filter_to_proc))
+      ->(token) {
+        filter_procs.any? { |pro| pro.call(token) }
+      }
+    end
+
+    # @api private
+    def filter_proc_from_string(filter)
+      normalized_exclusion_list = filter.split.map(&:downcase)
+      ->(token) {
+        normalized_exclusion_list.include?(token)
+      }
+    end
+  end
+end
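
Because `filter_to_proc` recurses into arrays, filters compose freely. A short sketch combining `pattern` and a mixed `exclude`, in the spirit of the documented examples (the input string is illustrative):

tokeniser = WordsCounted::Tokeniser.new("Goodbye Sami, and hello Dani!")

# Custom pattern: letters only, so the comma and bang never become tokens.
tokeniser.tokenise(pattern: /[\p{Alpha}]+/)
# => ["goodbye", "sami", "and", "hello", "dani"]

# Mixed exclusion: a string, a regexp, and a lambda, any of which may reject a token.
tokeniser.tokenise(exclude: ["goodbye", /and/, ->(t) { t.length == 5 }])
# => ["sami", "dani"]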
data/lib/words_counted.rb
CHANGED
@@ -1,6 +1,11 @@
 # -*- encoding : utf-8 -*-
-require "words_counted/version"
+require "refinements/hash_refinements"
+
+require "words_counted/deprecated"
+
+require "words_counted/tokeniser"
 require "words_counted/counter"
+require "words_counted/version"

 begin
   require "pry"
@@ -8,11 +13,33 @@ rescue LoadError
 end

 module WordsCounted
-  def self.count(string, options = {})
-    Counter.new(string, options)
+  # Takes a string, tokenises it, and returns an instance of Counter
+  # with the resulting tokens.
+  #
+  # @see Tokeniser#tokenise
+  # @see Counter#initialize
+  #
+  # @param [String] input The input to be tokenised
+  # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+  # @return [WordsCounted::Counter] An instance of Counter
+  def self.count(input, options = {})
+    tokens = Tokeniser.new(input).tokenise(**options)
+    Counter.new(tokens)
   end

+  # Takes a file path, reads the file and tokenises its contents,
+  # and returns an instance of Counter with the resulting tokens.
+  #
+  # @see Tokeniser#tokenise
+  # @see Counter#initialize
+  #
+  # @param [String] path The file to be read and tokenised
+  # @param [Hash] options The options to pass onto `Tokeniser#tokenise`
+  # @return [WordsCounted::Counter] An instance of Counter
   def self.from_file(path, options = {})
-    Counter.from_file(path, options)
+    tokens = File.open(path) do |file|
+      Tokeniser.new(file.read).tokenise(**options)
+    end
+    Counter.new(tokens)
   end
 end
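
With these changes, `WordsCounted.count` and `WordsCounted.from_file` tokenise first and hand `Counter` a plain token array, so the `options` hash now carries tokeniser options (`pattern`, `exclude`). A usage sketch; the file path is a placeholder:

require "words_counted"

counter = WordsCounted.count(
  "We are all in the gutter, but some of us are looking at the stars.",
  exclude: "but some of us"
)
counter.token_count # => 11

counter = WordsCounted.from_file("path/to/corpus.txt")
counter.most_frequent_tokens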