words_counted 0.1.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.hound.yml +2 -0
- data/.ruby-style.yml +2 -0
- data/.travis.yml +9 -0
- data/.yardopts +3 -2
- data/CHANGELOG.md +24 -0
- data/lib/refinements/hash_refinements.rb +10 -0
- data/lib/words_counted/counter.rb +101 -69
- data/lib/words_counted/deprecated.rb +76 -0
- data/lib/words_counted/tokeniser.rb +139 -0
- data/lib/words_counted/version.rb +1 -1
- data/lib/words_counted.rb +10 -3
- data/spec/words_counted/counter_spec.rb +49 -204
- data/spec/words_counted/deprecated_spec.rb +99 -0
- data/spec/words_counted/tokeniser_spec.rb +133 -0
- data/spec/words_counted_spec.rb +34 -0
- data/words_counted.gemspec +1 -1
- metadata +17 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d6302c1802d7da076d1ddafdcbe70e46a89c8f33
+  data.tar.gz: 873efaa5e58f883e0dde99094ca53952d46217c7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0e6ddb8db9c060432066d86aed2efe20aa95dee2019d54c950007170c0ffbbcff16fa27a0377419b0d1b718be1625a4376ee9c687a4ae67073aaffe9ef363157
+  data.tar.gz: 9df2a0cefe14b9ac77d1741f8980d1b1fb4d8b770738fbd69c8870f73da4b653a1d9462ac8813f88dc48af36e03718773523985f5be0f4999177a6b0a2a89662
data/.gitignore
CHANGED
data/.hound.yml
ADDED
data/.ruby-style.yml
ADDED
data/.travis.yml
ADDED
data/.yardopts
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,27 @@
+## Version 1.0
+
+This version brings lots of improvements to code organisation. The tokeniser has been extracted into its own class. All methods in `Counter` have either been renamed or deprecated. Deprecated methods and their tests have been moved into their own modules. Using them will trigger warnings with the upgrade instructions outlined below.
+
+1. Extracted tokenisation behaviour from `Counter` into a `Tokeniser` class.
+2. Deprecated all methods that have `word` in their name. Most are renamed such that `word` became `token`. They will be removed in version 1.1.
+  - Deprecated `word_count` in favor of `token_count`
+  - Deprecated `unique_word_count` in favor of `uniq_token_count`
+  - Deprecated `word_occurrences` and `sorted_word_occurrences` in favor of `token_frequency`
+  - Deprecated `word_lengths` and `sorted_word_lengths` in favor of `token_lengths`
+  - Deprecated `word_density` in favor of `token_density`
+  - Deprecated `most_occurring_words` in favor of `most_frequent_tokens`
+  - Deprecated `longest_words` in favor of `longest_tokens`
+  - Deprecated `average_chars_per_word` in favor of `average_chars_per_token`
+  - Deprecated `count`. Use `Array#count` instead.
+3. `token_lengths`, which replaces `word_lengths`, returns a sorted two-dimensional array instead of a hash. It behaves exactly like `sorted_word_lengths`, which has been deprecated. Use `token_lengths.to_h` for the old behaviour.
+4. `token_frequency`, which replaces `word_occurrences`, returns a sorted two-dimensional array instead of a hash. It behaves like `sorted_word_occurrences`, which has been deprecated. Use `token_frequency.to_h` for the old behaviour.
+5. `token_density`, which replaces `word_density`, returns a decimal rounded to a precision of 2, not a percent. Multiply each density by 100 for the old behaviour.
+6. Added a refinement to `Hash` under `lib/refinements/hash_refinements.rb` to quickly sort a hash by descending value.
+7. Extracted all deprecated methods into their own module, and their tests into their own spec file.
+8. Added a base `words_counted_spec.rb` and moved the `.from_file` test into the new file.
+9. Added Travis continuous integration.
+10. Added documentation to the code.
+
 ## Version 0.1.5
 
 1. Removed `to_f` from the dividend in `average_chars_per_word` and `word_densities`. The divisor is a float, and dividing by a float returns a float.
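Taken together, the renames above amount to a mechanical upgrade at call sites. A short illustrative sketch (the sample string and values follow the specs later in this diff; exact output depends on your input):

require "words_counted"

counter = WordsCounted.count("one three three three woot woot")

# Deprecated 0.x names still work in 1.0, but warn and will be removed in 1.1.
counter.word_count             # => 6, after printing a deprecation warning

# 1.0 equivalents:
counter.token_count            # => 6
counter.token_frequency        # => [["three", 3], ["woot", 2], ["one", 1]]
counter.token_frequency.to_h   # => old `word_occurrences` hash shape
counter.token_density          # => [["three", 0.5], ["woot", 0.33], ["one", 0.17]]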
data/lib/words_counted/counter.rb
CHANGED
@@ -1,96 +1,128 @@
 # -*- encoding : utf-8 -*-
-module WordsCounted
-  class Counter
-    attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-    WORD_REGEXP = /[\p{Alpha}\-']+/
 
-
-      File.open(path) do |file|
-        new file.read, options
-      end
-    end
-
-    def initialize(string, options = {})
-      @options = options
-      exclude = filter_proc(options[:exclude])
-      @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-      @char_count = words.join.size
-      @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-      @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-    end
+require "words_counted/deprecated"
 
-
-
-    end
+module WordsCounted
+  using Refinements::HashRefinements
 
-
-
-    end
+  class Counter
+    include Deprecated
 
-
-      (char_count / word_count.to_f).round(precision)
-    end
+    attr_reader :tokens
 
-    def
-
+    def initialize(tokens)
+      @tokens = tokens
     end
 
-
-
+    # Returns the number of tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_count
+    #  # => 6
+    #
+    # @return [Integer] The number of tokens.
+    def token_count
+      tokens.size
     end
 
-
-
-
-
-
+    # Returns the number of unique tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).uniq_token_count
+    #  # => 3
+    #
+    # @return [Integer] The number of unique tokens.
+    def uniq_token_count
+      tokens.uniq.size
     end
 
-
-
+    # Returns the character count of all tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two]).char_count
+    #  # => 6
+    #
+    # @return [Integer] The total char count of tokens.
+    def char_count
+      tokens.join.size
     end
 
-
-
+    # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+    # The array is sorted by frequency in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_frequency
+    #  # => [ ['three', 3], ['two', 2], ['one', 1] ]
+    #
+    # @return [Array<Array<String, Integer>>]
+    def token_frequency
+      tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
     end
 
-
-
+    # Returns a sorted two-dimensional array where each member array is a token and its length.
+    # The array is sorted by length in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two three four five]).token_lengths
+    #  # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+    #
+    # @return [Array<Array<String, Integer>>]
+    def token_lengths
+      tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
     end
 
-
-
-
-
+    # Returns a sorted two-dimensional array where each member array is a token and its density
+    # as a float, rounded to a precision of two decimal places. It accepts a precision argument
+    # which defaults to `2`.
+    #
+    # @example
+    #  Counter.new(%w[Maj. Major Major Major]).token_density
+    #  # => [ ['major', .75], ['maj', .25] ]
+    #
+    # @example with `precision`
+    #  Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+    #  # => [ ['major', .7500], ['maj', .2500] ]
+    #
+    # @param [Integer] precision The number of decimal places to round density to.
+    # @return [Array<Array<String, Float>>]
+    def token_density(precision: 2)
+      token_frequency.each_with_object({}) { |(token, freq), hash|
+        hash[token] = (freq / token_count.to_f).round(precision)
+      }.sort_by_value_desc
     end
 
-
-
+    # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+    #
+    # @example
+    #  Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+    #  # => { 'two' => 2, 'twice' => 2 }
+    #
+    # @return [Hash<String, Integer>]
+    def most_frequent_tokens
+      token_frequency.group_by(&:last).max.last.to_h
    end
 
-
-
+    # Returns a hash of tokens and their lengths for tokens with the highest length.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).longest_tokens
+    #  # => { 'three' => 5, 'seven' => 5 }
+    #
+    # @return [Hash<String, Integer>]
+    def longest_tokens
+      token_lengths.group_by(&:last).max.last.to_h
    end
 
-
-
-
-
-
-
-
-
-
-
-        }
-      elsif regexp_filter = Regexp.try_convert(filter)
-        Proc.new { |word| word =~ regexp_filter }
-      elsif filter.respond_to?(:to_proc)
-        filter.to_proc
-      else
-        raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-      end
+    # Returns the average char count per token rounded to a precision of two decimal places.
+    # Accepts a `precision` argument.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).average_chars_per_token
+    #  # => 4.25
+    #
+    # @return [Float] The average char count per token.
+    def average_chars_per_token(precision: 2)
+      (char_count / token_count.to_f).round(precision)
     end
   end
 end
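Note that `Counter` no longer tokenises input; it takes a ready-made token array, and its collection methods lean on the `sort_by_value_desc` hash refinement, whose source is not shown in this diff. The sketch below therefore only assumes what the refinement's name and usage imply, namely that it sorts hash pairs by descending value:

tokens  = WordsCounted::Tokeniser.new("Maj. Major Major Major").tokenise
counter = WordsCounted::Counter.new(tokens)

counter.token_count           # => 4
counter.token_density         # => [["major", 0.75], ["maj", 0.25]]
counter.most_frequent_tokens  # => { "major" => 3 }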
data/lib/words_counted/deprecated.rb
ADDED
@@ -0,0 +1,76 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  module Deprecated
+    # @deprecated use `Counter#token_count`
+    def word_count
+      warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+      token_count
+    end
+
+    # @deprecated use `Counter#uniq_token_count`
+    def unique_word_count
+      warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+      uniq_token_count
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def word_occurrences
+      warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour"
+      token_frequency.to_h
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def word_lengths
+      warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+      warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for old behaviour"
+      token_lengths.to_h
+    end
+
+    # @deprecated use `Counter#token_density`
+    def word_density(precision = 2)
+      warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+      warn "`Counter#token_density` returns density as decimal and not percent"
+
+      token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+    end
+
+    # @deprecated use `Counter#token_frequency`
+    def sorted_word_occurrences
+      warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+      token_frequency
+    end
+
+    # @deprecated use `Counter#token_lengths`
+    def sorted_word_lengths
+      warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+      token_lengths
+    end
+
+    # @deprecated use `Counter#most_frequent_tokens`
+    def most_occurring_words
+      warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+      warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_h` for old behaviour."
+      most_frequent_tokens.to_a
+    end
+
+    # @deprecated use `Counter#longest_tokens`
+    def longest_words
+      warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+      warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_h` for old behaviour."
+      longest_tokens.to_a
+    end
+
+    # @deprecated use `Counter#average_chars_per_token`
+    def average_chars_per_word(precision = 2)
+      warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+      average_chars_per_token(precision: precision)
+    end
+
+    # @deprecated use `Array#count`
+    def count(token)
+      warn "`Counter#count` is deprecated, please use `Array#count`"
+      tokens.count(token.downcase)
+    end
+  end
+end
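Each shim above warns and then delegates to the new method, converting the return value back to the old shape where the two differ. A sketch of what a 0.x caller sees (values assume the tokens shown):

tokens  = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
counter = WordsCounted::Counter.new(tokens)

counter.sorted_word_occurrences
# warns: `Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`
# => [["three", 3], ["woot", 2], ["one", 1]]

counter.word_lengths
# warns twice, then restores the old hash shape via `token_lengths.to_h`
# => { "three" => 5, "woot" => 4, "one" => 3 }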
data/lib/words_counted/tokeniser.rb
ADDED
@@ -0,0 +1,139 @@
+# -*- encoding : utf-8 -*-
+module WordsCounted
+  class Tokeniser
+    # Takes a string and breaks it into an array of tokens.
+    # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+    #
+    # @example
+    #  tokeniser = WordsCounted::Tokeniser.new("We are all in the gutter, but some of us are looking at the stars.")
+    #  tokeniser.tokenise(exclude: "We are all in the gutter")
+    #  # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']
+
+    # Default tokenisation strategy
+    TOKEN_REGEXP = /[\p{Alpha}\-']+/
+
+    # Initialises state with a string that will be tokenised.
+    #
+    # @param [String] input The string to tokenise.
+    # @return [Tokeniser]
+    def initialize(input)
+      @input = input
+    end
+
+    # Converts a string into an array of tokens using a regular expression.
+    # If a regexp is not provided, a default one is used. See {Tokeniser.TOKEN_REGEXP}.
+    #
+    # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+    # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+    # This allows for powerful and flexible tokenisation strategies.
+    #
+    # @example
+    #  WordsCounted::Tokeniser.new("Hello World").tokenise
+    #  # => ['hello', 'world']
+    #
+    # @example With `pattern`
+    #  WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+    #  # => ['hello', 'mohamad']
+    #
+    # @example With `exclude` as a string
+    #  WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+    #  # => ['sami']
+    #
+    # @example With `exclude` as a regexp
+    #  WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+    #  # => ['dani']
+    #
+    # @example With `exclude` as a lambda
+    #  WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(exclude: ->(token) { token.length > 6 })
+    #  # => ['sami']
+    #
+    # @example With `exclude` as a symbol
+    #  WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+    #  # => ['محمد']
+    #
+    # @example With `exclude` as an array of strings
+    #  WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(exclude: ["goodbye hello"])
+    #  # => ['sami', 'and', 'dani']
+    #
+    # @example With `exclude` as an array of regular expressions
+    #  WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(exclude: [/goodbye/i, /and/i])
+    #  # => ['hello', 'dani']
+    #
+    # @example With `exclude` as an array of lambdas
+    #  t = WordsCounted::Tokeniser.new("Special Agent 007")
+    #  t.tokenise(exclude: [->(t) { t.to_i.odd? }, ->(t) { t.length > 5 }])
+    #  # => ['agent']
+    #
+    # @example With `exclude` as a mixed array
+    #  t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+    #  t.tokenise(exclude: [:ascii_only?, /محمد/, ->(t) { t.length > 6 }, "و"])
+    #  # => ["هي", "سامي", "وداني"]
+    #
+    # @param [Regexp] pattern The regular expression used to scan for tokens.
+    # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply.
+    # @return [Array] the array of filtered tokens.
+    def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+      filter_proc = filter_to_proc(exclude)
+      @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+    end
+
+    private
+
+    # This method converts any arguments into a callable object. The return value of this
+    # is then used to determine whether a token should be excluded from the final list or not.
+    #
+    # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+    # of any combination of those types.
+    #
+    # If `filter` is a string, see {Tokeniser#filter_proc_from_string}.
+    # If `filter` is an array, see {Tokeniser#filter_procs_from_array}.
+    #
+    # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a `lambda`
+    # is returned that checks the token for a match. If a symbol is passed, it is converted to
+    # a proc.
+    #
+    # This method depends on `nil` responding to `to_a` with an empty array, which
+    # avoids having to check if `exclude` was passed.
+    #
+    # @api private
+    def filter_to_proc(filter)
+      if filter.respond_to?(:to_a)
+        filter_procs_from_array(filter)
+      elsif filter.respond_to?(:to_str)
+        filter_proc_from_string(filter)
+      elsif regexp_filter = Regexp.try_convert(filter)
+        ->(token) {
+          token =~ regexp_filter
+        }
+      elsif filter.respond_to?(:to_proc)
+        filter.to_proc
+      else
+        raise ArgumentError,
+          "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+      end
+    end
+
+    # Converts an array of `filters` to an array of lambdas, and returns a lambda that calls
+    # each lambda in the resulting array. If any lambda returns true, the token is excluded
+    # from the final list.
+    #
+    # @api private
+    def filter_procs_from_array(filter)
+      filter_procs = Array(filter).map(&method(:filter_to_proc))
+      ->(token) {
+        filter_procs.any? { |pro| pro.call(token) }
+      }
+    end
+
+    # Converts a string `filter` to an array, and returns a lambda
+    # that returns true if the token is included in the array.
+    #
+    # @api private
+    def filter_proc_from_string(filter)
+      normalized_exclusion_list = filter.split.map(&:downcase)
+      ->(token) {
+        normalized_exclusion_list.include?(token)
+      }
+    end
+  end
+end
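Because `filter_to_proc` dispatches on duck type (`to_a`, then `to_str`, then `Regexp.try_convert`, then `to_proc`) and arrays recurse through the same method, mixed `exclude` values compose: each element becomes a lambda, and a token is dropped when any of them matches. Mirroring the mixed-array spec later in this diff:

tokeniser = WordsCounted::Tokeniser.new("That was magnificent, Trevor.")
tokeniser.tokenise(exclude: ["that", ->(token) { token.length < 4 }, /magnificent/])
# => ["trevor"]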
data/lib/words_counted.rb
CHANGED
@@ -1,6 +1,9 @@
 # -*- encoding : utf-8 -*-
-require "
+require "refinements/hash_refinements"
+
+require "words_counted/tokeniser"
 require "words_counted/counter"
+require "words_counted/version"
 
 begin
   require "pry"
@@ -9,10 +12,14 @@ end
 
 module WordsCounted
   def self.count(string, options = {})
-
+    tokens = Tokeniser.new(string).tokenise(options)
+    Counter.new(tokens)
   end
 
   def self.from_file(path, options = {})
-
+    tokens = File.open(path) do |file|
+      Tokeniser.new(file.read).tokenise(options)
+    end
+    Counter.new(tokens)
   end
 end
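With tokenisation extracted, the module-level helpers are now thin glue: tokenise the input (a string, or a file's contents), then wrap the tokens in a `Counter`. Usage, mirroring the specs later in this diff:

counter = WordsCounted.count("We are all in the gutter, but some of us are looking at the stars.")
counter.token_count   # => 15

counter = WordsCounted.from_file("spec/support/the_hart_and_the_hunter.txt", exclude: "hunter")
counter.token_count   # => 135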
data/spec/words_counted/counter_spec.rb
CHANGED
@@ -3,240 +3,85 @@ require_relative "../spec_helper"
 
 module WordsCounted
   describe Counter do
-    let(:counter)
-
-
-      it "sets @options" do
-        expect(counter.instance_variables).to include(:@options)
-      end
-
-      it "sets @char_count" do
-        expect(counter.instance_variables).to include(:@char_count)
-      end
-
-      it "sets @words" do
-        expect(counter.instance_variables).to include(:@words)
-      end
-
-      it "sets @word_occurrences" do
-        expect(counter.instance_variables).to include(:@word_occurrences)
-      end
-
-      it "sets @word_lengths" do
-        expect(counter.instance_variables).to include(:@word_lengths)
-      end
+    let(:counter) do
+      tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+      Counter.new(tokens)
     end
 
-    describe "
-      it "
-        expect(counter.
-      end
-
-      it "splits words" do
-        expect(counter.words).to eq(%w[we are all in the gutter but some of us are looking at the stars])
-      end
-
-      it "removes special characters" do
-        counter = Counter.new("Hello! # $ % 12345 * & % How do you do?")
-        expect(counter.words).to eq(%w[hello how do you do])
-      end
-
-      it "counts hyphenated words as one" do
-        counter = Counter.new("I am twenty-two.")
-        expect(counter.words).to eq(%w[i am twenty-two])
-      end
-
-      it "does not split words on apostrophe" do
-        counter = Counter.new("Bust 'em! Them be Jim's bastards'.")
-        expect(counter.words).to eq(%w[bust 'em them be jim's bastards'])
-      end
-
-      it "does not split on unicode chars" do
-        counter = Counter.new("São Paulo")
-        expect(counter.words).to eq(%w[são paulo])
-      end
-
-      it "it accepts a string filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-
-      it "it accepts a string filter with multiple words" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
-        expect(counter.words).to eq(%w[that trevor])
-      end
-
-      it "filters words in uppercase when using a string filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-
-      it "accepts a regexp filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-
-      it "accepts an array filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
-        expect(counter.words).to eq(%w[magnificent trevor])
-      end
-
-      it "accepts a lambda filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) { w == 'that' })
-        expect(counter.words).to eq(%w[was magnificent trevor])
-      end
-
-      it "accepts a custom regexp" do
-        counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
-        expect(counter.words).to eq(["i", "am", "007"])
-      end
-
-      it "char_count should be calculated after the filter is applied" do
-        counter = Counter.new("I am Legend.", exclude: "I am")
-        expect(counter.char_count).to eq(6)
-      end
-    end
-
-    describe "word_count" do
-      it "returns the correct word count" do
-        expect(counter.word_count).to eq(15)
+    describe "initialize" do
+      it "sets @tokens" do
+        expect(counter.instance_variables).to include(:@tokens)
       end
     end
 
-    describe "
-      it "returns
-        expect(counter.
-      end
-
-      it "treats capitalized words as the same word" do
-        counter = Counter.new("Bad, bad, piggy!")
-        expect(counter.word_occurrences).to eq({ "bad" => 2, "piggy" => 1 })
+    describe "#token_count" do
+      it "returns the correct number of tokens" do
+        expect(counter.token_count).to eq(6)
      end
    end
 
-    describe "
-      it "returns
-        expect(counter.
-      end
-
-      it "returns a two dimensional array sorted by descending word occurrence" do
-        counter = Counter.new("Blue, green, green, green, orange, green, orange, red, orange, red")
-        expect(counter.sorted_word_occurrences).to eq([ ["green", 4], ["orange", 3], ["red", 2], ["blue", 1] ])
+    describe "#uniq_token_count" do
+      it "returns the number of unique tokens" do
+        expect(counter.uniq_token_count).to eq(3)
      end
    end
 
-    describe "
-      it "returns
-        expect(counter.
-      end
-
-      it "returns highest occuring words" do
-        counter = Counter.new("Orange orange Apple apple banana")
-        expect(counter.most_occurring_words).to eq([["orange", 2],["apple", 2]])
+    describe "#char_count" do
+      it "returns the correct number of chars" do
+        expect(counter.char_count).to eq(26)
      end
    end
 
-    describe
-      it "returns a
-
-
-
-
-        counter = Counter.new("One two three.")
-        expect(counter.word_lengths).to eq({ "one" => 3, "two" => 3, "three" => 5 })
+    describe "#token_frequency" do
+      it "returns a two-dimensional array where each member array is a token and its frequency in descending order" do
+        expected = [
+          ['three', 3], ['woot', 2], ['one', 1]
+        ]
+        expect(counter.token_frequency).to eq(expected)
      end
    end
 
-    describe "
-      it "returns
-
-
-
-
-        counter = Counter.new("I am not certain of that")
-        expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["i", 1] ])
+    describe "#token_lengths" do
+      it "returns a two-dimensional array where each member array is a token and its length in descending order" do
+        expected = [
+          ['three', 5], ['woot', 4], ['one', 3]
+        ]
+        expect(counter.token_lengths).to eq(expected)
      end
    end
 
-    describe "
-      it "returns
-
-
-
-
-        counter = Counter.new("Those whom the gods love grow young.")
-        expect(counter.longest_words).to eq([["those", 5],["young", 5]])
-      end
-    end
-
-    describe "word_density" do
-      it "returns an array" do
-        expect(counter.word_density).to be_a(Array)
-      end
-
-      it "returns words and their density in percent" do
-        counter = Counter.new("His name was Major, major Major Major.")
-        expect(counter.word_density).to eq([["major", 57.14], ["was", 14.29], ["name", 14.29], ["his", 14.29]])
+    describe "#token_density" do
+      it "returns a two-dimensional array where each member array is a token and its density in descending order" do
+        expected = [
+          ['three', 0.5], ['woot', 0.33], ['one', 0.17]
+        ]
+        expect(counter.token_density).to eq(expected)
      end
 
      it "accepts a precision" do
-
-
+        expected = [
+          ['three', 0.5], ['woot', 0.3333], ['one', 0.1667]
+        ]
+        expect(counter.token_density(precision: 4)).to eq(expected)
      end
    end
 
-    describe "
-      it "returns
-
-
-
-
-      it "returns the number of chars in the passed in string after the filter is applied" do
-        counter = Counter.new("His name was Major, major Major Major.", exclude: "Major")
-        expect(counter.char_count).to eq(10)
-      end
-    end
-
-    describe "average_chars_per_word" do
-      it "returns the average number of chars per word" do
-        counter = Counter.new("His name was major, Major Major Major.")
-        expect(counter.average_chars_per_word).to eq(4.29)
-      end
-
-      it "returns the average number of chars per word after the filter is applied" do
-        counter = Counter.new("His name was Major, Major Major Major.", exclude: "Major")
-        expect(counter.average_chars_per_word).to eq(3.33)
-      end
-
-      it "accepts precision" do
-        counter = Counter.new("This line should have 39 characters minus spaces.")
-        expect(counter.average_chars_per_word(4)).to eq(5.5714)
+    describe "#most_frequent_tokens" do
+      it "returns a hash of the tokens with the highest frequency, where each key is a token, and each value is its frequency" do
+        expected = {
+          'three' => 3
+        }
+        expect(counter.most_frequent_tokens).to eq(expected)
      end
    end
 
-    describe "
-      it "returns
-
-
-
-
-        counter = Counter.new("Up down. Down up.")
-        expect(counter.unique_word_count).to eq(2)
+    describe "#longest_tokens" do
+      it "returns a hash of the tokens with the highest length, where each key is a token, and each value is its length" do
+        expected = {
+          'three' => 5
+        }
+        expect(counter.longest_tokens).to eq(expected)
      end
    end
  end
-
-  describe "count" do
-    it "returns count for a single word" do
-      counter = Counter.new("I am so clever that sometimes I don't understand a single word of what I am saying.")
-      expect(counter.count("i")).to eq(3)
-    end
-  end
-
-  describe "from_file" do
-    it "opens and reads a text file" do
-      counter = WordsCounted.from_file('spec/support/the_hart_and_the_hunter.txt')
-      expect(counter.word_count).to eq(139)
-    end
-  end
 end
data/spec/words_counted/deprecated_spec.rb
ADDED
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+require_relative "../spec_helper"
+
+module WordsCounted
+  warn "Methods being tested are deprecated"
+
+  describe Counter do
+    let(:counter) do
+      tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+      Counter.new(tokens)
+    end
+
+    describe "#word_density" do
+      it "returns words and their density in percent" do
+        expected = [
+          ['three', 50.0], ['woot', 33.33], ['one', 16.67]
+        ]
+        expect(counter.word_density).to eq(expected)
+      end
+
+      it "accepts a precision" do
+        expected = [
+          ['three', 50.0], ['woot', 33.3333], ['one', 16.6667]
+        ]
+        expect(counter.word_density(4)).to eq(expected)
+      end
+    end
+
+    describe "#word_occurrences" do
+      it "returns a hash of words and their occurrences" do
+        expected = {
+          'three' => 3, 'woot' => 2, 'one' => 1
+        }
+        expect(counter.word_occurrences).to eq(expected)
+      end
+    end
+
+    describe "#sorted_word_occurrences" do
+      it "returns a two dimensional array sorted by descending word occurrence" do
+        expected = [
+          ['three', 3], ['woot', 2], ['one', 1]
+        ]
+        expect(counter.sorted_word_occurrences).to eq(expected)
+      end
+    end
+
+    describe "#word_lengths" do
+      it "returns a hash of words and their lengths" do
+        expected = {
+          'three' => 5, 'woot' => 4, 'one' => 3
+        }
+        expect(counter.word_lengths).to eq(expected)
+      end
+    end
+
+    describe "#sorted_word_lengths" do
+      it "returns a two dimensional array sorted by descending word length" do
+        expected = [
+          ['three', 5], ['woot', 4], ['one', 3]
+        ]
+        expect(counter.sorted_word_lengths).to eq(expected)
+      end
+    end
+
+    describe "#longest_words" do
+      it "returns a two-dimensional array of the longest words and their lengths" do
+        expected = [
+          ['three', 5]
+        ]
+        expect(counter.longest_words).to eq(expected)
+      end
+    end
+
+    describe "#most_occurring_words" do
+      it "returns a two-dimensional array of words with the highest frequency and their frequencies" do
+        expected = [
+          ['three', 3]
+        ]
+        expect(counter.most_occurring_words).to eq(expected)
+      end
+    end
+
+    describe "#average_chars_per_word" do
+      it "returns the average number of chars per word" do
+        expect(counter.average_chars_per_word).to eq(4.33)
+      end
+
+      it "accepts precision" do
+        expect(counter.average_chars_per_word(4)).to eq(4.3333)
+      end
+    end
+
+    describe "#count" do
+      it "returns count for a single word" do
+        expect(counter.count('one')).to eq(1)
+      end
+    end
+  end
+end
data/spec/words_counted/tokeniser_spec.rb
ADDED
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+require_relative "../spec_helper"
+
+module WordsCounted
+  describe Tokeniser do
+    describe "initialize" do
+      it "sets @input" do
+        tokeniser = Tokeniser.new("Hello World!")
+        expect(tokeniser.instance_variables).to include(:@input)
+      end
+    end
+
+    describe "#tokenise" do
+      it "normalises tokens and returns an array" do
+        tokens = Tokeniser.new("Hello HELLO").tokenise
+        expect(tokens).to eq(%w[hello hello])
+      end
+
+      context "without arguments" do
+        it "removes non-alphanumeric chars" do
+          tokens = Tokeniser.new("Hello world! # $ % 12345 * & % ?").tokenise
+          expect(tokens).to eq(%w[hello world])
+        end
+
+        it "does not split on hyphens" do
+          tokens = Tokeniser.new("I am twenty-two.").tokenise
+          expect(tokens).to eq(%w[i am twenty-two])
+        end
+
+        it "does not split on apostrophe" do
+          tokens = Tokeniser.new("Bust 'em! It's Jim's gang.").tokenise
+          expect(tokens).to eq(%w[bust 'em it's jim's gang])
+        end
+
+        it "does not split on unicode chars" do
+          tokens = Tokeniser.new("Bayrūt").tokenise
+          expect(tokens).to eq(%w[bayrūt])
+        end
+      end
+
+      context "with `pattern` option" do
+        it "accepts a custom pattern" do
+          tokens = Tokeniser.new("We-Are-ALL").tokenise(pattern: /[^-]+/)
+          expect(tokens).to eq(%w[we are all])
+        end
+      end
+
+      context "with `exclude` option" do
+        context "as a string" do
+          let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+
+          it "accepts a string filter" do
+            tokens = tokeniser.tokenise(exclude: "magnificent")
+            expect(tokens).to eq(%w[that was trevor])
+          end
+
+          it "accepts a string filter with multiple space-delimited tokens" do
+            tokens = tokeniser.tokenise(exclude: "was magnificent")
+            expect(tokens).to eq(%w[that trevor])
+          end
+
+          it "normalises string filter" do
+            tokens = tokeniser.tokenise(exclude: "MAGNIFICENT")
+            expect(tokens).to eq(%w[that was trevor])
+          end
+        end
+
+        context "as a regular expression" do
+          it "filters on match" do
+            tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+            tokens = tokeniser.tokenise(exclude: /magnificent/i)
+            expect(tokens).to eq(%w[that was trevor])
+          end
+        end
+
+        context "as a lambda" do
+          it "calls lambda" do
+            tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+            tokens = tokeniser.tokenise(exclude: ->(token) { token.length < 5 })
+            expect(tokens).to eq(%w[magnificent trevor])
+          end
+
+          it "accepts a symbol for shorthand notation" do
+            tokeniser = Tokeniser.new("That was magnificent, محمد.")
+            tokens = tokeniser.tokenise(exclude: :ascii_only?)
+            expect(tokens).to eq(%w[محمد])
+          end
+        end
+
+        context "as an array" do
+          let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+
+          it "accepts an array of strings" do
+            tokens = tokeniser.tokenise(exclude: ["magnificent"])
+            expect(tokens).to eq(%w[that was trevor])
+          end
+
+          it "accepts an array of regular expressions" do
+            tokens = tokeniser.tokenise(exclude: [/that/, /was/])
+            expect(tokens).to eq(%w[magnificent trevor])
+          end
+
+          it "accepts an array of lambdas" do
+            filters = [
+              ->(token) { token.length < 4 },
+              ->(token) { token.length > 6 }
+            ]
+            tokens = tokeniser.tokenise(exclude: filters)
+            expect(tokens).to eq(%w[that trevor])
+          end
+
+          it "accepts a mixed array" do
+            filters = [
+              "that",
+              ->(token) { token.length < 4 },
+              /magnificent/
+            ]
+            tokens = tokeniser.tokenise(exclude: filters)
+            expect(tokens).to eq(["trevor"])
+          end
+        end
+
+        context "with an invalid filter" do
+          it "raises an `ArgumentError`" do
+            expect {
+              Tokeniser.new("Hello world!").tokenise(exclude: 1)
+            }.to raise_error(ArgumentError)
+          end
+        end
+      end
+    end
+  end
+end
data/spec/words_counted_spec.rb
ADDED
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+require_relative "spec_helper"
+
+describe WordsCounted do
+  describe ".from_file" do
+    let(:file_path) { "spec/support/the_hart_and_the_hunter.txt" }
+
+    it "opens and reads a text file" do
+      counter = WordsCounted.from_file(file_path)
+      expect(counter.token_count).to eq(139)
+    end
+
+    it "opens and reads a text file with options" do
+      counter = WordsCounted.from_file(file_path, exclude: "hunter")
+      expect(counter.token_count).to eq(135)
+    end
+  end
+
+  describe ".count" do
+    let(:string) do
+      "We are all in the gutter, but some of us are looking at the stars."
+    end
+
+    it "returns a counter instance with given input as tokens" do
+      counter = WordsCounted.count(string)
+      expect(counter.token_count).to eq(15)
+    end
+
+    it "returns a counter instance with given input and options" do
+      counter = WordsCounted.count(string, exclude: "the gutter")
+      expect(counter.token_count).to eq(12)
+    end
+  end
+end
data/words_counted.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
   spec.version       = WordsCounted::VERSION
   spec.authors       = ["Mohamad El-Husseini"]
   spec.email         = ["husseini.mel@gmail.com"]
-  spec.description   = %q{A Ruby
+  spec.description   = %q{A Ruby natural language processor to extract stats from text, such as word count and more.}
   spec.summary       = %q{See README.}
   spec.homepage      = "https://github.com/abitdodgy/words_counted"
   spec.license       = "MIT"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: words_counted
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 1.0.0
 platform: ruby
 authors:
 - Mohamad El-Husseini
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2015-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -66,7 +66,8 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description: A Ruby
+description: A Ruby natural language processor to extract stats from text, such as
+  word count and more.
 email:
 - husseini.mel@gmail.com
 executables: []
@@ -74,19 +75,28 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".hound.yml"
 - ".rspec"
+- ".ruby-style.yml"
+- ".travis.yml"
 - ".yardopts"
 - CHANGELOG.md
 - Gemfile
 - LICENSE.txt
 - README.md
 - Rakefile
+- lib/refinements/hash_refinements.rb
 - lib/words_counted.rb
 - lib/words_counted/counter.rb
+- lib/words_counted/deprecated.rb
+- lib/words_counted/tokeniser.rb
 - lib/words_counted/version.rb
 - spec/spec_helper.rb
 - spec/support/the_hart_and_the_hunter.txt
 - spec/words_counted/counter_spec.rb
+- spec/words_counted/deprecated_spec.rb
+- spec/words_counted/tokeniser_spec.rb
+- spec/words_counted_spec.rb
 - words_counted.gemspec
 homepage: https://github.com/abitdodgy/words_counted
 licenses:
@@ -108,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: See README.
@@ -116,3 +126,6 @@ test_files:
 - spec/spec_helper.rb
 - spec/support/the_hart_and_the_hunter.txt
 - spec/words_counted/counter_spec.rb
+- spec/words_counted/deprecated_spec.rb
+- spec/words_counted/tokeniser_spec.rb
+- spec/words_counted_spec.rb