words_counted 0.1.5 → 1.0.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.hound.yml +2 -0
- data/.ruby-style.yml +2 -0
- data/.travis.yml +9 -0
- data/.yardopts +3 -2
- data/CHANGELOG.md +24 -0
- data/lib/refinements/hash_refinements.rb +10 -0
- data/lib/words_counted/counter.rb +101 -69
- data/lib/words_counted/deprecated.rb +76 -0
- data/lib/words_counted/tokeniser.rb +139 -0
- data/lib/words_counted/version.rb +1 -1
- data/lib/words_counted.rb +10 -3
- data/spec/words_counted/counter_spec.rb +49 -204
- data/spec/words_counted/deprecated_spec.rb +99 -0
- data/spec/words_counted/tokeniser_spec.rb +133 -0
- data/spec/words_counted_spec.rb +34 -0
- data/words_counted.gemspec +1 -1
- metadata +17 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
---
SHA1:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: d6302c1802d7da076d1ddafdcbe70e46a89c8f33
+  data.tar.gz: 873efaa5e58f883e0dde99094ca53952d46217c7
SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 0e6ddb8db9c060432066d86aed2efe20aa95dee2019d54c950007170c0ffbbcff16fa27a0377419b0d1b718be1625a4376ee9c687a4ae67073aaffe9ef363157
+  data.tar.gz: 9df2a0cefe14b9ac77d1741f8980d1b1fb4d8b770738fbd69c8870f73da4b653a1d9462ac8813f88dc48af36e03718773523985f5be0f4999177a6b0a2a89662
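These are the digests of the two archives inside the gem package. As a quick sketch, a downloaded artifact can be checked against them with Ruby's standard library (the local file name is assumed):

    require "digest"

    Digest::SHA1.file("metadata.gz").hexdigest
    # => "d6302c1802d7da076d1ddafdcbe70e46a89c8f33" when the archive is intact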
data/.gitignore
CHANGED
data/.hound.yml
ADDED
data/.ruby-style.yml
ADDED
data/.travis.yml
ADDED
data/.yardopts
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,27 @@
+## Version 1.0
+
+This version brings many improvements to code organisation. The tokeniser has been extracted into its own class. All methods in `Counter` have either been renamed or deprecated. Deprecated methods and their tests have been moved into their own modules. Using them will trigger warnings with the upgrade instructions outlined below.
+
+1. Extracted tokenisation behaviour from `Counter` into a `Tokeniser` class.
+2. Deprecated all methods that have `word` in their name. Most are renamed such that `word` becomes `token`. They will be removed in version 1.1.
+  - Deprecated `word_count` in favor of `token_count`
+  - Deprecated `unique_word_count` in favor of `uniq_token_count`
+  - Deprecated `word_occurrences` and `sorted_word_occurrences` in favor of `token_frequency`
+  - Deprecated `word_lengths` and `sorted_word_lengths` in favor of `token_lengths`
+  - Deprecated `word_density` in favor of `token_density`
+  - Deprecated `most_occurring_words` in favor of `most_frequent_tokens`
+  - Deprecated `longest_words` in favor of `longest_tokens`
+  - Deprecated `average_chars_per_word` in favor of `average_chars_per_token`
+  - Deprecated `count`. Use `Array#count` instead.
+3. `token_lengths`, which replaces `word_lengths`, returns a sorted two-dimensional array instead of a hash. It behaves exactly like `sorted_word_lengths`, which has been deprecated. Use `token_lengths.to_h` for the old behaviour.
+4. `token_frequency`, which replaces `word_occurrences`, returns a sorted two-dimensional array instead of a hash. It behaves like `sorted_word_occurrences`, which has been deprecated. Use `token_frequency.to_h` for the old behaviour.
+5. `token_density`, which replaces `word_density`, returns a decimal rounded to a precision of 2, not a percentage. Use `token_density * 100` for the old behaviour.
+6. Added a refinement to `Hash` under `lib/refinements/hash_refinements.rb` to quickly sort by descending value.
+7. Extracted all deprecated methods into their own module, and their tests into their own spec file.
+8. Added a base `words_counted_spec.rb` and moved the `.from_file` test to the new file.
+9. Added Travis continuous integration.
+10. Added documentation to the code.
+
 ## Version 0.1.5

 1. Removed `to_f` from the dividend in `average_chars_per_word` and `word_densities`. The divisor is a float, and dividing by a float returns a float.
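To make the renames concrete, here is a short before/after sketch against the new API (the sample string is arbitrary; output values follow the specs later on this page):

    counter = WordsCounted.count("one three three three woot woot")

    # 0.1.5 style -- still works in 1.0, but emits deprecation warnings:
    counter.word_count               # => 6
    counter.sorted_word_occurrences  # => [["three", 3], ["woot", 2], ["one", 1]]

    # 1.0 style:
    counter.token_count              # => 6
    counter.token_frequency          # => [["three", 3], ["woot", 2], ["one", 1]]
    counter.token_frequency.to_h     # => { "three" => 3, "woot" => 2, "one" => 1 }, the old word_occurrences shape
    counter.token_density.map { |token, d| [token, d * 100] }  # density as a percentage again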
data/lib/words_counted/counter.rb
CHANGED
@@ -1,96 +1,128 @@
# -*- encoding : utf-8 -*-
-module WordsCounted
-  class Counter
-    attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-    WORD_REGEXP = /[\p{Alpha}\-']+/

-    def self.from_file(path, options = {})
-      File.open(path) do |file|
-        new file.read, options
-      end
-    end
-
-    def initialize(string, options = {})
-      @options = options
-      exclude = filter_proc(options[:exclude])
-      @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-      @char_count = words.join.size
-      @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-      @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-    end
+require "words_counted/deprecated"

-      …
-    end
+module WordsCounted
+  using Refinements::HashRefinements

-      …
-    end
+  class Counter
+    include Deprecated

-      …
-      (char_count / word_count.to_f).round(precision)
-    end
+    attr_reader :tokens

-    def …
+    def initialize(tokens)
+      @tokens = tokens
    end

-      …
+    # Returns the number of tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_count
+    #  # => 6
+    #
+    # @return [Integer] The number of tokens.
+    def token_count
+      tokens.size
    end

-      …
+    # Returns the number of unique tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).uniq_token_count
+    #  # => 3
+    #
+    # @return [Integer] The number of unique tokens.
+    def uniq_token_count
+      tokens.uniq.size
    end

-      …
+    # Returns the character count of all tokens.
+    #
+    # @example
+    #  Counter.new(%w[one two]).char_count
+    #  # => 6
+    #
+    # @return [Integer] The total char count of tokens.
+    def char_count
+      tokens.join.size
    end

-      …
+    # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+    # The array is sorted by frequency in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two two three three three]).token_frequency
+    #  # => [ ['three', 3], ['two', 2], ['one', 1] ]
+    #
+    # @return [Array<Array<String, Integer>>]
+    def token_frequency
+      tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
    end

-      …
+    # Returns a sorted two-dimensional array where each member array is a token and its length.
+    # The array is sorted by length in descending order.
+    #
+    # @example
+    #  Counter.new(%w[one two three four five]).token_lengths
+    #  # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+    #
+    # @return [Array<Array<String, Integer>>]
+    def token_lengths
+      tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
    end

-      …
+    # Returns a sorted two-dimensional array where each member array is a token and its density
+    # as a float, rounded to a precision of two decimal places. It accepts a `precision`
+    # argument which defaults to `2`.
+    #
+    # @example
+    #  Counter.new(%w[Maj. Major Major Major]).token_density
+    #  # => [ ['major', 0.75], ['maj', 0.25] ]
+    #
+    # @example With `precision`
+    #  Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+    #  # => [ ['major', 0.75], ['maj', 0.25] ]
+    #
+    # @param [Integer] precision The number of decimal places to round density to.
+    # @return [Array<Array<String, Float>>]
+    def token_density(precision: 2)
+      token_frequency.each_with_object({}) { |(token, freq), hash|
+        hash[token] = (freq / token_count.to_f).round(precision)
+      }.sort_by_value_desc
    end

-      …
+    # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+    #
+    # @example
+    #  Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+    #  # => { 'two' => 2, 'twice' => 2 }
+    #
+    # @return [Hash<String, Integer>]
+    def most_frequent_tokens
+      token_frequency.group_by(&:last).max.last.to_h
    end

-      …
+    # Returns a hash of tokens and their lengths for tokens with the highest length.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).longest_tokens
+    #  # => { 'three' => 5, 'seven' => 5 }
+    #
+    # @return [Hash<String, Integer>]
+    def longest_tokens
+      token_lengths.group_by(&:last).max.last.to_h
    end

-      …
-      }
-    elsif regexp_filter = Regexp.try_convert(filter)
-      Proc.new { |word| word =~ regexp_filter }
-    elsif filter.respond_to?(:to_proc)
-      filter.to_proc
-    else
-      raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-    end
+    # Returns the average char count per token rounded to a precision of two decimal places.
+    # Accepts a `precision` argument.
+    #
+    # @example
+    #  Counter.new(%w[one three five seven]).average_chars_per_token
+    #  # => 4.25
+    #
+    # @return [Float] The average char count per token.
+    def average_chars_per_token(precision: 2)
+      (char_count / token_count.to_f).round(precision)
    end
  end
end
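`token_frequency`, `token_lengths`, and `token_density` all rely on `sort_by_value_desc`, which comes from the `HashRefinements` refinement activated at the top of the file. The diff for `lib/refinements/hash_refinements.rb` is not shown on this page; given how the method is used here, a minimal sketch of such a refinement could look like this (an illustration, not the gem's actual source):

    # lib/refinements/hash_refinements.rb -- illustrative sketch only
    module Refinements
      module HashRefinements
        refine Hash do
          # Sort by value in descending order, returning [key, value] pairs.
          def sort_by_value_desc
            sort_by { |_key, value| value }.reverse
          end
        end
      end
    end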
data/lib/words_counted/deprecated.rb
ADDED
@@ -0,0 +1,76 @@
# -*- encoding : utf-8 -*-
module WordsCounted
  module Deprecated
    # @deprecated use `Counter#token_count`
    def word_count
      warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
      token_count
    end

    # @deprecated use `Counter#uniq_token_count`
    def unique_word_count
      warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
      uniq_token_count
    end

    # @deprecated use `Counter#token_frequency`
    def word_occurrences
      warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
      warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour"
      token_frequency.to_h
    end

    # @deprecated use `Counter#token_lengths`
    def word_lengths
      warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
      warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for old behaviour"
      token_lengths.to_h
    end

    # @deprecated use `Counter#token_density`
    def word_density(precision = 2)
      warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
      warn "`Counter#token_density` returns density as a decimal, not a percent"

      token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
    end

    # @deprecated use `Counter#token_frequency`
    def sorted_word_occurrences
      warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
      token_frequency
    end

    # @deprecated use `Counter#token_lengths`
    def sorted_word_lengths
      warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
      token_lengths
    end

    # @deprecated use `Counter#most_frequent_tokens`
    def most_occurring_words
      warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
      warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for old behaviour."
      most_frequent_tokens.to_a
    end

    # @deprecated use `Counter#longest_tokens`
    def longest_words
      warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
      warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for old behaviour."
      longest_tokens.to_a
    end

    # @deprecated use `Counter#average_chars_per_token`
    def average_chars_per_word(precision = 2)
      warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
      average_chars_per_token(precision: precision)
    end

    # @deprecated use `Array#count`
    def count(token)
      warn "`Counter#count` is deprecated, please use `Array#count`"
      tokens.count(token.downcase)
    end
  end
end
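Because `Counter` includes this module, the 0.1.5 API keeps working in 1.0 while printing migration hints to standard error. For example:

    counter = WordsCounted.count("one three three three")
    counter.word_lengths
    # stderr: `Counter#word_lengths` is deprecated, please use `Counter#token_lengths`
    # stderr: `Counter#token_lengths` returns a sorted array of arrays, not a hash. ...
    # => { "three" => 5, "one" => 3 }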
data/lib/words_counted/tokeniser.rb
ADDED
@@ -0,0 +1,139 @@
# -*- encoding : utf-8 -*-
module WordsCounted
  class Tokeniser
    # Takes a string and breaks it into an array of tokens.
    # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
    #
    # @example
    #  tokeniser = WordsCounted::Tokeniser.new("We are all in the gutter, but some of us are looking at the stars.")
    #  tokeniser.tokenise(exclude: "We are all in the gutter")
    #  # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']

    # Default tokenisation strategy
    TOKEN_REGEXP = /[\p{Alpha}\-']+/

    # Initialises state with a string that will be tokenised.
    #
    # @param [String] input The string to tokenise.
    # @return [Tokeniser]
    def initialize(input)
      @input = input
    end

    # Converts a string into an array of tokens using a regular expression.
    # If a regexp is not provided a default one is used. See {Tokeniser::TOKEN_REGEXP}.
    #
    # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
    # a regular expression, a lambda, a symbol, or an array of one or more of those types.
    # This allows for powerful and flexible tokenisation strategies.
    #
    # @example
    #  WordsCounted::Tokeniser.new("Hello World").tokenise
    #  # => ['hello', 'world']
    #
    # @example With `pattern`
    #  WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
    #  # => ['hello', 'mohamad']
    #
    # @example With `exclude` as a string
    #  WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
    #  # => ['sami']
    #
    # @example With `exclude` as a regexp
    #  WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
    #  # => ['dani']
    #
    # @example With `exclude` as a lambda
    #  WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(exclude: ->(token) { token.length > 6 })
    #  # => ['sami']
    #
    # @example With `exclude` as a symbol
    #  WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
    #  # => ['محمد']
    #
    # @example With `exclude` as an array of strings
    #  WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(exclude: ["goodbye hello"])
    #  # => ['sami', 'and', 'dani']
    #
    # @example With `exclude` as an array of regular expressions
    #  WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(exclude: [/goodbye/i, /and/i])
    #  # => ['hello', 'dani']
    #
    # @example With `exclude` as an array of lambdas
    #  t = WordsCounted::Tokeniser.new("Special Agent 007")
    #  t.tokenise(exclude: [->(t) { t.to_i.odd? }, ->(t) { t.length > 5 }])
    #  # => ['agent']
    #
    # @example With `exclude` as a mixed array
    #  t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
    #  t.tokenise(exclude: [:ascii_only?, /محمد/, ->(t) { t.length > 6 }, "و"])
    #  # => ["هي", "سامي", "وداني"]
    #
    # @param [Regexp] pattern The pattern used to match tokens.
    # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply.
    # @return [Array] The array of filtered tokens.
    def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
      filter_proc = filter_to_proc(exclude)
      @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
    end

    private

    # This method converts any argument into a callable object. The return value of this
    # is then used to determine whether a token should be excluded from the final list.
    #
    # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
    # of any combination of those types.
    #
    # If `filter` is a string, see {Tokeniser#filter_proc_from_string}.
    # If `filter` is an array, see {Tokeniser#filter_procs_from_array}.
    #
    # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a `lambda`
    # is returned that checks the token for a match. If a symbol is passed, it is converted to
    # a proc.
    #
    # This method depends on `nil` responding to `to_a` with an empty array, which
    # avoids having to check whether `exclude` was passed.
    #
    # @api private
    def filter_to_proc(filter)
      if filter.respond_to?(:to_a)
        filter_procs_from_array(filter)
      elsif filter.respond_to?(:to_str)
        filter_proc_from_string(filter)
      elsif regexp_filter = Regexp.try_convert(filter)
        ->(token) {
          token =~ regexp_filter
        }
      elsif filter.respond_to?(:to_proc)
        filter.to_proc
      else
        raise ArgumentError,
          "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
      end
    end

    # Converts an array of filters to an array of lambdas, and returns a lambda that calls
    # each lambda in the resulting array. If any lambda returns true the token is excluded
    # from the final list.
    #
    # @api private
    def filter_procs_from_array(filter)
      filter_procs = Array(filter).map &method(:filter_to_proc)
      ->(token) {
        filter_procs.any? { |pro| pro.call(token) }
      }
    end

    # Converts a string filter to an array, and returns a lambda
    # that returns true if the token is included in the array.
    #
    # @api private
    def filter_proc_from_string(filter)
      normalized_exclusion_list = filter.split.map(&:downcase)
      ->(token) {
        normalized_exclusion_list.include?(token)
      }
    end
  end
end
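Note how `filter_to_proc` calls itself recursively for each array member, so filters of different types compose freely:

    tokeniser = WordsCounted::Tokeniser.new("That was magnificent, Trevor.")
    tokeniser.tokenise(exclude: ["that", /was/, ->(t) { t.length > 10 }])
    # => ["trevor"]   (the string list drops "that", the regexp drops "was",
    #                  and the lambda drops "magnificent")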
data/lib/words_counted.rb
CHANGED
@@ -1,6 +1,9 @@
# -*- encoding : utf-8 -*-
-require "…"
+require "refinements/hash_refinements"
+
+require "words_counted/tokeniser"
require "words_counted/counter"
+require "words_counted/version"

begin
  require "pry"
@@ -9,10 +12,14 @@ end

module WordsCounted
  def self.count(string, options = {})
-    …
+    tokens = Tokeniser.new(string).tokenise(options)
+    Counter.new(tokens)
  end

  def self.from_file(path, options = {})
-    …
+    tokens = File.open(path) do |file|
+      Tokeniser.new(file.read).tokenise(options)
+    end
+    Counter.new(tokens)
  end
end
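Both helpers are now thin wrappers that wire a `Tokeniser` to a `Counter`; the long-hand equivalent of `WordsCounted.count` is:

    tokens  = WordsCounted::Tokeniser.new("We are all in the gutter").tokenise(exclude: "the")
    counter = WordsCounted::Counter.new(tokens)
    counter.token_count # => 5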
data/spec/words_counted/counter_spec.rb
CHANGED
@@ -3,240 +3,85 @@ require_relative "../spec_helper"

module WordsCounted
  describe Counter do
-    let(:counter) …
-
-    describe "…" do
-      it "sets @options" do
-        expect(counter.instance_variables).to include(:@options)
-      end
-
-      it "sets @char_count" do
-        expect(counter.instance_variables).to include(:@char_count)
-      end
-
-      it "sets @words" do
-        expect(counter.instance_variables).to include(:@words)
-      end
-
-      it "sets @word_occurrences" do
-        expect(counter.instance_variables).to include(:@word_occurrences)
-      end
-
-      it "sets @word_lengths" do
-        expect(counter.instance_variables).to include(:@word_lengths)
-      end
+    let(:counter) do
+      tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+      Counter.new(tokens)
    end

-    describe "…" do
-      it "…" do
-        expect(counter.…)
-      end
-
-      it "splits words" do
-        expect(counter.words).to eq(%w[we are all in the gutter but some of us are looking at the stars])
-      end
-
-      it "removes special characters" do
-        counter = Counter.new("Hello! # $ % 12345 * & % How do you do?")
-        expect(counter.words).to eq(%w[hello how do you do])
-      end
-
-      it "counts hyphenated words as one" do
-        counter = Counter.new("I am twenty-two.")
-        expect(counter.words).to eq(%w[i am twenty-two])
-      end
-
-      it "does not split words on apostrophe" do
-        counter = Counter.new("Bust 'em! Them be Jim's bastards'.")
-        expect(counter.words).to eq(%w[bust 'em them be jim's bastards'])
-      end
-
-      it "does not split on unicode chars" do
-        counter = Counter.new("São Paulo")
-        expect(counter.words).to eq(%w[são paulo])
-      end
-
-      it "it accepts a string filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-
-      it "it accepts a string filter with multiple words" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
-        expect(counter.words).to eq(%w[that trevor])
-      end
-
-      it "filters words in uppercase when using a string filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-
-      it "accepts a regexp filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
-        expect(counter.words).to eq(%w[that was trevor])
-      end
-
-      it "accepts an array filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
-        expect(counter.words).to eq(%w[magnificent trevor])
-      end
-
-      it "accepts a lambda filter" do
-        counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) { w == 'that' })
-        expect(counter.words).to eq(%w[was magnificent trevor])
-      end
-
-      it "accepts a custom regexp" do
-        counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
-        expect(counter.words).to eq(["i", "am", "007"])
-      end
-
-      it "char_count should be calculated after the filter is applied" do
-        counter = Counter.new("I am Legend.", exclude: "I am")
-        expect(counter.char_count).to eq(6)
-      end
-    end
-
-    describe "word_count" do
-      it "returns the correct word count" do
-        expect(counter.word_count).to eq(15)
+    describe "initialize" do
+      it "sets @tokens" do
+        expect(counter.instance_variables).to include(:@tokens)
      end
    end

-    describe "…" do
-      it "returns …" do
-        expect(counter.…)
-      end
-
-      it "treats capitalized words as the same word" do
-        counter = Counter.new("Bad, bad, piggy!")
-        expect(counter.word_occurrences).to eq({ "bad" => 2, "piggy" => 1 })
+    describe "#token_count" do
+      it "returns the correct number of tokens" do
+        expect(counter.token_count).to eq(6)
      end
    end

-    describe "…" do
-      it "returns …" do
-        expect(counter.…)
-      end
-
-      it "returns a two dimensional array sorted by descending word occurrence" do
-        counter = Counter.new("Blue, green, green, green, orange, green, orange, red, orange, red")
-        expect(counter.sorted_word_occurrences).to eq([ ["green", 4], ["orange", 3], ["red", 2], ["blue", 1] ])
+    describe "#uniq_token_count" do
+      it "returns the number of unique tokens" do
+        expect(counter.uniq_token_count).to eq(3)
      end
    end

-    describe "…" do
-      it "returns …" do
-        expect(counter.…)
-      end
-
-      it "returns highest occuring words" do
-        counter = Counter.new("Orange orange Apple apple banana")
-        expect(counter.most_occurring_words).to eq([["orange", 2],["apple", 2]])
+    describe "#char_count" do
+      it "returns the correct number of chars" do
+        expect(counter.char_count).to eq(26)
      end
    end

-    describe "…" do
-      it "returns a …" do
-        …
-      end
-
-      it "…" do
-        counter = Counter.new("One two three.")
-        expect(counter.word_lengths).to eq({ "one" => 3, "two" => 3, "three" => 5 })
+    describe "#token_frequency" do
+      it "returns a two-dimensional array where each member array is a token and its frequency in descending order" do
+        expected = [
+          ['three', 3], ['woot', 2], ['one', 1]
+        ]
+        expect(counter.token_frequency).to eq(expected)
      end
    end

-    describe "…" do
-      it "returns …" do
-        …
-      end
-
-      it "…" do
-        counter = Counter.new("I am not certain of that")
-        expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["i", 1] ])
+    describe "#token_lengths" do
+      it "returns a two-dimensional array where each member array is a token and its length in descending order" do
+        expected = [
+          ['three', 5], ['woot', 4], ['one', 3]
+        ]
+        expect(counter.token_lengths).to eq(expected)
      end
    end

-    describe "…" do
-      it "returns …" do
-        …
-      end
-
-      it "…" do
-        counter = Counter.new("Those whom the gods love grow young.")
-        expect(counter.longest_words).to eq([["those", 5],["young", 5]])
-      end
-    end
-
-    describe "word_density" do
-      it "returns an array" do
-        expect(counter.word_density).to be_a(Array)
-      end
-
-      it "returns words and their density in percent" do
-        counter = Counter.new("His name was Major, major Major Major.")
-        expect(counter.word_density).to eq([["major", 57.14], ["was", 14.29], ["name", 14.29], ["his", 14.29]])
+    describe "#token_density" do
+      it "returns a two-dimensional array where each member array is a token and its density in descending order" do
+        expected = [
+          ['three', 0.5], ['woot', 0.33], ['one', 0.17]
+        ]
+        expect(counter.token_density).to eq(expected)
      end

      it "accepts a precision" do
-        …
+        expected = [
+          ['three', 0.5], ['woot', 0.3333], ['one', 0.1667]
+        ]
+        expect(counter.token_density(precision: 4)).to eq(expected)
      end
    end

-    describe "…" do
-      it "returns …" do
-        …
-      end
-
-      it "returns the number of chars in the passed in string after the filter is applied" do
-        counter = Counter.new("His name was Major, major Major Major.", exclude: "Major")
-        expect(counter.char_count).to eq(10)
-      end
-    end
-
-    describe "average_chars_per_word" do
-      it "returns the average number of chars per word" do
-        counter = Counter.new("His name was major, Major Major Major.")
-        expect(counter.average_chars_per_word).to eq(4.29)
-      end
-
-      it "returns the average number of chars per word after the filter is applied" do
-        counter = Counter.new("His name was Major, Major Major Major.", exclude: "Major")
-        expect(counter.average_chars_per_word).to eq(3.33)
-      end
-
-      it "accepts precision" do
-        counter = Counter.new("This line should have 39 characters minus spaces.")
-        expect(counter.average_chars_per_word(4)).to eq(5.5714)
+    describe "#most_frequent_tokens" do
+      it "returns a hash of the tokens with the highest frequency, where each key is a token and each value is its frequency" do
+        expected = {
+          'three' => 3
+        }
+        expect(counter.most_frequent_tokens).to eq(expected)
      end
    end

-    describe "…" do
-      it "returns …" do
-        …
-      end
-
-      it "…" do
-        counter = Counter.new("Up down. Down up.")
-        expect(counter.unique_word_count).to eq(2)
+    describe "#longest_tokens" do
+      it "returns a hash of the tokens with the highest length, where each key is a token and each value is its length" do
+        expected = {
+          'three' => 5
+        }
+        expect(counter.longest_tokens).to eq(expected)
      end
    end
  end
-
-    describe "count" do
-      it "returns count for a single word" do
-        counter = Counter.new("I am so clever that sometimes I don't understand a single word of what I am saying.")
-        expect(counter.count("i")).to eq(3)
-      end
-    end
-
-    describe "from_file" do
-      it "opens and reads a text file" do
-        counter = WordsCounted.from_file('spec/support/the_hart_and_the_hunter.txt')
-        expect(counter.word_count).to eq(139)
-      end
-    end
end
data/spec/words_counted/deprecated_spec.rb
ADDED
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
require_relative "../spec_helper"

module WordsCounted
  warn "Methods being tested are deprecated"

  describe Counter do
    let(:counter) do
      tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
      Counter.new(tokens)
    end

    describe "#word_density" do
      it "returns words and their density in percent" do
        expected = [
          ['three', 50.0], ['woot', 33.33], ['one', 16.67]
        ]
        expect(counter.word_density).to eq(expected)
      end

      it "accepts a precision" do
        expected = [
          ['three', 50.0], ['woot', 33.3333], ['one', 16.6667]
        ]
        expect(counter.word_density(4)).to eq(expected)
      end
    end

    describe "#word_occurrences" do
      it "returns a hash of words and their frequencies in descending order" do
        expected = {
          'three' => 3, 'woot' => 2, 'one' => 1
        }
        expect(counter.word_occurrences).to eq(expected)
      end
    end

    describe "#sorted_word_occurrences" do
      it "returns a two dimensional array sorted by descending word occurrence" do
        expected = [
          ['three', 3], ['woot', 2], ['one', 1]
        ]
        expect(counter.sorted_word_occurrences).to eq(expected)
      end
    end

    describe "#word_lengths" do
      it "returns a hash of words and their lengths in descending order" do
        expected = {
          'three' => 5, 'woot' => 4, 'one' => 3
        }
        expect(counter.word_lengths).to eq(expected)
      end
    end

    describe "#sorted_word_lengths" do
      it "returns a two dimensional array sorted by descending word length" do
        expected = [
          ['three', 5], ['woot', 4], ['one', 3]
        ]
        expect(counter.sorted_word_lengths).to eq(expected)
      end
    end

    describe "#longest_words" do
      it "returns a two-dimensional array of the longest words and their lengths" do
        expected = [
          ['three', 5]
        ]
        expect(counter.longest_words).to eq(expected)
      end
    end

    describe "#most_occurring_words" do
      it "returns a two-dimensional array of words with the highest frequency and their frequencies" do
        expected = [
          ['three', 3]
        ]
        expect(counter.most_occurring_words).to eq(expected)
      end
    end

    describe "#average_chars_per_word" do
      it "returns the average number of chars per word" do
        expect(counter.average_chars_per_word).to eq(4.33)
      end

      it "accepts precision" do
        expect(counter.average_chars_per_word(4)).to eq(4.3333)
      end
    end

    describe "#count" do
      it "returns count for a single word" do
        expect(counter.count('one')).to eq(1)
      end
    end
  end
end
data/spec/words_counted/tokeniser_spec.rb
ADDED
@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
require_relative "../spec_helper"

module WordsCounted
  describe Tokeniser do
    describe "initialize" do
      it "sets @input" do
        tokeniser = Tokeniser.new("Hello World!")
        expect(tokeniser.instance_variables).to include(:@input)
      end
    end

    describe "#tokenise" do
      it "normalises tokens and returns an array" do
        tokens = Tokeniser.new("Hello HELLO").tokenise
        expect(tokens).to eq(%w[hello hello])
      end

      context "without arguments" do
        it "removes non-alphanumeric chars" do
          tokens = Tokeniser.new("Hello world! # $ % 12345 * & % ?").tokenise
          expect(tokens).to eq(%w[hello world])
        end

        it "does not split on hyphens" do
          tokens = Tokeniser.new("I am twenty-two.").tokenise
          expect(tokens).to eq(%w[i am twenty-two])
        end

        it "does not split on apostrophes" do
          tokens = Tokeniser.new("Bust 'em! It's Jim's gang.").tokenise
          expect(tokens).to eq(%w[bust 'em it's jim's gang])
        end

        it "does not split on unicode chars" do
          tokens = Tokeniser.new("Bayrūt").tokenise
          expect(tokens).to eq(%w[bayrūt])
        end
      end

      context "with `pattern` option" do
        it "splits on a custom pattern" do
          tokens = Tokeniser.new("We-Are-ALL").tokenise(pattern: /[^-]+/)
          expect(tokens).to eq(%w[we are all])
        end
      end

      context "with `exclude` option" do
        context "as a string" do
          let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }

          it "accepts a string filter" do
            tokens = tokeniser.tokenise(exclude: "magnificent")
            expect(tokens).to eq(%w[that was trevor])
          end

          it "accepts a string filter with multiple space-delimited tokens" do
            tokens = tokeniser.tokenise(exclude: "was magnificent")
            expect(tokens).to eq(%w[that trevor])
          end

          it "normalises the string filter" do
            tokens = tokeniser.tokenise(exclude: "MAGNIFICENT")
            expect(tokens).to eq(%w[that was trevor])
          end
        end

        context "as a regular expression" do
          it "filters on match" do
            tokeniser = Tokeniser.new("That was magnificent, Trevor.")
            tokens = tokeniser.tokenise(exclude: /magnificent/i)
            expect(tokens).to eq(%w[that was trevor])
          end
        end

        context "as a lambda" do
          it "calls the lambda" do
            tokeniser = Tokeniser.new("That was magnificent, Trevor.")
            tokens = tokeniser.tokenise(exclude: ->(token) { token.length < 5 })
            expect(tokens).to eq(%w[magnificent trevor])
          end

          it "accepts a symbol for shorthand notation" do
            tokeniser = Tokeniser.new("That was magnificent, محمد.")
            tokens = tokeniser.tokenise(exclude: :ascii_only?)
            expect(tokens).to eq(%w[محمد])
          end
        end

        context "as an array" do
          let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }

          it "accepts an array of strings" do
            tokens = tokeniser.tokenise(exclude: ["magnificent"])
            expect(tokens).to eq(%w[that was trevor])
          end

          it "accepts an array of regular expressions" do
            tokens = tokeniser.tokenise(exclude: [/that/, /was/])
            expect(tokens).to eq(%w[magnificent trevor])
          end

          it "accepts an array of lambdas" do
            filters = [
              ->(token) { token.length < 4 },
              ->(token) { token.length > 6 }
            ]
            tokens = tokeniser.tokenise(exclude: filters)
            expect(tokens).to eq(%w[that trevor])
          end

          it "accepts a mixed array" do
            filters = [
              "that",
              ->(token) { token.length < 4 },
              /magnificent/
            ]
            tokens = tokeniser.tokenise(exclude: filters)
            expect(tokens).to eq(["trevor"])
          end
        end

        context "with an invalid filter" do
          it "raises an `ArgumentError`" do
            expect {
              Tokeniser.new("Hello world!").tokenise(exclude: 1)
            }.to raise_error(ArgumentError)
          end
        end
      end
    end
  end
end
data/spec/words_counted_spec.rb
ADDED
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
require_relative "spec_helper"

describe WordsCounted do
  describe ".from_file" do
    let(:file_path) { "spec/support/the_hart_and_the_hunter.txt" }

    it "opens and reads a text file" do
      counter = WordsCounted.from_file(file_path)
      expect(counter.token_count).to eq(139)
    end

    it "opens and reads a text file with options" do
      counter = WordsCounted.from_file(file_path, exclude: "hunter")
      expect(counter.token_count).to eq(135)
    end
  end

  describe ".count" do
    let(:string) do
      "We are all in the gutter, but some of us are looking at the stars."
    end

    it "returns a counter instance with given input as tokens" do
      counter = WordsCounted.count(string)
      expect(counter.token_count).to eq(15)
    end

    it "returns a counter instance with given input and options" do
      counter = WordsCounted.count(string, exclude: "the gutter")
      expect(counter.token_count).to eq(12)
    end
  end
end
data/words_counted.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
  spec.version = WordsCounted::VERSION
  spec.authors = ["Mohamad El-Husseini"]
  spec.email = ["husseini.mel@gmail.com"]
-  spec.description = %q{A Ruby …}
+  spec.description = %q{A Ruby natural language processor to extract stats from text, such as word count and more.}
  spec.summary = %q{See README.}
  spec.homepage = "https://github.com/abitdodgy/words_counted"
  spec.license = "MIT"
metadata
CHANGED
@@ -1,14 +1,14 @@
--- !ruby/object:Gem::Specification
name: words_counted
version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 1.0.0
platform: ruby
authors:
- Mohamad El-Husseini
autorequire:
bindir: bin
cert_chain: []
-date: …
+date: 2015-10-24 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
@@ -66,7 +66,8 @@ dependencies:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
-description: A Ruby …
+description: A Ruby natural language processor to extract stats from text, such as
+  word count and more.
email:
- husseini.mel@gmail.com
executables: []
@@ -74,19 +75,28 @@ extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
+- ".hound.yml"
- ".rspec"
+- ".ruby-style.yml"
+- ".travis.yml"
- ".yardopts"
- CHANGELOG.md
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
+- lib/refinements/hash_refinements.rb
- lib/words_counted.rb
- lib/words_counted/counter.rb
+- lib/words_counted/deprecated.rb
+- lib/words_counted/tokeniser.rb
- lib/words_counted/version.rb
- spec/spec_helper.rb
- spec/support/the_hart_and_the_hunter.txt
- spec/words_counted/counter_spec.rb
+- spec/words_counted/deprecated_spec.rb
+- spec/words_counted/tokeniser_spec.rb
+- spec/words_counted_spec.rb
- words_counted.gemspec
homepage: https://github.com/abitdodgy/words_counted
licenses:
@@ -108,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
    version: '0'
requirements: []
rubyforge_project:
-rubygems_version: 2.…
+rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: See README.
@@ -116,3 +126,6 @@ test_files:
- spec/spec_helper.rb
- spec/support/the_hart_and_the_hunter.txt
- spec/words_counted/counter_spec.rb
+- spec/words_counted/deprecated_spec.rb
+- spec/words_counted/tokeniser_spec.rb
+- spec/words_counted_spec.rb