words_counted 0.1.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: cba04e2004b13b0ee7b99e46cdf6549f6aebe2f6
- data.tar.gz: 885d494f7f2b2af40f59ed08aaca1db7ec89a54b
+ metadata.gz: d6302c1802d7da076d1ddafdcbe70e46a89c8f33
+ data.tar.gz: 873efaa5e58f883e0dde99094ca53952d46217c7
  SHA512:
- metadata.gz: e2009cd4b401da2b43047699a073a3f541654384d831d73c0d436016eb88325e29c179a59961c6d1d8d48a865f34a2da78e014a28a5e0cf4ccf714cafa7a6bb5
- data.tar.gz: f46e0031db714c0985ef4b2dee5d1f294c9ab0bdb629157110af0b26b76280bfe440207b4f6920156681cc91ded0246e3e66b6dcf26717208cc73ebbe4e86821
+ metadata.gz: 0e6ddb8db9c060432066d86aed2efe20aa95dee2019d54c950007170c0ffbbcff16fa27a0377419b0d1b718be1625a4376ee9c687a4ae67073aaffe9ef363157
+ data.tar.gz: 9df2a0cefe14b9ac77d1741f8980d1b1fb4d8b770738fbd69c8870f73da4b653a1d9462ac8813f88dc48af36e03718773523985f5be0f4999177a6b0a2a89662
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
  test/tmp
  test/version_tmp
  tmp
+ .idea/
data/.hound.yml ADDED
@@ -0,0 +1,2 @@
+ ruby:
+   config_file: .ruby-style.yml
data/.ruby-style.yml ADDED
@@ -0,0 +1,2 @@
+ Style/IfUnlessModifier:
+   MaxLineLength: 120
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
+ language: ruby
+
+ rvm:
+ - 2.1
+ - 2.2
+ - ruby-head
+
+ gemfile:
+ - Gemfile
data/.yardopts CHANGED
@@ -1,3 +1,4 @@
- --title 'Word Counter for Ruby'
+ --title 'Ruby natural language processor'
  --private
- --markup markdown
+ --markup markdown
+ --hide-api private
data/CHANGELOG.md CHANGED
@@ -1,3 +1,27 @@
+ ## Version 1.0
+
+ This version brings many improvements to code organisation. The tokeniser has been extracted into its own class. All methods in `Counter` have either been renamed or deprecated. Deprecated methods and their tests have moved into their own modules, and using them will trigger warnings with the upgrade instructions outlined below.
+
+ 1. Extracted tokenisation behaviour from `Counter` into a `Tokeniser` class.
+ 2. Deprecated all methods that have `word` in their name. Most are renamed such that `word` became `token`. They will be removed in version 1.1.
+     - Deprecated `word_count` in favor of `token_count`
+     - Deprecated `unique_word_count` in favor of `uniq_token_count`
+     - Deprecated `word_occurrences` and `sorted_word_occurrences` in favor of `token_frequency`
+     - Deprecated `word_lengths` and `sorted_word_lengths` in favor of `token_lengths`
+     - Deprecated `word_density` in favor of `token_density`
+     - Deprecated `most_occurring_words` in favor of `most_frequent_tokens`
+     - Deprecated `longest_words` in favor of `longest_tokens`
+     - Deprecated `average_chars_per_word` in favor of `average_chars_per_token`
+     - Deprecated `count`. Use `Array#count` instead.
+ 3. `token_lengths`, which replaces `word_lengths`, returns a sorted two-dimensional array instead of a hash. It behaves exactly like `sorted_word_lengths`, which has been deprecated. Use `token_lengths.to_h` for the old behaviour.
+ 4. `token_frequency`, which replaces `word_occurrences`, returns a sorted two-dimensional array instead of a hash. It behaves like `sorted_word_occurrences`, which has been deprecated. Use `token_frequency.to_h` for the old behaviour.
+ 5. `token_density`, which replaces `word_density`, returns a decimal with a precision of 2, not a percent. Use `token_density * 100` for the old behaviour.
+ 6. Added a refinement to Hash under `lib/refinements/hash_refinements.rb` to quickly sort by descending value.
+ 7. Extracted all deprecated methods to their own module, and their tests to their own spec file.
+ 8. Added a base `words_counted_spec.rb` and moved the `.from_file` test to the new file.
+ 9. Added Travis continuous integration.
+ 10. Added documentation to the code.
+
  ## Version 0.1.5
 
  1. Removed `to_f` from the dividend in `average_chars_per_word` and `word_densities`. The divisor is a float, and dividing by a float returns a float.
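
To make the renames concrete, here is a minimal migration sketch based on the changelog items above; the illustrative outputs assume the 1.0.0 return types described in items 3–5:

```ruby
require "words_counted"

counter = WordsCounted.count("Hello hello world")

# Deprecated 0.1.5 call           # 1.0.0 replacement
counter.word_count                # counter.token_count        => 3
counter.sorted_word_occurrences   # counter.token_frequency    => [["hello", 2], ["world", 1]]
counter.word_lengths              # counter.token_lengths.to_h => { "hello" => 5, "world" => 5 }
counter.word_density              # counter.token_density      => [["hello", 0.67], ["world", 0.33]]
```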
data/lib/refinements/hash_refinements.rb ADDED
@@ -0,0 +1,10 @@
+ # -*- encoding : utf-8 -*-
+ module Refinements
+   module HashRefinements
+     refine Hash do
+       def sort_by_value_desc
+         sort_by(&:last).reverse
+       end
+     end
+   end
+ end
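
The refinement only applies in scopes that opt in with `using`; a minimal usage sketch:

```ruby
require "refinements/hash_refinements"

using Refinements::HashRefinements

# Sorts pairs by value, highest first, returning an array of [key, value] pairs.
{ "one" => 1, "three" => 3, "two" => 2 }.sort_by_value_desc
# => [["three", 3], ["two", 2], ["one", 1]]
```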
data/lib/words_counted/counter.rb CHANGED
@@ -1,96 +1,128 @@
  # -*- encoding : utf-8 -*-
- module WordsCounted
-   class Counter
-     attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-     WORD_REGEXP = /[\p{Alpha}\-']+/
 
-     def self.from_file(path, options = {})
-       File.open(path) do |file|
-         new file.read, options
-       end
-     end
-
-     def initialize(string, options = {})
-       @options = options
-       exclude = filter_proc(options[:exclude])
-       @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-       @char_count = words.join.size
-       @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-       @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-     end
+ require "words_counted/deprecated"
 
-     def word_count
-       words.size
-     end
+ module WordsCounted
+   using Refinements::HashRefinements
 
-     def unique_word_count
-       words.uniq.size
-     end
+   class Counter
+     include Deprecated
 
-     def average_chars_per_word(precision = 2)
-       (char_count / word_count.to_f).round(precision)
-     end
+     attr_reader :tokens
 
-     def most_occurring_words
-       highest_ranking word_occurrences
+     def initialize(tokens)
+       @tokens = tokens
      end
 
-     def longest_words
-       highest_ranking word_lengths
+     # Returns the number of tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_count
+     #   # => 6
+     #
+     # @return [Integer] The number of tokens.
+     def token_count
+       tokens.size
      end
 
-     def word_density(precision = 2)
-       word_densities = word_occurrences.each_with_object({}) do |(word, occ), hash|
-         hash[word] = (occ / word_count.to_f * 100).round(precision)
-       end
-       sort_by_descending_value word_densities
+     # Returns the number of unique tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).uniq_token_count
+     #   # => 3
+     #
+     # @return [Integer] The number of unique tokens.
+     def uniq_token_count
+       tokens.uniq.size
      end
 
-     def sorted_word_occurrences
-       sort_by_descending_value word_occurrences
+     # Returns the character count of all tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two]).char_count
+     #   # => 6
+     #
+     # @return [Integer] The total char count of tokens.
+     def char_count
+       tokens.join.size
      end
 
-     def sorted_word_lengths
-       sort_by_descending_value word_lengths
+     # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+     # The array is sorted by frequency in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_frequency
+     #   # => [ ['three', 3], ['two', 2], ['one', 1] ]
+     #
+     # @return [Array<Array<String, Integer>>]
+     def token_frequency
+       tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
      end
 
-     def count(match)
-       words.select { |word| word == match.downcase }.size
+     # Returns a sorted two-dimensional array where each member array is a token and its length.
+     # The array is sorted by length in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two three four five]).token_lengths
+     #   # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+     #
+     # @return [Array<Array<String, Integer>>]
+     def token_lengths
+       tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
      end
 
-     private
-
-     def highest_ranking(entries)
-       entries.group_by { |_, value| value }.sort.last.last
+     # Returns a sorted two-dimensional array where each member array is a token and its density
+     # as a float, rounded to a precision of two decimal places. It accepts a `precision` argument
+     # which defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[Maj. Major Major Major]).token_density
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @example with `precision`
+     #   Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+     #   # => [ ['major', 0.7500], ['maj', 0.2500] ]
+     #
+     # @param [Integer] precision The number of decimal places to round density to.
+     # @return [Array<Array<String, Float>>]
+     def token_density(precision: 2)
+       token_frequency.each_with_object({}) { |(token, freq), hash|
+         hash[token] = (freq / token_count.to_f).round(precision)
+       }.sort_by_value_desc
      end
 
-     def sort_by_descending_value(entries)
-       entries.sort_by { |_, value| value }.reverse
+     # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+     #
+     # @example
+     #   Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+     #   # => { 'two' => 2, 'twice' => 2 }
+     #
+     # @return [Hash<String, Integer>]
+     def most_frequent_tokens
+       token_frequency.group_by(&:last).max.last.to_h
      end
 
-     def regexp
-       @options[:regexp] || WORD_REGEXP
+     # Returns a hash of tokens and their lengths for tokens with the highest length.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).longest_tokens
+     #   # => { 'three' => 5, 'seven' => 5 }
+     #
+     # @return [Hash<String, Integer>]
+     def longest_tokens
+       token_lengths.group_by(&:last).max.last.to_h
      end
 
-     def filter_proc(filter)
-       if filter.respond_to?(:to_a)
-         filter_procs = Array(filter).map(&method(:filter_proc))
-         ->(word) {
-           filter_procs.any? { |p| p.call(word) }
-         }
-       elsif filter.respond_to?(:to_str)
-         exclusion_list = filter.split.collect(&:downcase)
-         ->(word) {
-           exclusion_list.include?(word)
-         }
-       elsif regexp_filter = Regexp.try_convert(filter)
-         Proc.new { |word| word =~ regexp_filter }
-       elsif filter.respond_to?(:to_proc)
-         filter.to_proc
-       else
-         raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-       end
+     # Returns the average char count per token rounded to a precision of two decimal places.
+     # Accepts a `precision` argument.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).average_chars_per_token
+     #   # => 4.25
+     #
+     # @return [Float] The average char count per token.
+     def average_chars_per_token(precision: 2)
+       (char_count / token_count.to_f).round(precision)
      end
    end
  end
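
Note the changed constructor: `Counter` no longer tokenises input itself; it now expects an array of tokens, typically produced by `Tokeniser`. A minimal sketch of the new flow:

```ruby
require "words_counted"

tokens  = WordsCounted::Tokeniser.new("We are all in the gutter").tokenise
counter = WordsCounted::Counter.new(tokens)

counter.token_count     # => 6
counter.char_count      # => 19
counter.longest_tokens  # => { "gutter" => 6 }
```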
data/lib/words_counted/deprecated.rb ADDED
@@ -0,0 +1,76 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   module Deprecated
+     # @deprecated use `Counter#token_count`
+     def word_count
+       warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+       token_count
+     end
+
+     # @deprecated use `Counter#uniq_token_count`
+     def unique_word_count
+       warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+       uniq_token_count
+     end
+
+     # @deprecated use `Counter#token_frequency`
+     def word_occurrences
+       warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for the old behaviour"
+       token_frequency.to_h
+     end
+
+     # @deprecated use `Counter#token_lengths`
+     def word_lengths
+       warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+       warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for the old behaviour"
+       token_lengths.to_h
+     end
+
+     # @deprecated use `Counter#token_density`
+     def word_density(precision = 2)
+       warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+       warn "`Counter#token_density` returns density as a decimal, not a percent"
+
+       token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+     end
+
+     # @deprecated use `Counter#token_frequency`
+     def sorted_word_occurrences
+       warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       token_frequency
+     end
+
+     # @deprecated use `Counter#token_lengths`
+     def sorted_word_lengths
+       warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+       token_lengths
+     end
+
+     # @deprecated use `Counter#most_frequent_tokens`
+     def most_occurring_words
+       warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+       warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for the old behaviour."
+       most_frequent_tokens.to_a
+     end
+
+     # @deprecated use `Counter#longest_tokens`
+     def longest_words
+       warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+       warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for the old behaviour."
+       longest_tokens.to_a
+     end
+
+     # @deprecated use `Counter#average_chars_per_token`
+     def average_chars_per_word(precision = 2)
+       warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+       average_chars_per_token(precision: precision)
+     end
+
+     # @deprecated use `Array#count`
+     def count(token)
+       warn "`Counter#count` is deprecated, please use `Array#count`"
+       tokens.count(token.downcase)
+     end
+   end
+ end
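
Each shim forwards to its replacement after emitting a warning on standard error, so 0.1.5 call sites keep working during the transition. For example:

```ruby
counter = WordsCounted.count("one two two")

counter.word_count
# warns: `Counter#word_count` is deprecated, please use `Counter#token_count`
# => 3
```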
data/lib/words_counted/tokeniser.rb ADDED
@@ -0,0 +1,139 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   class Tokeniser
+     # Takes a string and breaks it into an array of tokens.
+     # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+     #
+     # @example
+     #   tokeniser = WordsCounted::Tokeniser.new("We are all in the gutter, but some of us are looking at the stars.")
+     #   tokeniser.tokenise(exclude: "We are all in the gutter")
+     #   # => ['but', 'some', 'of', 'us', 'looking', 'at', 'stars']
+
+     # Default tokenisation strategy
+     TOKEN_REGEXP = /[\p{Alpha}\-']+/
+
+     # Initialises state with a string that will be tokenised.
+     #
+     # @param [String] input The string to tokenise.
+     # @return [Tokeniser]
+     def initialize(input)
+       @input = input
+     end
+
+     # Converts a string into an array of tokens using a regular expression.
+     # If a regexp is not provided, a default one is used. See {Tokeniser::TOKEN_REGEXP}.
+     #
+     # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+     # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+     # This allows for powerful and flexible tokenisation strategies.
+     #
+     # @example
+     #   WordsCounted::Tokeniser.new("Hello World").tokenise
+     #   # => ['hello', 'world']
+     #
+     # @example With `pattern`
+     #   WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+     #   # => ['hello', 'mohamad']
+     #
+     # @example With `exclude` as a string
+     #   WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a regexp
+     #   WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+     #   # => ['dani']
+     #
+     # @example With `exclude` as a lambda
+     #   WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(exclude: ->(token) { token.length > 6 })
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a symbol
+     #   WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+     #   # => ['محمد']
+     #
+     # @example With `exclude` as an array of strings
+     #   WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(exclude: ["goodbye hello"])
+     #   # => ['sami', 'and', 'dani']
+     #
+     # @example With `exclude` as an array of regular expressions
+     #   WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(exclude: [/goodbye/i, /and/i])
+     #   # => ['hello', 'dani']
+     #
+     # @example With `exclude` as an array of lambdas
+     #   t = WordsCounted::Tokeniser.new("Special Agent 007")
+     #   t.tokenise(exclude: [->(t) { t.to_i.odd? }, ->(t) { t.length > 5 }])
+     #   # => ['agent']
+     #
+     # @example With `exclude` as a mixed array
+     #   t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+     #   t.tokenise(exclude: [:ascii_only?, /محمد/, ->(t) { t.length > 6 }, "و"])
+     #   # => ["هي", "سامي", "وداني"]
+     #
+     # @param [Regexp] pattern The regular expression used to split the input into tokens.
+     # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply.
+     # @return [Array] The array of filtered tokens.
+     def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+       filter_proc = filter_to_proc(exclude)
+       @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+     end
+
+     private
+
+     # This method converts any argument into a callable object. The return value of that
+     # callable is then used to determine whether a token should be excluded from the final list.
+     #
+     # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+     # of any combination of those types.
+     #
+     # If `filter` is a string, see {Tokeniser#filter_proc_from_string}.
+     # If `filter` is an array, see {Tokeniser#filter_procs_from_array}.
+     #
+     # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a lambda
+     # is returned that checks the token for a match. If a symbol is passed, it is converted to
+     # a proc.
+     #
+     # This method depends on `nil` responding to `to_a` with an empty array, which
+     # avoids having to check whether `exclude` was passed at all.
+     #
+     # @api private
+     def filter_to_proc(filter)
+       if filter.respond_to?(:to_a)
+         filter_procs_from_array(filter)
+       elsif filter.respond_to?(:to_str)
+         filter_proc_from_string(filter)
+       elsif regexp_filter = Regexp.try_convert(filter)
+         ->(token) {
+           token =~ regexp_filter
+         }
+       elsif filter.respond_to?(:to_proc)
+         filter.to_proc
+       else
+         raise ArgumentError,
+           "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+       end
+     end
+
+     # Converts an array of filters to an array of lambdas, and returns a lambda that calls
+     # each lambda in the resulting array. If any lambda returns true, the token is excluded
+     # from the final list.
+     #
+     # @api private
+     def filter_procs_from_array(filter)
+       filter_procs = Array(filter).map(&method(:filter_to_proc))
+       ->(token) {
+         filter_procs.any? { |pro| pro.call(token) }
+       }
+     end
+
+     # Converts a string filter to an array, and returns a lambda
+     # that returns true if the token is included in the array.
+     #
+     # @api private
+     def filter_proc_from_string(filter)
+       normalized_exclusion_list = filter.split.map(&:downcase)
+       ->(token) {
+         normalized_exclusion_list.include?(token)
+       }
+     end
+   end
+ end
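
Because `filter_to_proc` reduces every filter type to a lambda, the different `exclude` forms compose freely: a token is dropped as soon as any filter matches it. A quick sketch:

```ruby
tokeniser = WordsCounted::Tokeniser.new("Mary had a little lamb")

# A string, a regexp, and a lambda combined in one exclusion array.
tokeniser.tokenise(exclude: ["mary", /lamb/, ->(t) { t.length < 4 }])
# => ["little"]
```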
data/lib/words_counted/version.rb CHANGED
@@ -1,4 +1,4 @@
  # -*- encoding : utf-8 -*-
  module WordsCounted
-   VERSION = "0.1.5"
+   VERSION = "1.0.0"
  end
data/lib/words_counted.rb CHANGED
@@ -1,6 +1,9 @@
  # -*- encoding : utf-8 -*-
- require "words_counted/version"
+ require "refinements/hash_refinements"
+
+ require "words_counted/tokeniser"
  require "words_counted/counter"
+ require "words_counted/version"
 
  begin
    require "pry"
@@ -9,10 +12,14 @@ end
 
  module WordsCounted
    def self.count(string, options = {})
-     Counter.new(string, options)
+     tokens = Tokeniser.new(string).tokenise(options)
+     Counter.new(tokens)
    end
 
    def self.from_file(path, options = {})
-     Counter.from_file(path, options)
+     tokens = File.open(path) do |file|
+       Tokeniser.new(file.read).tokenise(options)
+     end
+     Counter.new(tokens)
    end
  end
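
With this change both module-level helpers are thin wrappers: they tokenise first, then hand the tokens to `Counter`:

```ruby
require "words_counted"

# `.count` tokenises the string, then wraps the tokens in a Counter.
counter = WordsCounted.count("Hello world hello", exclude: "world")
counter.tokens  # => ["hello", "hello"]

# `.from_file` does the same with a file's contents; the path below is the
# fixture shipped in this gem's specs.
counter = WordsCounted.from_file("spec/support/the_hart_and_the_hunter.txt")
counter.token_count  # => 139
```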
data/spec/words_counted/counter_spec.rb CHANGED
@@ -3,240 +3,85 @@ require_relative "../spec_helper"
 
  module WordsCounted
    describe Counter do
-     let(:counter) { Counter.new("We are all in the gutter, but some of us are looking at the stars.") }
-
-     describe "initialize" do
-       it "sets @options" do
-         expect(counter.instance_variables).to include(:@options)
-       end
-
-       it "sets @char_count" do
-         expect(counter.instance_variables).to include(:@char_count)
-       end
-
-       it "sets @words" do
-         expect(counter.instance_variables).to include(:@words)
-       end
-
-       it "sets @word_occurrences" do
-         expect(counter.instance_variables).to include(:@word_occurrences)
-       end
-
-       it "sets @word_lengths" do
-         expect(counter.instance_variables).to include(:@word_lengths)
-       end
+     let(:counter) do
+       tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+       Counter.new(tokens)
      end
 
-     describe "words" do
-       it "returns an array" do
-         expect(counter.words).to be_a(Array)
-       end
-
-       it "splits words" do
-         expect(counter.words).to eq(%w[we are all in the gutter but some of us are looking at the stars])
-       end
-
-       it "removes special characters" do
-         counter = Counter.new("Hello! # $ % 12345 * & % How do you do?")
-         expect(counter.words).to eq(%w[hello how do you do])
-       end
-
-       it "counts hyphenated words as one" do
-         counter = Counter.new("I am twenty-two.")
-         expect(counter.words).to eq(%w[i am twenty-two])
-       end
-
-       it "does not split words on apostrophe" do
-         counter = Counter.new("Bust 'em! Them be Jim's bastards'.")
-         expect(counter.words).to eq(%w[bust 'em them be jim's bastards'])
-       end
-
-       it "does not split on unicode chars" do
-         counter = Counter.new("São Paulo")
-         expect(counter.words).to eq(%w[são paulo])
-       end
-
-       it "it accepts a string filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
-         expect(counter.words).to eq(%w[that was trevor])
-       end
-
-       it "it accepts a string filter with multiple words" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
-         expect(counter.words).to eq(%w[that trevor])
-       end
-
-       it "filters words in uppercase when using a string filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
-         expect(counter.words).to eq(%w[that was trevor])
-       end
-
-       it "accepts a regexp filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
-         expect(counter.words).to eq(%w[that was trevor])
-       end
-
-       it "accepts an array filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
-         expect(counter.words).to eq(%w[magnificent trevor])
-       end
-
-       it "accepts a lambda filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) { w == 'that' })
-         expect(counter.words).to eq(%w[was magnificent trevor])
-       end
-
-       it "accepts a custom regexp" do
-         counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
-         expect(counter.words).to eq(["i", "am", "007"])
-       end
-
-       it "char_count should be calculated after the filter is applied" do
-         counter = Counter.new("I am Legend.", exclude: "I am")
-         expect(counter.char_count).to eq(6)
-       end
-     end
-
-     describe "word_count" do
-       it "returns the correct word count" do
-         expect(counter.word_count).to eq(15)
+     describe "initialize" do
+       it "sets @tokens" do
+         expect(counter.instance_variables).to include(:@tokens)
        end
      end
 
-     describe "word_occurrences" do
-       it "returns a hash" do
-         expect(counter.word_occurrences).to be_a(Hash)
-       end
-
-       it "treats capitalized words as the same word" do
-         counter = Counter.new("Bad, bad, piggy!")
-         expect(counter.word_occurrences).to eq({ "bad" => 2, "piggy" => 1 })
+     describe "#token_count" do
+       it "returns the correct number of tokens" do
+         expect(counter.token_count).to eq(6)
        end
      end
 
-     describe "sorted_word_occurrences" do
-       it "returns an array" do
-         expect(counter.sorted_word_occurrences).to be_a(Array)
-       end
-
-       it "returns a two dimensional array sorted by descending word occurrence" do
-         counter = Counter.new("Blue, green, green, green, orange, green, orange, red, orange, red")
-         expect(counter.sorted_word_occurrences).to eq([ ["green", 4], ["orange", 3], ["red", 2], ["blue", 1] ])
+     describe "#uniq_token_count" do
+       it "returns the number of unique tokens" do
+         expect(counter.uniq_token_count).to eq(3)
        end
      end
 
-     describe "most_occurring_words" do
-       it "returns an array" do
-         expect(counter.most_occurring_words).to be_a(Array)
-       end
-
-       it "returns highest occuring words" do
-         counter = Counter.new("Orange orange Apple apple banana")
-         expect(counter.most_occurring_words).to eq([["orange", 2],["apple", 2]])
+     describe "#char_count" do
+       it "returns the correct number of chars" do
+         expect(counter.char_count).to eq(26)
        end
      end
 
-     describe 'word_lengths' do
-       it "returns a hash" do
-         expect(counter.word_lengths).to be_a(Hash)
-       end
-
-       it "returns a hash of word lengths" do
-         counter = Counter.new("One two three.")
-         expect(counter.word_lengths).to eq({ "one" => 3, "two" => 3, "three" => 5 })
+     describe "#token_frequency" do
+       it "returns a two-dimensional array where each member array is a token and its frequency in descending order" do
+         expected = [
+           ['three', 3], ['woot', 2], ['one', 1]
+         ]
+         expect(counter.token_frequency).to eq(expected)
        end
      end
 
-     describe "sorted_word_lengths" do
-       it "returns an array" do
-         expect(counter.sorted_word_lengths).to be_a(Array)
-       end
-
-       it "returns a two dimensional array sorted by descending word length" do
-         counter = Counter.new("I am not certain of that")
-         expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["i", 1] ])
+     describe "#token_lengths" do
+       it "returns a two-dimensional array where each member array is a token and its length in descending order" do
+         expected = [
+           ['three', 5], ['woot', 4], ['one', 3]
+         ]
+         expect(counter.token_lengths).to eq(expected)
        end
      end
 
-     describe "longest_words" do
-       it "returns an array" do
-         expect(counter.longest_words).to be_a(Array)
-       end
-
-       it "returns the longest words" do
-         counter = Counter.new("Those whom the gods love grow young.")
-         expect(counter.longest_words).to eq([["those", 5],["young", 5]])
-       end
-     end
-
-     describe "word_density" do
-       it "returns an array" do
-         expect(counter.word_density).to be_a(Array)
-       end
-
-       it "returns words and their density in percent" do
-         counter = Counter.new("His name was Major, major Major Major.")
-         expect(counter.word_density).to eq([["major", 57.14], ["was", 14.29], ["name", 14.29], ["his", 14.29]])
+     describe "#token_density" do
+       it "returns a two-dimensional array where each member array is a token and its density in descending order" do
+         expected = [
+           ['three', 0.5], ['woot', 0.33], ['one', 0.17]
+         ]
+         expect(counter.token_density).to eq(expected)
        end
 
        it "accepts a precision" do
-         counter = Counter.new("His name was Major, major Major Major.")
-         expect(counter.word_density(4)).to eq([["major", 57.1429], ["was", 14.2857], ["name", 14.2857], ["his", 14.2857]])
+         expected = [
+           ['three', 0.5], ['woot', 0.3333], ['one', 0.1667]
+         ]
+         expect(counter.token_density(precision: 4)).to eq(expected)
        end
      end
 
-     describe "char_count" do
-       it "returns the number of chars in the passed in string" do
-         counter = Counter.new("His name was Major, major Major Major.")
-         expect(counter.char_count).to eq(30)
-       end
-
-       it "returns the number of chars in the passed in string after the filter is applied" do
-         counter = Counter.new("His name was Major, major Major Major.", exclude: "Major")
-         expect(counter.char_count).to eq(10)
-       end
-     end
-
-     describe "average_chars_per_word" do
-       it "returns the average number of chars per word" do
-         counter = Counter.new("His name was major, Major Major Major.")
-         expect(counter.average_chars_per_word).to eq(4.29)
-       end
-
-       it "returns the average number of chars per word after the filter is applied" do
-         counter = Counter.new("His name was Major, Major Major Major.", exclude: "Major")
-         expect(counter.average_chars_per_word).to eq(3.33)
-       end
-
-       it "accepts precision" do
-         counter = Counter.new("This line should have 39 characters minus spaces.")
-         expect(counter.average_chars_per_word(4)).to eq(5.5714)
+     describe "#most_frequent_tokens" do
+       it "returns a hash of the tokens with the highest frequency, where each key is a token and each value is its frequency" do
+         expected = {
+           'three' => 3
+         }
+         expect(counter.most_frequent_tokens).to eq(expected)
        end
      end
 
-     describe "unique_word_count" do
-       it "returns the number of unique words" do
-         expect(counter.unique_word_count).to eq(13)
-       end
-
-       it "is case insensitive" do
-         counter = Counter.new("Up down. Down up.")
-         expect(counter.unique_word_count).to eq(2)
+     describe "#longest_tokens" do
+       it "returns a hash of the tokens with the highest length, where each key is a token and each value is its length" do
+         expected = {
+           'three' => 5
+         }
+         expect(counter.longest_tokens).to eq(expected)
        end
      end
    end
-
-   describe "count" do
-     it "returns count for a single word" do
-       counter = Counter.new("I am so clever that sometimes I don't understand a single word of what I am saying.")
-       expect(counter.count("i")).to eq(3)
-     end
-   end
-
-   describe "from_file" do
-     it "opens and reads a text file" do
-       counter = WordsCounted.from_file('spec/support/the_hart_and_the_hunter.txt')
-       expect(counter.word_count).to eq(139)
-     end
-   end
  end
data/spec/words_counted/deprecated_spec.rb ADDED
@@ -0,0 +1,99 @@
+ # -*- coding: utf-8 -*-
+ require_relative "../spec_helper"
+
+ module WordsCounted
+   warn "Methods being tested are deprecated"
+
+   describe Counter do
+     let(:counter) do
+       tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+       Counter.new(tokens)
+     end
+
+     describe "#word_density" do
+       it "returns words and their density in percent" do
+         expected = [
+           ['three', 50.0], ['woot', 33.33], ['one', 16.67]
+         ]
+         expect(counter.word_density).to eq(expected)
+       end
+
+       it "accepts a precision" do
+         expected = [
+           ['three', 50.0], ['woot', 33.3333], ['one', 16.6667]
+         ]
+         expect(counter.word_density(4)).to eq(expected)
+       end
+     end
+
+     describe "#word_occurrences" do
+       it "returns a hash of words and their frequencies" do
+         expected = {
+           'three' => 3, 'woot' => 2, 'one' => 1
+         }
+         expect(counter.word_occurrences).to eq(expected)
+       end
+     end
+
+     describe "#sorted_word_occurrences" do
+       it "returns a two dimensional array sorted by descending word occurrence" do
+         expected = [
+           ['three', 3], ['woot', 2], ['one', 1]
+         ]
+         expect(counter.sorted_word_occurrences).to eq(expected)
+       end
+     end
+
+     describe "#word_lengths" do
+       it "returns a hash of words and their lengths" do
+         expected = {
+           'three' => 5, 'woot' => 4, 'one' => 3
+         }
+         expect(counter.word_lengths).to eq(expected)
+       end
+     end
+
+     describe "#sorted_word_lengths" do
+       it "returns a two dimensional array sorted by descending word length" do
+         expected = [
+           ['three', 5], ['woot', 4], ['one', 3]
+         ]
+         expect(counter.sorted_word_lengths).to eq(expected)
+       end
+     end
+
+     describe "#longest_words" do
+       it "returns a two-dimensional array of the longest words and their lengths" do
+         expected = [
+           ['three', 5]
+         ]
+         expect(counter.longest_words).to eq(expected)
+       end
+     end
+
+     describe "#most_occurring_words" do
+       it "returns a two-dimensional array of words with the highest frequency and their frequencies" do
+         expected = [
+           ['three', 3]
+         ]
+         expect(counter.most_occurring_words).to eq(expected)
+       end
+     end
+
+     describe "#average_chars_per_word" do
+       it "returns the average number of chars per word" do
+         expect(counter.average_chars_per_word).to eq(4.33)
+       end
+
+       it "accepts precision" do
+         expect(counter.average_chars_per_word(4)).to eq(4.3333)
+       end
+     end
+
+     describe "#count" do
+       it "returns count for a single word" do
+         expect(counter.count('one')).to eq(1)
+       end
+     end
+   end
+ end
data/spec/words_counted/tokeniser_spec.rb ADDED
@@ -0,0 +1,133 @@
+ # -*- coding: utf-8 -*-
+ require_relative "../spec_helper"
+
+ module WordsCounted
+   describe Tokeniser do
+     describe "initialize" do
+       it "sets @input" do
+         tokeniser = Tokeniser.new("Hello World!")
+         expect(tokeniser.instance_variables).to include(:@input)
+       end
+     end
+
+     describe "#tokenise" do
+       it "normalises tokens and returns an array" do
+         tokens = Tokeniser.new("Hello HELLO").tokenise
+         expect(tokens).to eq(%w[hello hello])
+       end
+
+       context "without arguments" do
+         it "removes non-alphanumeric chars" do
+           tokens = Tokeniser.new("Hello world! # $ % 12345 * & % ?").tokenise
+           expect(tokens).to eq(%w[hello world])
+         end
+
+         it "does not split on hyphens" do
+           tokens = Tokeniser.new("I am twenty-two.").tokenise
+           expect(tokens).to eq(%w[i am twenty-two])
+         end
+
+         it "does not split on apostrophe" do
+           tokens = Tokeniser.new("Bust 'em! It's Jim's gang.").tokenise
+           expect(tokens).to eq(%w[bust 'em it's jim's gang])
+         end
+
+         it "does not split on unicode chars" do
+           tokens = Tokeniser.new("Bayrūt").tokenise
+           expect(tokens).to eq(%w[bayrūt])
+         end
+       end
+
+       context "with `pattern` option" do
+         it "accepts a custom pattern" do
+           tokens = Tokeniser.new("We-Are-ALL").tokenise(pattern: /[^-]+/)
+           expect(tokens).to eq(%w[we are all])
+         end
+       end
+
+       context "with `exclude` option" do
+         context "as a string" do
+           let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+
+           it "accepts a string filter" do
+             tokens = tokeniser.tokenise(exclude: "magnificent")
+             expect(tokens).to eq(%w[that was trevor])
+           end
+
+           it "accepts a string filter with multiple space-delimited tokens" do
+             tokens = tokeniser.tokenise(exclude: "was magnificent")
+             expect(tokens).to eq(%w[that trevor])
+           end
+
+           it "normalises the string filter" do
+             tokens = tokeniser.tokenise(exclude: "MAGNIFICENT")
+             expect(tokens).to eq(%w[that was trevor])
+           end
+         end
+
+         context "as a regular expression" do
+           it "filters on match" do
+             tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+             tokens = tokeniser.tokenise(exclude: /magnificent/i)
+             expect(tokens).to eq(%w[that was trevor])
+           end
+         end
+
+         context "as a lambda" do
+           it "calls the lambda" do
+             tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+             tokens = tokeniser.tokenise(exclude: ->(token) { token.length < 5 })
+             expect(tokens).to eq(%w[magnificent trevor])
+           end
+
+           it "accepts a symbol for shorthand notation" do
+             tokeniser = Tokeniser.new("That was magnificent, محمد.")
+             tokens = tokeniser.tokenise(exclude: :ascii_only?)
+             expect(tokens).to eq(%w[محمد])
+           end
+         end
+
+         context "as an array" do
+           let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+
+           it "accepts an array of strings" do
+             tokens = tokeniser.tokenise(exclude: ["magnificent"])
+             expect(tokens).to eq(%w[that was trevor])
+           end
+
+           it "accepts an array of regular expressions" do
+             tokens = tokeniser.tokenise(exclude: [/that/, /was/])
+             expect(tokens).to eq(%w[magnificent trevor])
+           end
+
+           it "accepts an array of lambdas" do
+             filters = [
+               ->(token) { token.length < 4 },
+               ->(token) { token.length > 6 }
+             ]
+             tokens = tokeniser.tokenise(exclude: filters)
+             expect(tokens).to eq(%w[that trevor])
+           end
+
+           it "accepts a mixed array" do
+             filters = [
+               "that",
+               ->(token) { token.length < 4 },
+               /magnificent/
+             ]
+             tokens = tokeniser.tokenise(exclude: filters)
+             expect(tokens).to eq(["trevor"])
+           end
+         end
+
+         context "with an invalid filter" do
+           it "raises an `ArgumentError`" do
+             expect {
+               Tokeniser.new("Hello world!").tokenise(exclude: 1)
+             }.to raise_error(ArgumentError)
+           end
+         end
+       end
+     end
+   end
+ end
data/spec/words_counted_spec.rb ADDED
@@ -0,0 +1,34 @@
+ # -*- coding: utf-8 -*-
+ require_relative "spec_helper"
+
+ describe WordsCounted do
+   describe ".from_file" do
+     let(:file_path) { "spec/support/the_hart_and_the_hunter.txt" }
+
+     it "opens and reads a text file" do
+       counter = WordsCounted.from_file(file_path)
+       expect(counter.token_count).to eq(139)
+     end
+
+     it "opens and reads a text file with options" do
+       counter = WordsCounted.from_file(file_path, exclude: "hunter")
+       expect(counter.token_count).to eq(135)
+     end
+   end
+
+   describe ".count" do
+     let(:string) do
+       "We are all in the gutter, but some of us are looking at the stars."
+     end
+
+     it "returns a counter instance with given input as tokens" do
+       counter = WordsCounted.count(string)
+       expect(counter.token_count).to eq(15)
+     end
+
+     it "returns a counter instance with given input and options" do
+       counter = WordsCounted.count(string, exclude: "the gutter")
+       expect(counter.token_count).to eq(12)
+     end
+   end
+ end
data/words_counted.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
    spec.version = WordsCounted::VERSION
    spec.authors = ["Mohamad El-Husseini"]
    spec.email = ["husseini.mel@gmail.com"]
-   spec.description = %q{A Ruby word counter and string analyser with helpful utility methods.}
+   spec.description = %q{A Ruby natural language processor to extract stats from text, such as word count and more.}
    spec.summary = %q{See README.}
    spec.homepage = "https://github.com/abitdodgy/words_counted"
    spec.license = "MIT"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: words_counted
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 1.0.0
  platform: ruby
  authors:
  - Mohamad El-Husseini
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-12-02 00:00:00.000000000 Z
+ date: 2015-10-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -66,7 +66,8 @@ dependencies:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- description: A Ruby word counter and string analyser with helpful utility methods.
+ description: A Ruby natural language processor to extract stats from text, such as
+   word count and more.
  email:
  - husseini.mel@gmail.com
  executables: []
@@ -74,19 +75,28 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".hound.yml"
  - ".rspec"
+ - ".ruby-style.yml"
+ - ".travis.yml"
  - ".yardopts"
  - CHANGELOG.md
  - Gemfile
  - LICENSE.txt
  - README.md
  - Rakefile
+ - lib/refinements/hash_refinements.rb
  - lib/words_counted.rb
  - lib/words_counted/counter.rb
+ - lib/words_counted/deprecated.rb
+ - lib/words_counted/tokeniser.rb
  - lib/words_counted/version.rb
  - spec/spec_helper.rb
  - spec/support/the_hart_and_the_hunter.txt
  - spec/words_counted/counter_spec.rb
+ - spec/words_counted/deprecated_spec.rb
+ - spec/words_counted/tokeniser_spec.rb
+ - spec/words_counted_spec.rb
  - words_counted.gemspec
  homepage: https://github.com/abitdodgy/words_counted
  licenses:
@@ -108,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.4.5
  signing_key:
  specification_version: 4
  summary: See README.
@@ -116,3 +126,6 @@ test_files:
  - spec/spec_helper.rb
  - spec/support/the_hart_and_the_hunter.txt
  - spec/words_counted/counter_spec.rb
+ - spec/words_counted/deprecated_spec.rb
+ - spec/words_counted/tokeniser_spec.rb
+ - spec/words_counted_spec.rb