words_counted 0.1.5 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: cba04e2004b13b0ee7b99e46cdf6549f6aebe2f6
- data.tar.gz: 885d494f7f2b2af40f59ed08aaca1db7ec89a54b
+ metadata.gz: d6302c1802d7da076d1ddafdcbe70e46a89c8f33
+ data.tar.gz: 873efaa5e58f883e0dde99094ca53952d46217c7
  SHA512:
- metadata.gz: e2009cd4b401da2b43047699a073a3f541654384d831d73c0d436016eb88325e29c179a59961c6d1d8d48a865f34a2da78e014a28a5e0cf4ccf714cafa7a6bb5
- data.tar.gz: f46e0031db714c0985ef4b2dee5d1f294c9ab0bdb629157110af0b26b76280bfe440207b4f6920156681cc91ded0246e3e66b6dcf26717208cc73ebbe4e86821
+ metadata.gz: 0e6ddb8db9c060432066d86aed2efe20aa95dee2019d54c950007170c0ffbbcff16fa27a0377419b0d1b718be1625a4376ee9c687a4ae67073aaffe9ef363157
+ data.tar.gz: 9df2a0cefe14b9ac77d1741f8980d1b1fb4d8b770738fbd69c8870f73da4b653a1d9462ac8813f88dc48af36e03718773523985f5be0f4999177a6b0a2a89662
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
  test/tmp
  test/version_tmp
  tmp
+ .idea/
data/.hound.yml ADDED
@@ -0,0 +1,2 @@
+ ruby:
+   config_file: .ruby-style.yml
data/.ruby-style.yml ADDED
@@ -0,0 +1,2 @@
+ Style/IfUnlessModifier:
+   MaxLineLength: 120
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
+ language: ruby
+
+ rvm:
+ - 2.1
+ - 2.2
+ - ruby-head
+
+ gemfile:
+ - Gemfile
data/.yardopts CHANGED
@@ -1,3 +1,4 @@
- --title 'Word Counter for Ruby'
+ --title 'Ruby natural language processor'
  --private
- --markup markdown
+ --markup markdown
+ --hide-api private
data/CHANGELOG.md CHANGED
@@ -1,3 +1,27 @@
+ ## Version 1.0
+
+ This version brings lots of improvements to code organisation. The tokeniser has been extracted into its own class. All methods in `Counter` have either been renamed or deprecated. Deprecated methods and their tests have been moved into their own modules. Using them will trigger warnings with the upgrade instructions outlined below.
+
+ 1. Extracted tokenisation behaviour from `Counter` into a `Tokeniser` class.
+ 2. Deprecated all methods that have `word` in their name. Most are renamed such that `word` became `token`. They will be removed in version 1.1.
+     - Deprecated `word_count` in favor of `token_count`
+     - Deprecated `unique_word_count` in favor of `uniq_token_count`
+     - Deprecated `word_occurrences` and `sorted_word_occurrences` in favor of `token_frequency`
+     - Deprecated `word_lengths` and `sorted_word_lengths` in favor of `token_lengths`
+     - Deprecated `word_density` in favor of `token_density`
+     - Deprecated `most_occurring_words` in favor of `most_frequent_tokens`
+     - Deprecated `longest_words` in favor of `longest_tokens`
+     - Deprecated `average_chars_per_word` in favor of `average_chars_per_token`
+     - Deprecated `count`. Use `Array#count` instead.
+ 3. `token_lengths`, which replaces `word_lengths`, returns a sorted two-dimensional array instead of a hash. It behaves exactly like `sorted_word_lengths`, which has been deprecated. Use `token_lengths.to_h` for the old behaviour.
+ 4. `token_frequency`, which replaces `word_occurrences`, returns a sorted two-dimensional array instead of a hash. It behaves like `sorted_word_occurrences`, which has been deprecated. Use `token_frequency.to_h` for the old behaviour.
+ 5. `token_density`, which replaces `word_density`, returns a decimal with a precision of 2, not a percent. Use `token_density * 100` for the old behaviour.
+ 6. Added a refinement to `Hash` under `lib/refinements/hash_refinements.rb` to quickly sort by descending value.
+ 7. Extracted all deprecated methods to their own module, and their tests to their own spec file.
+ 8. Added a base `words_counted_spec.rb` and moved the `.from_file` test to the new file.
+ 9. Added Travis continuous integration.
+ 10. Added documentation to the code.
+
  ## Version 0.1.5
 
  1. Removed `to_f` from the dividend in `average_chars_per_word` and `word_densities`. The divisor is a float, and dividing by a float returns a float.
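
For readers upgrading, here is a minimal before/after sketch of the renames described in the 1.0 changelog above, using the sample tokens from the new specs (output values are taken from the spec expectations):

```ruby
counter = WordsCounted.count("one three three three woot woot")

# 0.1.5 names (still work in 1.0, with deprecation warnings)
counter.word_count               # => 6
counter.sorted_word_occurrences  # => [["three", 3], ["woot", 2], ["one", 1]]

# 1.0 names
counter.token_count              # => 6
counter.token_frequency          # => [["three", 3], ["woot", 2], ["one", 1]]
counter.token_density            # => [["three", 0.5], ["woot", 0.33], ["one", 0.17]]
```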
data/lib/refinements/hash_refinements.rb ADDED
@@ -0,0 +1,10 @@
+ # -*- encoding : utf-8 -*-
+ module Refinements
+   module HashRefinements
+     refine Hash do
+       def sort_by_value_desc
+         sort_by(&:last).reverse
+       end
+     end
+   end
+ end
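
A quick sketch of how this refinement is used; `using` activates it only within the calling file or module, which is why the `Counter` diff below calls `using Refinements::HashRefinements` at the top of `WordsCounted`:

```ruby
require "refinements/hash_refinements"

using Refinements::HashRefinements

{ "one" => 1, "three" => 3, "woot" => 2 }.sort_by_value_desc
# => [["three", 3], ["woot", 2], ["one", 1]]
```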
data/lib/words_counted/counter.rb CHANGED
@@ -1,96 +1,128 @@
  # -*- encoding : utf-8 -*-
- module WordsCounted
-   class Counter
-     attr_reader :words, :word_occurrences, :word_lengths, :char_count
-
-     WORD_REGEXP = /[\p{Alpha}\-']+/
 
-     def self.from_file(path, options = {})
-       File.open(path) do |file|
-         new file.read, options
-       end
-     end
-
-     def initialize(string, options = {})
-       @options = options
-       exclude = filter_proc(options[:exclude])
-       @words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
-       @char_count = words.join.size
-       @word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
-       @word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
-     end
+ require "words_counted/deprecated"
 
-     def word_count
-       words.size
-     end
+ module WordsCounted
+   using Refinements::HashRefinements
 
-     def unique_word_count
-       words.uniq.size
-     end
+   class Counter
+     include Deprecated
 
-     def average_chars_per_word(precision = 2)
-       (char_count / word_count.to_f).round(precision)
-     end
+     attr_reader :tokens
 
-     def most_occurring_words
-       highest_ranking word_occurrences
+     def initialize(tokens)
+       @tokens = tokens
      end
 
-     def longest_words
-       highest_ranking word_lengths
+     # Returns the number of tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_count
+     #   # => 6
+     #
+     # @return [Integer] The number of tokens.
+     def token_count
+       tokens.size
      end
 
-     def word_density(precision = 2)
-       word_densities = word_occurrences.each_with_object({}) do |(word, occ), hash|
-         hash[word] = (occ / word_count.to_f * 100).round(precision)
-       end
-       sort_by_descending_value word_densities
+     # Returns the number of unique tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).uniq_token_count
+     #   # => 3
+     #
+     # @return [Integer] The number of unique tokens.
+     def uniq_token_count
+       tokens.uniq.size
      end
 
-     def sorted_word_occurrences
-       sort_by_descending_value word_occurrences
+     # Returns the character count of all tokens.
+     #
+     # @example
+     #   Counter.new(%w[one two]).char_count
+     #   # => 6
+     #
+     # @return [Integer] The total char count of tokens.
+     def char_count
+       tokens.join.size
      end
 
-     def sorted_word_lengths
-       sort_by_descending_value word_lengths
+     # Returns a sorted two-dimensional array where each member array is a token and its frequency.
+     # The array is sorted by frequency in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two two three three three]).token_frequency
+     #   # => [ ['three', 3], ['two', 2], ['one', 1] ]
+     #
+     # @return [Array<Array<String, Integer>>]
+     def token_frequency
+       tokens.each_with_object(Hash.new(0)) { |token, hash| hash[token] += 1 }.sort_by_value_desc
      end
 
-     def count(match)
-       words.select { |word| word == match.downcase }.size
+     # Returns a sorted two-dimensional array where each member array is a token and its length.
+     # The array is sorted by length in descending order.
+     #
+     # @example
+     #   Counter.new(%w[one two three four five]).token_lengths
+     #   # => [ ['three', 5], ['four', 4], ['five', 4], ['one', 3], ['two', 3] ]
+     #
+     # @return [Array<Array<String, Integer>>]
+     def token_lengths
+       tokens.uniq.each_with_object({}) { |token, hash| hash[token] = token.length }.sort_by_value_desc
      end
 
-     private
-
-     def highest_ranking(entries)
-       entries.group_by { |_, value| value }.sort.last.last
+     # Returns a sorted two-dimensional array where each member array is a token and its density
+     # as a float, rounded to `precision` decimal places. `precision` defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[Maj. Major Major Major]).token_density
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @example with `precision`
+     #   Counter.new(%w[Maj. Major Major Major]).token_density(precision: 4)
+     #   # => [ ['major', 0.75], ['maj', 0.25] ]
+     #
+     # @param [Integer] precision The number of decimal places to round density to.
+     # @return [Array<Array<String, Float>>]
+     def token_density(precision: 2)
+       token_frequency.each_with_object({}) { |(token, freq), hash|
+         hash[token] = (freq / token_count.to_f).round(precision)
+       }.sort_by_value_desc
      end
 
-     def sort_by_descending_value(entries)
-       entries.sort_by { |_, value| value }.reverse
+     # Returns a hash of tokens and their frequencies for tokens with the highest frequency.
+     #
+     # @example
+     #   Counter.new(%w[one once two two twice twice]).most_frequent_tokens
+     #   # => { 'two' => 2, 'twice' => 2 }
+     #
+     # @return [Hash<String, Integer>]
+     def most_frequent_tokens
+       token_frequency.group_by(&:last).max.last.to_h
      end
 
-     def regexp
-       @options[:regexp] || WORD_REGEXP
+     # Returns a hash of tokens and their lengths for tokens with the highest length.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).longest_tokens
+     #   # => { 'three' => 5, 'seven' => 5 }
+     #
+     # @return [Hash<String, Integer>]
+     def longest_tokens
+       token_lengths.group_by(&:last).max.last.to_h
      end
 
-     def filter_proc(filter)
-       if filter.respond_to?(:to_a)
-         filter_procs = Array(filter).map(&method(:filter_proc))
-         ->(word) {
-           filter_procs.any? { |p| p.call(word) }
-         }
-       elsif filter.respond_to?(:to_str)
-         exclusion_list = filter.split.collect(&:downcase)
-         ->(word) {
-           exclusion_list.include?(word)
-         }
-       elsif regexp_filter = Regexp.try_convert(filter)
-         Proc.new { |word| word =~ regexp_filter }
-       elsif filter.respond_to?(:to_proc)
-         filter.to_proc
-       else
-         raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
-       end
+     # Returns the average char count per token rounded to `precision` decimal places,
+     # which defaults to `2`.
+     #
+     # @example
+     #   Counter.new(%w[one three five seven]).average_chars_per_token
+     #   # => 4.25
+     #
+     # @return [Float] The average char count per token.
+     def average_chars_per_token(precision: 2)
+       (char_count / token_count.to_f).round(precision)
      end
    end
  end
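
`Counter` no longer owns tokenisation: it now takes an already-tokenised array. A minimal sketch of the resulting two-step pipeline (mirroring what `WordsCounted.count` wires together in `lib/words_counted.rb` below):

```ruby
tokens  = WordsCounted::Tokeniser.new("We are all in the gutter").tokenise
# => ["we", "are", "all", "in", "the", "gutter"]

counter = WordsCounted::Counter.new(tokens)
counter.token_count # => 6
counter.char_count  # => 19
```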
data/lib/words_counted/deprecated.rb ADDED
@@ -0,0 +1,76 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   module Deprecated
+     # @deprecated use `Counter#token_count`
+     def word_count
+       warn "`Counter#word_count` is deprecated, please use `Counter#token_count`"
+       token_count
+     end
+
+     # @deprecated use `Counter#uniq_token_count`
+     def unique_word_count
+       warn "`Counter#unique_word_count` is deprecated, please use `Counter#uniq_token_count`"
+       uniq_token_count
+     end
+
+     # @deprecated use `Counter#token_frequency`
+     def word_occurrences
+       warn "`Counter#word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       warn "`Counter#token_frequency` returns a sorted array of arrays, not a hash. Call `token_frequency.to_h` for old behaviour"
+       token_frequency.to_h
+     end
+
+     # @deprecated use `Counter#token_lengths`
+     def word_lengths
+       warn "`Counter#word_lengths` is deprecated, please use `Counter#token_lengths`"
+       warn "`Counter#token_lengths` returns a sorted array of arrays, not a hash. Call `token_lengths.to_h` for old behaviour"
+       token_lengths.to_h
+     end
+
+     # @deprecated use `Counter#token_density`
+     def word_density(precision = 2)
+       warn "`Counter#word_density` is deprecated, please use `Counter#token_density`"
+       warn "`Counter#token_density` returns density as a decimal, not a percent"
+
+       token_density(precision: precision * 2).map { |tuple| [tuple.first, (tuple.last * 100).round(precision)] }
+     end
+
+     # @deprecated use `Counter#token_frequency`
+     def sorted_word_occurrences
+       warn "`Counter#sorted_word_occurrences` is deprecated, please use `Counter#token_frequency`"
+       token_frequency
+     end
+
+     # @deprecated use `Counter#token_lengths`
+     def sorted_word_lengths
+       warn "`Counter#sorted_word_lengths` is deprecated, please use `Counter#token_lengths`"
+       token_lengths
+     end
+
+     # @deprecated use `Counter#most_frequent_tokens`
+     def most_occurring_words
+       warn "`Counter#most_occurring_words` is deprecated, please use `Counter#most_frequent_tokens`"
+       warn "`Counter#most_frequent_tokens` returns a hash, not an array. Call `most_frequent_tokens.to_a` for old behaviour."
+       most_frequent_tokens.to_a
+     end
+
+     # @deprecated use `Counter#longest_tokens`
+     def longest_words
+       warn "`Counter#longest_words` is deprecated, please use `Counter#longest_tokens`"
+       warn "`Counter#longest_tokens` returns a hash, not an array. Call `longest_tokens.to_a` for old behaviour."
+       longest_tokens.to_a
+     end
+
+     # @deprecated use `Counter#average_chars_per_token`
+     def average_chars_per_word(precision = 2)
+       warn "`Counter#average_chars_per_word` is deprecated, please use `Counter#average_chars_per_token`"
+       average_chars_per_token(precision: precision)
+     end
+
+     # @deprecated use `Array#count`
+     def count(token)
+       warn "`Counter#count` is deprecated, please use `Array#count`"
+       tokens.count(token.downcase)
+     end
+   end
+ end
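
One subtlety in the `word_density` shim above: it asks `token_density` for double the requested precision before scaling to a percentage, so rounding happens on the percent value rather than on an already-rounded decimal. A worked example for the token `one` out of six tokens:

```ruby
(1 / 6.0).round(2)                  # => 0.17, and 0.17 * 100 would round to 17.0
((1 / 6.0).round(4) * 100).round(2) # => 16.67, the value the old API reported
```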
data/lib/words_counted/tokeniser.rb ADDED
@@ -0,0 +1,139 @@
+ # -*- encoding : utf-8 -*-
+ module WordsCounted
+   class Tokeniser
+     # Takes a string and breaks it into an array of tokens.
+     # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
+     #
+     # @example
+     #   tokeniser = WordsCounted::Tokeniser.new("We are all in the gutter, but some of us are looking at the stars.")
+     #   tokeniser.tokenise(exclude: "We are all in the gutter")
+     #   # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']
+
+     # Default tokenisation strategy
+     TOKEN_REGEXP = /[\p{Alpha}\-']+/
+
+     # Initialises state with a string that will be tokenised.
+     #
+     # @param [String] input The string to tokenise.
+     # @return [Tokeniser]
+     def initialize(input)
+       @input = input
+     end
+
+     # Converts a string into an array of tokens using a regular expression.
+     # If a regexp is not provided, a default one is used. See {Tokeniser::TOKEN_REGEXP}.
+     #
+     # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
+     # a regular expression, a lambda, a symbol, or an array of one or more of those types.
+     # This allows for powerful and flexible tokenisation strategies.
+     #
+     # @example
+     #   WordsCounted::Tokeniser.new("Hello World").tokenise
+     #   # => ['hello', 'world']
+     #
+     # @example With `pattern`
+     #   WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
+     #   # => ['hello', 'mohamad']
+     #
+     # @example With `exclude` as a string
+     #   WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a regexp
+     #   WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
+     #   # => ['dani']
+     #
+     # @example With `exclude` as a lambda
+     #   WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(exclude: ->(token) { token.length > 6 })
+     #   # => ['sami']
+     #
+     # @example With `exclude` as a symbol
+     #   WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
+     #   # => ['محمد']
+     #
+     # @example With `exclude` as an array of strings
+     #   WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(exclude: ["goodbye hello"])
+     #   # => ['sami', 'and', 'dani']
+     #
+     # @example With `exclude` as an array of regular expressions
+     #   WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(exclude: [/goodbye/i, /and/i])
+     #   # => ['hello', 'dani']
+     #
+     # @example With `exclude` as an array of lambdas
+     #   t = WordsCounted::Tokeniser.new("Special Agent 007")
+     #   t.tokenise(exclude: [->(t) { t.to_i.odd? }, ->(t) { t.length > 5 }])
+     #   # => ['agent']
+     #
+     # @example With `exclude` as a mixed array
+     #   t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
+     #   t.tokenise(exclude: [:ascii_only?, /محمد/, ->(t) { t.length > 6 }, "و"])
+     #   # => ["هي", "سامي", "وداني"]
+     #
+     # @param [Regexp] pattern The regular expression used to extract tokens.
+     # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply.
+     # @return [Array] The array of filtered tokens.
+     def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
+       filter_proc = filter_to_proc(exclude)
+       @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
+     end
+
+     private
+
+     # This method converts any argument into a callable object. The returned proc
+     # is then used to determine whether a token should be excluded from the final list.
+     #
+     # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
+     # of any combination of those types.
+     #
+     # If `filter` is a string, see {Tokeniser#filter_proc_from_string}.
+     # If `filter` is an array, see {Tokeniser#filter_procs_from_array}.
+     #
+     # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a `lambda`
+     # is returned that checks the token for a match. If a symbol is passed, it is converted to
+     # a proc.
+     #
+     # This method depends on `nil` responding to `to_a` with an empty array, which
+     # avoids having to check if `exclude` was passed.
+     #
+     # @api private
+     def filter_to_proc(filter)
+       if filter.respond_to?(:to_a)
+         filter_procs_from_array(filter)
+       elsif filter.respond_to?(:to_str)
+         filter_proc_from_string(filter)
+       elsif regexp_filter = Regexp.try_convert(filter)
+         ->(token) {
+           token =~ regexp_filter
+         }
+       elsif filter.respond_to?(:to_proc)
+         filter.to_proc
+       else
+         raise ArgumentError,
+           "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
+       end
+     end
+
+     # Converts an array of `filters` to an array of lambdas, and returns a lambda that calls
+     # each lambda in the resulting array. If any lambda returns true the token is excluded
+     # from the final list.
+     #
+     # @api private
+     def filter_procs_from_array(filter)
+       filter_procs = Array(filter).map(&method(:filter_to_proc))
+       ->(token) {
+         filter_procs.any? { |pro| pro.call(token) }
+       }
+     end
+
+     # Converts a string `filter` to an array, and returns a lambda
+     # that returns true if the token is included in the array.
+     #
+     # @api private
+     def filter_proc_from_string(filter)
+       normalized_exclusion_list = filter.split.map(&:downcase)
+       ->(token) {
+         normalized_exclusion_list.include?(token)
+       }
+     end
+   end
+ end
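
Note that the default `TOKEN_REGEXP` matches alphabetic characters only, so digits are dropped. The pre-1.0 specs exercised this with a custom regexp, which maps directly onto the new `pattern` option:

```ruby
WordsCounted::Tokeniser.new("I am 007.").tokenise
# => ["i", "am"]

WordsCounted::Tokeniser.new("I am 007.").tokenise(pattern: /[\p{Alnum}\-']+/)
# => ["i", "am", "007"]
```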
data/lib/words_counted/version.rb CHANGED
@@ -1,4 +1,4 @@
  # -*- encoding : utf-8 -*-
  module WordsCounted
-   VERSION = "0.1.5"
+   VERSION = "1.0.0"
  end
data/lib/words_counted.rb CHANGED
@@ -1,6 +1,9 @@
  # -*- encoding : utf-8 -*-
- require "words_counted/version"
+ require "refinements/hash_refinements"
+
+ require "words_counted/tokeniser"
  require "words_counted/counter"
+ require "words_counted/version"
 
  begin
    require "pry"
@@ -9,10 +12,14 @@ end
 
  module WordsCounted
    def self.count(string, options = {})
-     Counter.new(string, options)
+     tokens = Tokeniser.new(string).tokenise(options)
+     Counter.new(tokens)
    end
 
    def self.from_file(path, options = {})
-     Counter.from_file(path, options)
+     tokens = File.open(path) do |file|
+       Tokeniser.new(file.read).tokenise(options)
+     end
+     Counter.new(tokens)
    end
  end
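
The module-level helpers keep the old one-call interface while delegating to the new classes; the `options` hash is passed straight through to `Tokeniser#tokenise`. A usage sketch based on the specs below:

```ruby
counter = WordsCounted.count(
  "We are all in the gutter, but some of us are looking at the stars.",
  exclude: "the gutter"
)
counter.token_count # => 12

counter = WordsCounted.from_file("spec/support/the_hart_and_the_hunter.txt")
counter.token_count # => 139
```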
data/spec/words_counted/counter_spec.rb CHANGED
@@ -3,240 +3,85 @@ require_relative "../spec_helper"
 
  module WordsCounted
    describe Counter do
-     let(:counter) { Counter.new("We are all in the gutter, but some of us are looking at the stars.") }
-
-     describe "initialize" do
-       it "sets @options" do
-         expect(counter.instance_variables).to include(:@options)
-       end
-
-       it "sets @char_count" do
-         expect(counter.instance_variables).to include(:@char_count)
-       end
-
-       it "sets @words" do
-         expect(counter.instance_variables).to include(:@words)
-       end
-
-       it "sets @word_occurrences" do
-         expect(counter.instance_variables).to include(:@word_occurrences)
-       end
-
-       it "sets @word_lengths" do
-         expect(counter.instance_variables).to include(:@word_lengths)
-       end
+     let(:counter) do
+       tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+       Counter.new(tokens)
      end
 
-     describe "words" do
-       it "returns an array" do
-         expect(counter.words).to be_a(Array)
-       end
-
-       it "splits words" do
-         expect(counter.words).to eq(%w[we are all in the gutter but some of us are looking at the stars])
-       end
-
-       it "removes special characters" do
-         counter = Counter.new("Hello! # $ % 12345 * & % How do you do?")
-         expect(counter.words).to eq(%w[hello how do you do])
-       end
-
-       it "counts hyphenated words as one" do
-         counter = Counter.new("I am twenty-two.")
-         expect(counter.words).to eq(%w[i am twenty-two])
-       end
-
-       it "does not split words on apostrophe" do
-         counter = Counter.new("Bust 'em! Them be Jim's bastards'.")
-         expect(counter.words).to eq(%w[bust 'em them be jim's bastards'])
-       end
-
-       it "does not split on unicode chars" do
-         counter = Counter.new("São Paulo")
-         expect(counter.words).to eq(%w[são paulo])
-       end
-
-       it "it accepts a string filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
-         expect(counter.words).to eq(%w[that was trevor])
-       end
-
-       it "it accepts a string filter with multiple words" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
-         expect(counter.words).to eq(%w[that trevor])
-       end
-
-       it "filters words in uppercase when using a string filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
-         expect(counter.words).to eq(%w[that was trevor])
-       end
-
-       it "accepts a regexp filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
-         expect(counter.words).to eq(%w[that was trevor])
-       end
-
-       it "accepts an array filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
-         expect(counter.words).to eq(%w[magnificent trevor])
-       end
-
-       it "accepts a lambda filter" do
-         counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) { w == 'that' })
-         expect(counter.words).to eq(%w[was magnificent trevor])
-       end
-
-       it "accepts a custom regexp" do
-         counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
-         expect(counter.words).to eq(["i", "am", "007"])
-       end
-
-       it "char_count should be calculated after the filter is applied" do
-         counter = Counter.new("I am Legend.", exclude: "I am")
-         expect(counter.char_count).to eq(6)
-       end
-     end
-
-     describe "word_count" do
-       it "returns the correct word count" do
-         expect(counter.word_count).to eq(15)
+     describe "initialize" do
+       it "sets @tokens" do
+         expect(counter.instance_variables).to include(:@tokens)
        end
      end
 
-     describe "word_occurrences" do
-       it "returns a hash" do
-         expect(counter.word_occurrences).to be_a(Hash)
-       end
-
-       it "treats capitalized words as the same word" do
-         counter = Counter.new("Bad, bad, piggy!")
-         expect(counter.word_occurrences).to eq({ "bad" => 2, "piggy" => 1 })
+     describe "#token_count" do
+       it "returns the correct number of tokens" do
+         expect(counter.token_count).to eq(6)
        end
      end
 
-     describe "sorted_word_occurrences" do
-       it "returns an array" do
-         expect(counter.sorted_word_occurrences).to be_a(Array)
-       end
-
-       it "returns a two dimensional array sorted by descending word occurrence" do
-         counter = Counter.new("Blue, green, green, green, orange, green, orange, red, orange, red")
-         expect(counter.sorted_word_occurrences).to eq([ ["green", 4], ["orange", 3], ["red", 2], ["blue", 1] ])
+     describe "#uniq_token_count" do
+       it "returns the number of unique tokens" do
+         expect(counter.uniq_token_count).to eq(3)
        end
      end
 
-     describe "most_occurring_words" do
-       it "returns an array" do
-         expect(counter.most_occurring_words).to be_a(Array)
-       end
-
-       it "returns highest occuring words" do
-         counter = Counter.new("Orange orange Apple apple banana")
-         expect(counter.most_occurring_words).to eq([["orange", 2],["apple", 2]])
+     describe "#char_count" do
+       it "returns the correct number of chars" do
+         expect(counter.char_count).to eq(26)
        end
      end
 
-     describe 'word_lengths' do
-       it "returns a hash" do
-         expect(counter.word_lengths).to be_a(Hash)
-       end
-
-       it "returns a hash of word lengths" do
-         counter = Counter.new("One two three.")
-         expect(counter.word_lengths).to eq({ "one" => 3, "two" => 3, "three" => 5 })
+     describe "#token_frequency" do
+       it "returns a two-dimensional array where each member array is a token and its frequency in descending order" do
+         expected = [
+           ['three', 3], ['woot', 2], ['one', 1]
+         ]
+         expect(counter.token_frequency).to eq(expected)
        end
      end
 
-     describe "sorted_word_lengths" do
-       it "returns an array" do
-         expect(counter.sorted_word_lengths).to be_a(Array)
-       end
-
-       it "returns a two dimensional array sorted by descending word length" do
-         counter = Counter.new("I am not certain of that")
-         expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["i", 1] ])
+     describe "#token_lengths" do
+       it "returns a two-dimensional array where each member array is a token and its length in descending order" do
+         expected = [
+           ['three', 5], ['woot', 4], ['one', 3]
+         ]
+         expect(counter.token_lengths).to eq(expected)
        end
      end
 
-     describe "longest_words" do
-       it "returns an array" do
-         expect(counter.longest_words).to be_a(Array)
-       end
-
-       it "returns the longest words" do
-         counter = Counter.new("Those whom the gods love grow young.")
-         expect(counter.longest_words).to eq([["those", 5],["young", 5]])
-       end
-     end
-
-     describe "word_density" do
-       it "returns an array" do
-         expect(counter.word_density).to be_a(Array)
-       end
-
-       it "returns words and their density in percent" do
-         counter = Counter.new("His name was Major, major Major Major.")
-         expect(counter.word_density).to eq([["major", 57.14], ["was", 14.29], ["name", 14.29], ["his", 14.29]])
+     describe "#token_density" do
+       it "returns a two-dimensional array where each member array is a token and its density in descending order" do
+         expected = [
+           ['three', 0.5], ['woot', 0.33], ['one', 0.17]
+         ]
+         expect(counter.token_density).to eq(expected)
        end
 
        it "accepts a precision" do
-         counter = Counter.new("His name was Major, major Major Major.")
-         expect(counter.word_density(4)).to eq([["major", 57.1429], ["was", 14.2857], ["name", 14.2857], ["his", 14.2857]])
+         expected = [
+           ['three', 0.5], ['woot', 0.3333], ['one', 0.1667]
+         ]
+         expect(counter.token_density(precision: 4)).to eq(expected)
        end
      end
 
-     describe "char_count" do
-       it "returns the number of chars in the passed in string" do
-         counter = Counter.new("His name was Major, major Major Major.")
-         expect(counter.char_count).to eq(30)
-       end
-
-       it "returns the number of chars in the passed in string after the filter is applied" do
-         counter = Counter.new("His name was Major, major Major Major.", exclude: "Major")
-         expect(counter.char_count).to eq(10)
-       end
-     end
-
-     describe "average_chars_per_word" do
-       it "returns the average number of chars per word" do
-         counter = Counter.new("His name was major, Major Major Major.")
-         expect(counter.average_chars_per_word).to eq(4.29)
-       end
-
-       it "returns the average number of chars per word after the filter is applied" do
-         counter = Counter.new("His name was Major, Major Major Major.", exclude: "Major")
-         expect(counter.average_chars_per_word).to eq(3.33)
-       end
-
-       it "accepts precision" do
-         counter = Counter.new("This line should have 39 characters minus spaces.")
-         expect(counter.average_chars_per_word(4)).to eq(5.5714)
+     describe "#most_frequent_tokens" do
+       it "returns a hash of the tokens with the highest frequency, where each key is a token, and each value is its frequency" do
+         expected = {
+           'three' => 3
+         }
+         expect(counter.most_frequent_tokens).to eq(expected)
        end
      end
 
-     describe "unique_word_count" do
-       it "returns the number of unique words" do
-         expect(counter.unique_word_count).to eq(13)
-       end
-
-       it "is case insensitive" do
-         counter = Counter.new("Up down. Down up.")
-         expect(counter.unique_word_count).to eq(2)
+     describe "#longest_tokens" do
+       it "returns a hash of the tokens with the highest length, where each key is a token, and each value is its length" do
+         expected = {
+           'three' => 5
+         }
+         expect(counter.longest_tokens).to eq(expected)
        end
      end
    end
-
-   describe "count" do
-     it "returns count for a single word" do
-       counter = Counter.new("I am so clever that sometimes I don't understand a single word of what I am saying.")
-       expect(counter.count("i")).to eq(3)
-     end
-   end
-
-   describe "from_file" do
-     it "opens and reads a text file" do
-       counter = WordsCounted.from_file('spec/support/the_hart_and_the_hunter.txt')
-       expect(counter.word_count).to eq(139)
-     end
-   end
  end
data/spec/words_counted/deprecated_spec.rb ADDED
@@ -0,0 +1,99 @@
+ # -*- coding: utf-8 -*-
+ require_relative "../spec_helper"
+
+ module WordsCounted
+   warn "Methods being tested are deprecated"
+
+   describe Counter do
+     let(:counter) do
+       tokens = WordsCounted::Tokeniser.new("one three three three woot woot").tokenise
+       Counter.new(tokens)
+     end
+
+     describe "#word_density" do
+       it "returns words and their density in percent" do
+         expected = [
+           ['three', 50.0], ['woot', 33.33], ['one', 16.67]
+         ]
+         expect(counter.word_density).to eq(expected)
+       end
+
+       it "accepts a precision" do
+         expected = [
+           ['three', 50.0], ['woot', 33.3333], ['one', 16.6667]
+         ]
+         expect(counter.word_density(4)).to eq(expected)
+       end
+     end
+
+     describe "#word_occurrences" do
+       it "returns a hash of words and their frequencies" do
+         expected = {
+           'three' => 3, 'woot' => 2, 'one' => 1
+         }
+         expect(counter.word_occurrences).to eq(expected)
+       end
+     end
+
+     describe "#sorted_word_occurrences" do
+       it "returns a two-dimensional array sorted by descending word occurrence" do
+         expected = [
+           ['three', 3], ['woot', 2], ['one', 1]
+         ]
+         expect(counter.sorted_word_occurrences).to eq(expected)
+       end
+     end
+
+     describe "#word_lengths" do
+       it "returns a hash of words and their lengths sorted descending by length" do
+         expected = {
+           'three' => 5, 'woot' => 4, 'one' => 3
+         }
+         expect(counter.word_lengths).to eq(expected)
+       end
+     end
+
+     describe "#sorted_word_lengths" do
+       it "returns a two-dimensional array sorted by descending word length" do
+         expected = [
+           ['three', 5], ['woot', 4], ['one', 3]
+         ]
+         expect(counter.sorted_word_lengths).to eq(expected)
+       end
+     end
+
+     describe "#longest_words" do
+       it "returns a two-dimensional array of the longest words and their lengths" do
+         expected = [
+           ['three', 5]
+         ]
+         expect(counter.longest_words).to eq(expected)
+       end
+     end
+
+     describe "#most_occurring_words" do
+       it "returns a two-dimensional array of words with the highest frequency and their frequencies" do
+         expected = [
+           ['three', 3]
+         ]
+         expect(counter.most_occurring_words).to eq(expected)
+       end
+     end
+
+     describe "#average_chars_per_word" do
+       it "returns the average number of chars per word" do
+         expect(counter.average_chars_per_word).to eq(4.33)
+       end
+
+       it "accepts precision" do
+         expect(counter.average_chars_per_word(4)).to eq(4.3333)
+       end
+     end
+
+     describe "#count" do
+       it "returns count for a single word" do
+         expect(counter.count('one')).to eq(1)
+       end
+     end
+   end
+ end
data/spec/words_counted/tokeniser_spec.rb ADDED
@@ -0,0 +1,133 @@
+ # -*- coding: utf-8 -*-
+ require_relative "../spec_helper"
+
+ module WordsCounted
+   describe Tokeniser do
+     describe "initialize" do
+       it "sets @input" do
+         tokeniser = Tokeniser.new("Hello World!")
+         expect(tokeniser.instance_variables).to include(:@input)
+       end
+     end
+
+     describe "#tokenise" do
+       it "normalises tokens and returns an array" do
+         tokens = Tokeniser.new("Hello HELLO").tokenise
+         expect(tokens).to eq(%w[hello hello])
+       end
+
+       context "without arguments" do
+         it "removes non-alphanumeric chars" do
+           tokens = Tokeniser.new("Hello world! # $ % 12345 * & % ?").tokenise
+           expect(tokens).to eq(%w[hello world])
+         end
+
+         it "does not split on hyphens" do
+           tokens = Tokeniser.new("I am twenty-two.").tokenise
+           expect(tokens).to eq(%w[i am twenty-two])
+         end
+
+         it "does not split on apostrophe" do
+           tokens = Tokeniser.new("Bust 'em! It's Jim's gang.").tokenise
+           expect(tokens).to eq(%w[bust 'em it's jim's gang])
+         end
+
+         it "does not split on unicode chars" do
+           tokens = Tokeniser.new("Bayrūt").tokenise
+           expect(tokens).to eq(%w[bayrūt])
+         end
+       end
+
+       context "with `pattern` option" do
+         it "accepts a custom pattern" do
+           tokens = Tokeniser.new("We-Are-ALL").tokenise(pattern: /[^-]+/)
+           expect(tokens).to eq(%w[we are all])
+         end
+       end
+
+       context "with `exclude` option" do
+         context "as a string" do
+           let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+
+           it "accepts a string filter" do
+             tokens = tokeniser.tokenise(exclude: "magnificent")
+             expect(tokens).to eq(%w[that was trevor])
+           end
+
+           it "accepts a string filter with multiple space-delimited tokens" do
+             tokens = tokeniser.tokenise(exclude: "was magnificent")
+             expect(tokens).to eq(%w[that trevor])
+           end
+
+           it "normalises the string filter" do
+             tokens = tokeniser.tokenise(exclude: "MAGNIFICENT")
+             expect(tokens).to eq(%w[that was trevor])
+           end
+         end
+
+         context "as a regular expression" do
+           it "filters on match" do
+             tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+             tokens = tokeniser.tokenise(exclude: /magnificent/i)
+             expect(tokens).to eq(%w[that was trevor])
+           end
+         end
+
+         context "as a lambda" do
+           it "calls the lambda" do
+             tokeniser = Tokeniser.new("That was magnificent, Trevor.")
+             tokens = tokeniser.tokenise(exclude: ->(token) { token.length < 5 })
+             expect(tokens).to eq(%w[magnificent trevor])
+           end
+
+           it "accepts a symbol for shorthand notation" do
+             tokeniser = Tokeniser.new("That was magnificent, محمد.")
+             tokens = tokeniser.tokenise(exclude: :ascii_only?)
+             expect(tokens).to eq(%w[محمد])
+           end
+         end
+
+         context "as an array" do
+           let(:tokeniser) { Tokeniser.new("That was magnificent, Trevor.") }
+
+           it "accepts an array of strings" do
+             tokens = tokeniser.tokenise(exclude: ["magnificent"])
+             expect(tokens).to eq(%w[that was trevor])
+           end
+
+           it "accepts an array of regular expressions" do
+             tokens = tokeniser.tokenise(exclude: [/that/, /was/])
+             expect(tokens).to eq(%w[magnificent trevor])
+           end
+
+           it "accepts an array of lambdas" do
+             filters = [
+               ->(token) { token.length < 4 },
+               ->(token) { token.length > 6 }
+             ]
+             tokens = tokeniser.tokenise(exclude: filters)
+             expect(tokens).to eq(%w[that trevor])
+           end
+
+           it "accepts a mixed array" do
+             filters = [
+               "that",
+               ->(token) { token.length < 4 },
+               /magnificent/
+             ]
+             tokens = tokeniser.tokenise(exclude: filters)
+             expect(tokens).to eq(["trevor"])
+           end
+         end
+
+         context "with an invalid filter" do
+           it "raises an `ArgumentError`" do
+             expect {
+               Tokeniser.new("Hello world!").tokenise(exclude: 1)
+             }.to raise_error(ArgumentError)
+           end
+         end
+       end
+     end
+   end
+ end
data/spec/words_counted_spec.rb ADDED
@@ -0,0 +1,34 @@
+ # -*- coding: utf-8 -*-
+ require_relative "spec_helper"
+
+ describe WordsCounted do
+   describe ".from_file" do
+     let(:file_path) { "spec/support/the_hart_and_the_hunter.txt" }
+
+     it "opens and reads a text file" do
+       counter = WordsCounted.from_file(file_path)
+       expect(counter.token_count).to eq(139)
+     end
+
+     it "opens and reads a text file with options" do
+       counter = WordsCounted.from_file(file_path, exclude: "hunter")
+       expect(counter.token_count).to eq(135)
+     end
+   end
+
+   describe ".count" do
+     let(:string) do
+       "We are all in the gutter, but some of us are looking at the stars."
+     end
+
+     it "returns a counter instance with given input as tokens" do
+       counter = WordsCounted.count(string)
+       expect(counter.token_count).to eq(15)
+     end
+
+     it "returns a counter instance with given input and options" do
+       counter = WordsCounted.count(string, exclude: "the gutter")
+       expect(counter.token_count).to eq(12)
+     end
+   end
+ end
data/words_counted.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
    spec.version = WordsCounted::VERSION
    spec.authors = ["Mohamad El-Husseini"]
    spec.email = ["husseini.mel@gmail.com"]
-   spec.description = %q{A Ruby word counter and string analyser with helpful utility methods.}
+   spec.description = %q{A Ruby natural language processor to extract stats from text, such as word count and more.}
    spec.summary = %q{See README.}
    spec.homepage = "https://github.com/abitdodgy/words_counted"
    spec.license = "MIT"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: words_counted
  version: !ruby/object:Gem::Version
- version: 0.1.5
+ version: 1.0.0
  platform: ruby
  authors:
  - Mohamad El-Husseini
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-12-02 00:00:00.000000000 Z
+ date: 2015-10-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -66,7 +66,8 @@ dependencies:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- description: A Ruby word counter and string analyser with helpful utility methods.
+ description: A Ruby natural language processor to extract stats from text, such as
+   word count and more.
  email:
  - husseini.mel@gmail.com
  executables: []
@@ -74,19 +75,28 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".hound.yml"
  - ".rspec"
+ - ".ruby-style.yml"
+ - ".travis.yml"
  - ".yardopts"
  - CHANGELOG.md
  - Gemfile
  - LICENSE.txt
  - README.md
  - Rakefile
+ - lib/refinements/hash_refinements.rb
  - lib/words_counted.rb
  - lib/words_counted/counter.rb
+ - lib/words_counted/deprecated.rb
+ - lib/words_counted/tokeniser.rb
  - lib/words_counted/version.rb
  - spec/spec_helper.rb
  - spec/support/the_hart_and_the_hunter.txt
  - spec/words_counted/counter_spec.rb
+ - spec/words_counted/deprecated_spec.rb
+ - spec/words_counted/tokeniser_spec.rb
+ - spec/words_counted_spec.rb
  - words_counted.gemspec
  homepage: https://github.com/abitdodgy/words_counted
  licenses:
@@ -108,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.4.5
  signing_key:
  specification_version: 4
  summary: See README.
@@ -116,3 +126,6 @@ test_files:
  - spec/spec_helper.rb
  - spec/support/the_hart_and_the_hunter.txt
  - spec/words_counted/counter_spec.rb
+ - spec/words_counted/deprecated_spec.rb
+ - spec/words_counted/tokeniser_spec.rb
+ - spec/words_counted_spec.rb