words_counted 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +134 -102
- data/lib/words_counted/counter.rb +24 -10
- data/lib/words_counted/version.rb +1 -1
- data/spec/words_counted/counter_spec.rb +46 -8
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f3043d1270f35c595ab088f6278749274daf60d3
|
4
|
+
data.tar.gz: 46e566b6a0fc584393ed066d4827cf7b18a4dc0a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05b360658c8cf57b1c117afde1a36623b5172a0ef2c53bfb39a8a4d1a7a3cf634896b5176ebf8f68aa0fb1e4afb84350a27805d5906a7524c84404240c54d097
|
7
|
+
data.tar.gz: 6bd402a6df332407b2b333d4c859980986bf61df46a9a7f5dac484c65fcddc49a6d60f0f0e54c51d0f70b1ecfc6c1b58a61d095887a766ad0dcf97e9a9876dc3
|
data/README.md
CHANGED
@@ -1,22 +1,27 @@
|
|
1
1
|
# Words Counted
|
2
2
|
|
3
|
-
Words Counted is a
|
3
|
+
Words Counted is a highly customisable Ruby string analyser. It includes some handy utility methods that go beyond word counting. You can use this gem to get word density, words and their number of occurrences, the highest occurring words, and a few more things.
|
4
4
|
|
5
|
-
|
5
|
+
I use the word *word* loosely here, since you can pass the program any string you want: words, numbers, characters, etc...
|
6
|
+
|
7
|
+
You can pass in your custom criteria for splitting strings in the form of a custom regular expression. This affords you a great deal of flexibility, whether you want to count words, numbers, or special characters.
|
6
8
|
|
7
9
|
### Features
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
11
|
+
* Get the following data from any string:
|
12
|
+
* Word count
|
13
|
+
* Unique word count
|
14
|
+
* Word density
|
15
|
+
* Character count
|
16
|
+
* Average characters per word
|
17
|
+
* A hash map of words and the number of times they occur
|
18
|
+
* A hash map of words and their lengths
|
19
|
+
* The longest word(s) and its length
|
20
|
+
* The most occurring word(s) and its number of occurrences.
|
21
|
+
* A flexible way to exclude words (or anything) from the count. You can pass in a **string**, a **regexp**, an **array**, or a **lambda**.
|
22
|
+
* Filters special characters but respects hyphens and apostrophes.
|
23
|
+
* Plays nicely with diacritics (UTF and unicode characters): "São Paulo" is treated as `["São", "Paulo"]` and not `["S", "", "o", "Paulo"]`.
|
24
|
+
* Customisable criteria. Pass in your own regexp rules to split strings if you prefer.
|
20
25
|
|
21
26
|
See usage instructions for details on each feature.
|
22
27
|
|
@@ -48,7 +53,7 @@ counter = WordsCounted::Counter.new(
|
|
48
53
|
|
49
54
|
#### `.word_count`
|
50
55
|
|
51
|
-
Returns the word count of a given string. The word count includes only alpha characters. Hyphenated and words with apostrophes are considered a single word. You can pass in your own
|
56
|
+
Returns the word count of a given string. The word count includes only alpha characters. Hyphenated and words with apostrophes are considered a single word. You can pass in your own regular expression if this is not desired behaviour.
|
52
57
|
|
53
58
|
```ruby
|
54
59
|
counter.word_count #=> 15
|
@@ -60,23 +65,22 @@ Returns a hash map of words and their number of occurrences. Uppercase and lower
|
|
60
65
|
|
61
66
|
```ruby
|
62
67
|
counter.word_occurrences
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
#
|
68
|
+
|
69
|
+
{
|
70
|
+
"we" => 1,
|
71
|
+
"are" => 2,
|
72
|
+
"all" => 1,
|
73
|
+
"in" => 1,
|
74
|
+
"the" => 2,
|
75
|
+
"gutter" => 1,
|
76
|
+
"but" => 1,
|
77
|
+
"some" => 1,
|
78
|
+
"of" => 1,
|
79
|
+
"us" => 1,
|
80
|
+
"looking" => 1,
|
81
|
+
"at" => 1,
|
82
|
+
"stars" => 1
|
83
|
+
}
|
80
84
|
```
|
81
85
|
|
82
86
|
#### `.most_occurring_words`
|
@@ -85,12 +89,8 @@ Returns a two dimensional array of the most occurring word and its number of occ
|
|
85
89
|
|
86
90
|
```ruby
|
87
91
|
counter.most_occurring_words
|
88
|
-
|
89
|
-
|
90
|
-
# ["are", 2],
|
91
|
-
# ["the", 2]
|
92
|
-
# ]
|
93
|
-
#
|
92
|
+
|
93
|
+
[ ["are", 2], ["the", 2] ]
|
94
94
|
```
|
95
95
|
|
96
96
|
#### `.word_lengths`
|
@@ -99,23 +99,22 @@ Returns a hash of words and their lengths.
|
|
99
99
|
|
100
100
|
```ruby
|
101
101
|
counter.word_lengths
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
#
|
102
|
+
|
103
|
+
{
|
104
|
+
"We" => 2,
|
105
|
+
"are" => 3,
|
106
|
+
"all" => 3,
|
107
|
+
"in" => 2,
|
108
|
+
"the" => 3,
|
109
|
+
"gutter" => 6,
|
110
|
+
"but" => 3,
|
111
|
+
"some" => 4,
|
112
|
+
"of" => 2,
|
113
|
+
"us" => 2,
|
114
|
+
"looking" => 7,
|
115
|
+
"at" => 2,
|
116
|
+
"stars" => 5
|
117
|
+
}
|
119
118
|
```
|
120
119
|
|
121
120
|
#### `.longest_word`
|
@@ -124,11 +123,8 @@ Returns a two dimensional array of the longest word and its length. In case ther
|
|
124
123
|
|
125
124
|
```ruby
|
126
125
|
counter.longest_words
|
127
|
-
|
128
|
-
|
129
|
-
# ["looking", 7]
|
130
|
-
# ]
|
131
|
-
#
|
126
|
+
|
127
|
+
[ ["looking", 7] ]
|
132
128
|
```
|
133
129
|
|
134
130
|
#### `.words`
|
@@ -146,23 +142,22 @@ Returns a two-dimentional array of words and their density.
|
|
146
142
|
|
147
143
|
```ruby
|
148
144
|
counter.word_density
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
#
|
145
|
+
|
146
|
+
[
|
147
|
+
["are", 13.33],
|
148
|
+
["the", 13.33],
|
149
|
+
["but", 6.67],
|
150
|
+
["us", 6.67],
|
151
|
+
["of", 6.67],
|
152
|
+
["some", 6.67],
|
153
|
+
["looking", 6.67],
|
154
|
+
["gutter", 6.67],
|
155
|
+
["at", 6.67],
|
156
|
+
["in", 6.67],
|
157
|
+
["all", 6.67],
|
158
|
+
["stars", 6.67],
|
159
|
+
["we", 6.67]
|
160
|
+
]
|
166
161
|
```
|
167
162
|
|
168
163
|
#### `.char_count`
|
@@ -192,34 +187,63 @@ counter.unique_word_count
|
|
192
187
|
#=> 13
|
193
188
|
```
|
194
189
|
|
195
|
-
##
|
190
|
+
## Excluding words from the analyser
|
191
|
+
|
192
|
+
You can exclude anything you want from the string you want to analyse by passing in an `exclude` option. The exclude option accepts a variety of filters.
|
196
193
|
|
197
|
-
|
194
|
+
1. A *space-delimited* list of candidates. The filter will remove both uppercase and lowercase variants of the candidate, when applicable. Useful for excluding *the*, *a*, and so on.
|
195
|
+
2. An array of string candidates. For example: `['a', 'the']`.
|
196
|
+
3. A regular expression.
|
197
|
+
4. A lambda.
|
198
198
|
|
199
|
+
#### Using a string
|
199
200
|
```ruby
|
200
201
|
WordsCounted::Counter.new(
|
201
|
-
"Magnificent! That was magnificent, Trevor.",
|
202
|
+
"Magnificent! That was magnificent, Trevor.", exclude: "was magnificent"
|
202
203
|
)
|
203
204
|
counter.words
|
204
205
|
#=> ["That", "Trevor"]
|
205
206
|
```
|
206
207
|
|
208
|
+
#### Using an array
|
209
|
+
```ruby
|
210
|
+
WordsCounted::Counter.new("1 2 3 4 5 6", regexp: /[0-9]/, exclude: ['1', '2', '3'])
|
211
|
+
counter.words
|
212
|
+
#=> ["4", "5", "6"]
|
213
|
+
```
|
214
|
+
|
215
|
+
#### Using a regular expression
|
216
|
+
```ruby
|
217
|
+
WordsCounted::Counter.new("Hello Beirut", exclude: /Beirut/)
|
218
|
+
counter.words
|
219
|
+
#=> ["Hello"]
|
220
|
+
```
|
221
|
+
|
222
|
+
#### Using a lambda
|
223
|
+
```ruby
|
224
|
+
WordsCounted::Counter.new(
|
225
|
+
"1 2 3 4 5 6", regexp: /[0-9]/, exclude: ->(w) { w.to_i.even? }
|
226
|
+
)
|
227
|
+
counter.words
|
228
|
+
#=> ["1", "3", "5"]
|
229
|
+
```
|
230
|
+
|
207
231
|
## Passing in a Custom Regexp
|
208
232
|
|
209
|
-
Defining words is tricky business. Out of the box, the default regexp accounts for letters, hyphenated words, and apostrophes. This means
|
233
|
+
Defining words is tricky business. Out of the box, the default regexp accounts for letters, hyphenated words, and apostrophes. This means *twenty-one* is treated as one word. So is *Mohamad's*.
|
210
234
|
|
211
235
|
```ruby
|
212
236
|
/[\p{Alpha}\-']+/
|
213
237
|
```
|
214
238
|
|
215
|
-
But maybe you don't want to count words? Well, count anything you want. What you count is only limited by your knowledge of regular expressions. Pass in your own criteria in the form of a Ruby
|
239
|
+
But maybe you don't want to count words? Well, count anything you want. What you count is only limited by your knowledge of regular expressions. Pass in your own criteria in the form of a Ruby regular expression to split your string as desired.
|
216
240
|
|
217
|
-
For example, if you wanted to
|
241
|
+
For example, if you wanted to include numbers in your analysis, you can override the regular expression:
|
218
242
|
|
219
243
|
```ruby
|
220
|
-
counter = WordsCounted::Counter.new("
|
244
|
+
counter = WordsCounted::Counter.new("Numbers 1, 2, and 3", regexp: /[\p{Alnum}\-']+/)
|
221
245
|
counter.words
|
222
|
-
#=> ["
|
246
|
+
#=> ["Numbers", "1", "2", "and", "3"]
|
223
247
|
```
|
224
248
|
|
225
249
|
## Gotchas
|
@@ -228,34 +252,31 @@ A hyphen used in leu of an *em* or *en* dash will form part of the word and thro
|
|
228
252
|
|
229
253
|
```ruby
|
230
254
|
counter = WordsCounted::Counter.new("How do you do?-you are well, I see.")
|
231
|
-
#<WordsCounted::Counter:0x007fd494252518 @words=["How", "do", "you", "do", "-you", "are", "well", "I", "see"]>
|
232
|
-
|
233
255
|
counter.word_occurrences
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
#
|
256
|
+
|
257
|
+
{
|
258
|
+
"how" => 1,
|
259
|
+
"do" => 2,
|
260
|
+
"you" => 1,
|
261
|
+
"-you" => 1, # WTF, mate!
|
262
|
+
"are" => 1,
|
263
|
+
"very" => 1,
|
264
|
+
"well" => 1,
|
265
|
+
"i" => 1,
|
266
|
+
"see" => 1
|
267
|
+
}
|
247
268
|
```
|
248
269
|
|
249
270
|
In this example, `-you` and `you` are counted as separate words. Writers should use the correct dash element, but this is not always the case.
|
250
271
|
|
251
|
-
Another gotcha is that the default criteria does not
|
272
|
+
Another gotcha is that the default criteria does not include numbers in its analysis. Remember that you can pass in your own regular expression if the default behaviour does not fit your needs.
|
252
273
|
|
253
274
|
## Road Map
|
254
275
|
|
255
276
|
1. Add ability to open files or URLs.
|
256
277
|
2. Add paragraph, sentence, average words per sentence, and average sentence chars counters.
|
257
278
|
|
258
|
-
#### Ability to open files
|
279
|
+
#### Ability to open files and URLs
|
259
280
|
|
260
281
|
Maybe I can add some class methods to open the file and init the counter class.
|
261
282
|
|
@@ -277,7 +298,13 @@ end
|
|
277
298
|
|
278
299
|
## About
|
279
300
|
|
280
|
-
Originally I wrote this program for a code challenge
|
301
|
+
Originally I wrote this program for a code challenge on Treehouse. You can find the original implementation on [Code Review][1].
|
302
|
+
|
303
|
+
## Contributors
|
304
|
+
|
305
|
+
Thanks to Dave Yarwood for helping me improve my code. Some of my code is based on his recommendations. You can find the original program implementation, as well as Dave's code review, on [Code Review][1].
|
306
|
+
|
307
|
+
Thanks to [Wayne Conrad][2] for providing [an excellent code review][3], and improving the filter feature well beyond what I could have come up with.
|
281
308
|
|
282
309
|
## Contributing
|
283
310
|
|
@@ -286,3 +313,8 @@ Originally I wrote this program for a code challenge. My initial implementation
|
|
286
313
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
287
314
|
4. Push to the branch (`git push origin my-new-feature`)
|
288
315
|
5. Create new Pull Request
|
316
|
+
|
317
|
+
|
318
|
+
[1]: http://codereview.stackexchange.com/questions/46105/a-ruby-string-analyser
|
319
|
+
[2]: https://github.com/wconrad
|
320
|
+
[3]: http://codereview.stackexchange.com/a/47515/1563
|
@@ -2,12 +2,13 @@ module WordsCounted
|
|
2
2
|
class Counter
|
3
3
|
attr_reader :words, :word_occurrences, :word_lengths, :char_count
|
4
4
|
|
5
|
-
|
5
|
+
WORD_REGEXP = /[\p{Alpha}\-']+/
|
6
6
|
|
7
7
|
def initialize(string, options = {})
|
8
8
|
@options = options
|
9
|
-
|
10
|
-
@words = string.scan(
|
9
|
+
exclude = filter_proc(options[:exclude])
|
10
|
+
@words = string.scan(regexp).reject { |word| exclude.call(word) }
|
11
|
+
@char_count = @words.join.size
|
11
12
|
@word_occurrences = words.each_with_object(Hash.new(0)) do |word, hash|
|
12
13
|
hash[word.downcase] += 1
|
13
14
|
end
|
@@ -40,7 +41,7 @@ module WordsCounted
|
|
40
41
|
end.sort_by { |_, value| value }.reverse
|
41
42
|
end
|
42
43
|
|
43
|
-
|
44
|
+
private
|
44
45
|
|
45
46
|
def highest_ranking(entries)
|
46
47
|
entries.group_by { |word, value| value }.sort.last.last
|
@@ -50,15 +51,28 @@ module WordsCounted
|
|
50
51
|
(n.to_f / word_count.to_f * 100.0).round(2)
|
51
52
|
end
|
52
53
|
|
53
|
-
def
|
54
|
-
@options[:
|
54
|
+
def regexp
|
55
|
+
@options[:regexp] || WORD_REGEXP
|
55
56
|
end
|
56
57
|
|
57
|
-
def filter
|
58
|
-
if
|
59
|
-
|
58
|
+
def filter_proc(filter)
|
59
|
+
if filter.respond_to?(:to_a)
|
60
|
+
filter_procs = Array(filter).map(&method(:filter_proc))
|
61
|
+
->(word) {
|
62
|
+
filter_procs.any? { |p| p.call(word) }
|
63
|
+
}
|
64
|
+
elsif filter.respond_to?(:to_str)
|
65
|
+
exclusion_list = filter.split.collect(&:downcase)
|
66
|
+
->(w) {
|
67
|
+
exclusion_list.include?(w.downcase)
|
68
|
+
}
|
69
|
+
elsif Regexp.try_convert(filter)
|
70
|
+
filter = Regexp.try_convert(filter)
|
71
|
+
Proc.new { |w| w =~ filter }
|
72
|
+
elsif filter.respond_to?(:to_proc)
|
73
|
+
filter.to_proc
|
60
74
|
else
|
61
|
-
|
75
|
+
raise ArgumentError, "Incorrect filter type"
|
62
76
|
end
|
63
77
|
end
|
64
78
|
end
|
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require_relative "../spec_helper"
|
2
3
|
|
3
4
|
module WordsCounted
|
4
5
|
describe Counter do
|
@@ -55,20 +56,45 @@ module WordsCounted
|
|
55
56
|
expect(counter.words).to eq(%w[São Paulo])
|
56
57
|
end
|
57
58
|
|
58
|
-
it "
|
59
|
-
counter = Counter.new("That was magnificent, Trevor.",
|
59
|
+
it "it accepts a string filter" do
|
60
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
|
60
61
|
expect(counter.words).to eq(%w[That was Trevor])
|
61
62
|
end
|
62
63
|
|
63
|
-
it "
|
64
|
-
counter = Counter.new("That was magnificent, Trevor.",
|
64
|
+
it "it accepts a string filter with multiple words" do
|
65
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
|
66
|
+
expect(counter.words).to eq(%w[That Trevor])
|
67
|
+
end
|
68
|
+
|
69
|
+
it "filters words in uppercase when using a string filter" do
|
70
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
|
71
|
+
expect(counter.words).to eq(%w[That was Trevor])
|
72
|
+
end
|
73
|
+
|
74
|
+
it "accepts a regexp filter" do
|
75
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
|
65
76
|
expect(counter.words).to eq(%w[That was Trevor])
|
66
77
|
end
|
67
78
|
|
68
|
-
it "
|
69
|
-
counter = Counter.new("
|
79
|
+
it "accepts an array filter" do
|
80
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
|
81
|
+
expect(counter.words).to eq(%w[magnificent Trevor])
|
82
|
+
end
|
83
|
+
|
84
|
+
it "accepts a lambda filter" do
|
85
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) {w == 'That'})
|
86
|
+
expect(counter.words).to eq(%w[was magnificent Trevor])
|
87
|
+
end
|
88
|
+
|
89
|
+
it "accepts a custom regexp" do
|
90
|
+
counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
|
70
91
|
expect(counter.words).to eq(["I", "am", "007"])
|
71
92
|
end
|
93
|
+
|
94
|
+
it "char_count should be calculated after the filter is applied" do
|
95
|
+
counter = Counter.new("I am Legend.", exclude: "I am")
|
96
|
+
expect(counter.char_count).to eq(6)
|
97
|
+
end
|
72
98
|
end
|
73
99
|
|
74
100
|
describe ".word_count" do
|
@@ -134,14 +160,26 @@ module WordsCounted
|
|
134
160
|
|
135
161
|
describe ".char_count" do
|
136
162
|
it "returns the number of chars in the passed in string" do
|
137
|
-
|
163
|
+
counter = Counter.new("His name was major, Major Major Major Major.")
|
164
|
+
expect(counter.char_count).to eq(35)
|
165
|
+
end
|
166
|
+
|
167
|
+
it "returns the number of chars in the passed in string after the filter is applied" do
|
168
|
+
counter = Counter.new("His name was major, Major Major Major Major.", exclude: "Major")
|
169
|
+
expect(counter.char_count).to eq(10)
|
138
170
|
end
|
139
171
|
end
|
140
172
|
|
141
173
|
describe ".average_chars_per_word" do
|
142
174
|
it "returns the average number of chars per word" do
|
175
|
+
counter = Counter.new("His name was major, Major Major Major Major.")
|
143
176
|
expect(counter.average_chars_per_word).to eq(4)
|
144
177
|
end
|
178
|
+
|
179
|
+
it "returns the average number of chars per word after the filter is applied" do
|
180
|
+
counter = Counter.new("His name was major, Major Major Major Major.", exclude: "Major")
|
181
|
+
expect(counter.average_chars_per_word).to eq(3)
|
182
|
+
end
|
145
183
|
end
|
146
184
|
|
147
185
|
describe ".unique_word_count" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: words_counted
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mohamad El-Husseini
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -106,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
106
|
version: '0'
|
107
107
|
requirements: []
|
108
108
|
rubyforge_project:
|
109
|
-
rubygems_version: 2.
|
109
|
+
rubygems_version: 2.2.2
|
110
110
|
signing_key:
|
111
111
|
specification_version: 4
|
112
112
|
summary: See README.
|