words_counted 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +92 -67
- data/lib/words_counted/counter.rb +8 -0
- data/lib/words_counted/version.rb +1 -1
- data/spec/words_counted/counter_spec.rb +22 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2d80aa2eb60a01c71a85f3b28b02231496c79ab
|
4
|
+
data.tar.gz: 93cd0029317b142161f4cb3170207fb507b82a50
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2fa36912b371084ddb87af00a26db538bcd8836abc34e51a76f5560d5bbb2e133a3dd4b980fe96cc7da07f251ce188b9ca183fda5a08c731e0450572bdf816f0
|
7
|
+
data.tar.gz: b91c95329ad60db96adfda332a9a933de013122660c47b5da85c290ae5dc25cbb98763afae20ee19b595aa4e8e526ee5648db461199e20dbc9df275ea23dc496
|
data/README.md
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
Words Counted is a highly customisable Ruby string analyser. It includes many handy utility methods that go beyond word counting. You can use this gem to get word density, words and the number of times they occur, the highest occurring words, and a few more things.
|
4
4
|
|
5
|
-
I use *word* loosely
|
5
|
+
I use *word* loosely since you can pass the program any string you want: words, numbers, characters, etc...
|
6
6
|
|
7
|
-
Pass
|
7
|
+
Pass your own regular expression to customise the criteria for splitting strings. This makes Words Counted very flexible, whether you want to count words, numbers, or special characters.
|
8
8
|
|
9
9
|
### Features
|
10
10
|
|
@@ -18,11 +18,11 @@ Pass in your own regular expression to customise the criteria for splitting stri
|
|
18
18
|
* A hash map of words and their lengths
|
19
19
|
* The longest word(s) and its length
|
20
20
|
* The most occurring word(s) and its number of occurrences.
|
21
|
-
* A flexible way to exclude words (or anything) from the count. You can pass
|
22
|
-
* Customisable criteria. Pass
|
21
|
+
* A flexible way to exclude words (or anything) from the count. You can pass a **string**, a **regexp**, an **array**, or a **lambda**.
|
22
|
+
* Customisable criteria. Pass your own regexp rules to split strings if you prefer. The default regexp has two features:
|
23
23
|
* Filters special characters but respects hyphens and apostrophes.
|
24
24
|
* Plays nicely with diacritics (UTF and unicode characters): "São Paulo" is treated as `["São", "Paulo"]` and not `["S", "", "o", "Paulo"]`.
|
25
|
-
* Pass in a file path instead of a string.
|
25
|
+
* Pass in a file path or a url instead of a string. Words Counted opens and reads files.
|
26
26
|
|
27
27
|
See usage instructions for details on each feature.
|
28
28
|
|
@@ -45,7 +45,9 @@ Or install it yourself as:
|
|
45
45
|
Pass in a string or a file path, and an optional filter and/or regexp.
|
46
46
|
|
47
47
|
```ruby
|
48
|
-
counter = WordsCounted.count(
|
48
|
+
counter = WordsCounted.count(
|
49
|
+
"We are all in the gutter, but some of us are looking at the stars."
|
50
|
+
)
|
49
51
|
|
50
52
|
# Using a file
|
51
53
|
counter = WordsCounted.from_file("path/to/my/file.txt")
|
@@ -63,26 +65,42 @@ counter.word_count #=> 15
|
|
63
65
|
|
64
66
|
#### `.word_occurrences`
|
65
67
|
|
66
|
-
Returns
|
68
|
+
Returns an unsorted hash map of words and their number of occurrences. Uppercase and lowercase words are counted as the same word.
|
67
69
|
|
68
70
|
```ruby
|
69
71
|
counter.word_occurrences
|
70
72
|
|
71
73
|
{
|
72
|
-
"we"
|
73
|
-
"are"
|
74
|
-
"all"
|
75
|
-
"in"
|
76
|
-
"the"
|
77
|
-
"gutter"
|
78
|
-
"but"
|
79
|
-
"some"
|
80
|
-
"of"
|
81
|
-
"us"
|
74
|
+
"we" => 1,
|
75
|
+
"are" => 2,
|
76
|
+
"all" => 1,
|
77
|
+
"in" => 1,
|
78
|
+
"the" => 2,
|
79
|
+
"gutter" => 1,
|
80
|
+
"but" => 1,
|
81
|
+
"some" => 1,
|
82
|
+
"of" => 1,
|
83
|
+
"us" => 1,
|
82
84
|
"looking" => 1,
|
83
|
-
"at"
|
84
|
-
"stars"
|
85
|
-
|
85
|
+
"at" => 1,
|
86
|
+
"stars" => 1
|
87
|
+
}
|
88
|
+
```
|
89
|
+
|
90
|
+
#### `.sorted_word_occurrences`
|
91
|
+
|
92
|
+
Returns a two dimensional array of words and their number of occurrences sorted in descending order. Uppercase and lowercase words are counted as the same word.
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
counter.sorted_word_occurrences
|
96
|
+
|
97
|
+
[
|
98
|
+
["the", 2],
|
99
|
+
["are", 2],
|
100
|
+
["we", 1],
|
101
|
+
# ...
|
102
|
+
["all", 1]
|
103
|
+
]
|
86
104
|
```
|
87
105
|
|
88
106
|
#### `.most_occurring_words`
|
@@ -97,28 +115,44 @@ counter.most_occurring_words
|
|
97
115
|
|
98
116
|
#### `.word_lengths`
|
99
117
|
|
100
|
-
Returns
|
118
|
+
Returns an unsorted hash of words and their lengths.
|
101
119
|
|
102
120
|
```ruby
|
103
121
|
counter.word_lengths
|
104
122
|
|
105
123
|
{
|
106
|
-
"We"
|
107
|
-
"are"
|
108
|
-
"all"
|
109
|
-
"in"
|
110
|
-
"the"
|
111
|
-
"gutter"
|
112
|
-
"but"
|
113
|
-
"some"
|
114
|
-
"of"
|
115
|
-
"us"
|
124
|
+
"We" => 2,
|
125
|
+
"are" => 3,
|
126
|
+
"all" => 3,
|
127
|
+
"in" => 2,
|
128
|
+
"the" => 3,
|
129
|
+
"gutter" => 6,
|
130
|
+
"but" => 3,
|
131
|
+
"some" => 4,
|
132
|
+
"of" => 2,
|
133
|
+
"us" => 2,
|
116
134
|
"looking" => 7,
|
117
|
-
"at"
|
118
|
-
"stars"
|
135
|
+
"at" => 2,
|
136
|
+
"stars" => 5
|
119
137
|
}
|
120
138
|
```
|
121
139
|
|
140
|
+
#### `.sorted_word_lengths`
|
141
|
+
|
142
|
+
Returns a two dimensional array of words and their lengths sorted in descending order.
|
143
|
+
|
144
|
+
```ruby
|
145
|
+
counter.sorted_word_lengths
|
146
|
+
|
147
|
+
[
|
148
|
+
["looking", 7],
|
149
|
+
["gutter", 6],
|
150
|
+
["stars", 5],
|
151
|
+
# ...
|
152
|
+
["in", 2]
|
153
|
+
]
|
154
|
+
```
|
155
|
+
|
122
156
|
#### `.longest_word`
|
123
157
|
|
124
158
|
Returns a two dimensional array of the longest word and its length. In case there is a tie all tied words are returned.
|
@@ -167,8 +201,7 @@ counter.word_density
|
|
167
201
|
Returns the string's character count.
|
168
202
|
|
169
203
|
```ruby
|
170
|
-
counter.char_count
|
171
|
-
#=> 76
|
204
|
+
counter.char_count #=> 76
|
172
205
|
```
|
173
206
|
|
174
207
|
#### `.average_chars_per_word`
|
@@ -176,8 +209,7 @@ counter.char_count
|
|
176
209
|
Returns the average character count per word.
|
177
210
|
|
178
211
|
```ruby
|
179
|
-
counter.average_chars_per_word
|
180
|
-
#=> 4
|
212
|
+
counter.average_chars_per_word #=> 4
|
181
213
|
```
|
182
214
|
|
183
215
|
#### `.unique_word_count`
|
@@ -185,15 +217,14 @@ counter.average_chars_per_word
|
|
185
217
|
Returns the count of unique words in the string.
|
186
218
|
|
187
219
|
```ruby
|
188
|
-
counter.unique_word_count
|
189
|
-
#=> 13
|
220
|
+
counter.unique_word_count #=> 13
|
190
221
|
```
|
191
222
|
|
192
223
|
## Excluding words from the analyser
|
193
224
|
|
194
|
-
You can exclude anything you want from the string you want to analyse by passing in
|
225
|
+
You can exclude anything you want from the string you want to analyse by passing in the `exclude` option. The exclude option accepts a variety of filters.
|
195
226
|
|
196
|
-
1. A *space-delimited* list of candidates. The filter will remove both uppercase and lowercase variants of the candidate
|
227
|
+
1. A *space-delimited* list of candidates. The filter will remove both uppercase and lowercase variants of the candidate when applicable. Useful for excluding *the*, *a*, and so on.
|
197
228
|
2. An array of string candidates. For example: `['a', 'the']`.
|
198
229
|
3. A regular expression.
|
199
230
|
4. A lambda.
|
@@ -223,22 +254,20 @@ counter.words
|
|
223
254
|
|
224
255
|
#### Using a lambda
|
225
256
|
```ruby
|
226
|
-
WordsCounted.count(
|
227
|
-
"1 2 3 4 5 6", regexp: /[0-9]/, exclude: ->(w) { w.to_i.even? }
|
228
|
-
)
|
257
|
+
counter = WordsCounted.count("1 2 3 4 5 6", regexp: /[0-9]/, exclude: ->(w) { w.to_i.even? })
|
229
258
|
counter.words
|
230
259
|
#=> ["1", "3", "5"]
|
231
260
|
```
|
232
261
|
|
233
262
|
## Passing in a Custom Regexp
|
234
263
|
|
235
|
-
Defining words is tricky
|
264
|
+
Defining words is tricky. The default regexp accounts for letters, hyphenated words, and apostrophes. This means *twenty-one* is treated as one word. So is *Mohamad's*.
|
236
265
|
|
237
266
|
```ruby
|
238
267
|
/[\p{Alpha}\-']+/
|
239
268
|
```
|
240
269
|
|
241
|
-
But maybe you don't want to count words
|
270
|
+
But maybe you don't want to count words? Well, analyse anything you want. What you analyse is only limited by your knowledge of regular expressions. Pass your own criteria as a Ruby regular expression to split your string as desired.
|
242
271
|
|
243
272
|
For example, if you wanted to include numbers in your analysis, you can override the regular expression:
|
244
273
|
|
@@ -250,7 +279,7 @@ counter.words
|
|
250
279
|
|
251
280
|
## Opening and Reading Files
|
252
281
|
|
253
|
-
Use the `from_file` method to open files.
|
282
|
+
Use the `from_file` method to open files. `from_file` accepts the same options as `count`. The file path can be a URL.
|
254
283
|
|
255
284
|
```ruby
|
256
285
|
counter = WordsCounted.from_file("url/or/path/to/file.text")
|
@@ -258,28 +287,28 @@ counter = WordsCounted.from_file("url/or/path/to/file.text")
|
|
258
287
|
|
259
288
|
## Gotchas
|
260
289
|
|
261
|
-
A hyphen used in leu of an *em* or *en* dash will form part of the word
|
290
|
+
A hyphen used in lieu of an *em* or *en* dash will form part of the word. This affects the `word_occurrences` algorithm.
|
262
291
|
|
263
292
|
```ruby
|
264
293
|
counter = WordsCounted.count("How do you do?-you are very well, I see.")
|
265
294
|
counter.word_occurrences
|
266
295
|
|
267
296
|
{
|
268
|
-
"how"
|
269
|
-
"do"
|
270
|
-
"you"
|
271
|
-
"-you"
|
272
|
-
"are"
|
273
|
-
"very"
|
274
|
-
"well"
|
275
|
-
"i"
|
276
|
-
"see"
|
297
|
+
"how" => 1,
|
298
|
+
"do" => 2,
|
299
|
+
"you" => 1,
|
300
|
+
"-you" => 1, # WTF, mate!
|
301
|
+
"are" => 1,
|
302
|
+
"very" => 1,
|
303
|
+
"well" => 1,
|
304
|
+
"i" => 1,
|
305
|
+
"see" => 1
|
277
306
|
}
|
278
307
|
```
|
279
308
|
|
280
|
-
In this example
|
309
|
+
In this example `-you` and `you` are counted as separate words. Writers should use the correct dash element, but this is not always the case.
|
281
310
|
|
282
|
-
Another gotcha is that the default criteria does not include numbers in its analysis. Remember that you can pass
|
311
|
+
Another gotcha is that the default criteria does not include numbers in its analysis. Remember that you can pass your own regular expression if the default behaviour does not fit your needs.
|
283
312
|
|
284
313
|
## Road Map
|
285
314
|
|
@@ -288,15 +317,11 @@ Another gotcha is that the default criteria does not include numbers in its anal
|
|
288
317
|
|
289
318
|
#### Ability to open URLs
|
290
319
|
|
291
|
-
|
320
|
+
Something like...
|
292
321
|
|
293
322
|
```ruby
|
294
323
|
def self.count_from_url
|
295
|
-
|
296
|
-
end
|
297
|
-
|
298
|
-
def self.from_file
|
299
|
-
new # open file and send string here.
|
324
|
+
# open url and send string here after removing html
|
300
325
|
end
|
301
326
|
```
|
302
327
|
|
@@ -314,7 +339,7 @@ Originally I wrote this program for a code challenge on Treehouse. You can find
|
|
314
339
|
|
315
340
|
Thanks to Dave Yarwood for helping me improve my code. Some of my code is based on his recommendations. You can find the original program implementation, as well as Dave's code review, on [Code Review][1].
|
316
341
|
|
317
|
-
Thanks to [Wayne Conrad][2] for providing [an excellent code review][3], and improving the filter feature well beyond what I can come up with.
|
342
|
+
Thanks to [Wayne Conrad][2] for providing [an excellent code review][3], and improving the filter feature to well beyond what I can come up with.
|
318
343
|
|
319
344
|
## Contributing
|
320
345
|
|
@@ -327,4 +352,4 @@ Thanks to [Wayne Conrad][2] for providing [an excellent code review][3], and imp
|
|
327
352
|
|
328
353
|
[1]: http://codereview.stackexchange.com/questions/46105/a-ruby-string-analyser
|
329
354
|
[2]: https://github.com/wconrad
|
330
|
-
[3]: http://codereview.stackexchange.com/a/
|
355
|
+
[3]: http://codereview.stackexchange.com/a/49476/1563
|
@@ -41,6 +41,14 @@ module WordsCounted
|
|
41
41
|
end.sort_by { |_, value| value }.reverse
|
42
42
|
end
|
43
43
|
|
44
|
+
def sorted_word_occurrences
|
45
|
+
word_occurrences.sort_by { |_, v| v }.reverse
|
46
|
+
end
|
47
|
+
|
48
|
+
def sorted_word_lengths
|
49
|
+
word_lengths.sort_by { |_, v| v }.reverse
|
50
|
+
end
|
51
|
+
|
44
52
|
private
|
45
53
|
|
46
54
|
def highest_ranking(entries)
|
@@ -114,6 +114,17 @@ module WordsCounted
|
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
117
|
+
describe "sorted_word_occurrences" do
|
118
|
+
it "returns an array" do
|
119
|
+
expect(counter.sorted_word_occurrences).to be_a(Array)
|
120
|
+
end
|
121
|
+
|
122
|
+
it "returns a two dimensional array sorted by descending word occurrence" do
|
123
|
+
counter = Counter.new("Blue, green, green, green, orange, green, orange, red, orange, red")
|
124
|
+
expect(counter.sorted_word_occurrences).to eq([ ["green", 4], ["orange", 3], ["red", 2], ["blue", 1] ])
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
117
128
|
describe "most_occurring_words" do
|
118
129
|
it "returns an array" do
|
119
130
|
expect(counter.most_occurring_words).to be_a(Array)
|
@@ -136,6 +147,17 @@ module WordsCounted
|
|
136
147
|
end
|
137
148
|
end
|
138
149
|
|
150
|
+
describe "sorted_word_lengths" do
|
151
|
+
it "returns an array" do
|
152
|
+
expect(counter.sorted_word_lengths).to be_a(Array)
|
153
|
+
end
|
154
|
+
|
155
|
+
it "returns a two dimensional array sorted by descending word length" do
|
156
|
+
counter = Counter.new("I am not certain of that")
|
157
|
+
expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["I", 1] ])
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
139
161
|
describe "longest_words" do
|
140
162
|
it "returns an array" do
|
141
163
|
expect(counter.longest_words).to be_a(Array)
|