words_counted 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +59 -40
- data/lib/words_counted/counter.rb +9 -5
- data/lib/words_counted/version.rb +1 -1
- data/spec/words_counted/counter_spec.rb +28 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0992a863d31573f13f2994d914701c22573edb2e
|
4
|
+
data.tar.gz: 83b5b5ca60aa6663321be5a24a791f829f9a0c23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1363df462e354e03a825db08fb2eebb877fd656a817061ea1eb3777fc95710a8276fb6dd109f720e8826b2f64acbe1bd855190cab5961914a7f59aed534e7f5
|
7
|
+
data.tar.gz: 8f238c41e476c485721da16e207cdc739ccdefb44c8fdb26693976eece3fcafceefd0ffcd11af744383248d4266009d08754e92af93d40866bd267f7ef40d52d
|
data/README.md
CHANGED
@@ -1,10 +1,14 @@
|
|
1
|
-
#
|
1
|
+
# WordsCounted
|
2
2
|
|
3
|
-
|
3
|
+
WordsCounted is a highly customisable Ruby string analyser. It includes many handy utility methods that go beyond word counting. You can use this gem to get word density, words and the number of times they occur, the highest occurring words, and a few more things.
|
4
4
|
|
5
5
|
I use *word* loosely since you can pass the program any string you want: words, numbers, characters, etc...
|
6
6
|
|
7
|
-
Pass your own regular expression to customise the criteria for splitting strings. This makes
|
7
|
+
Pass your own regular expression to customise the criteria for splitting strings. This makes WordsCounted very flexible, whether you want to count words, numbers, or special characters.
|
8
|
+
|
9
|
+
### Demo
|
10
|
+
|
11
|
+
Visit [the gem's website][4] for a demo.
|
8
12
|
|
9
13
|
### Features
|
10
14
|
|
@@ -22,7 +26,7 @@ Pass your own regular expression to customise the criteria for splitting strings
|
|
22
26
|
* Customisable criteria. Pass your own regexp rules to split strings if you prefer. The default regexp has two features:
|
23
27
|
* Filters special characters but respects hyphens and apostrophes.
|
24
28
|
* Plays nicely with diacritics (UTF and unicode characters): "São Paulo" is treated as `["São", "Paulo"]` and not `["S", "", "o", "Paulo"]`.
|
25
|
-
* Pass in a file path or a url instead of a string.
|
29
|
+
* Pass in a file path or a url instead of a string. WordsCounted opens and reads files.
|
26
30
|
|
27
31
|
See usage instructions for details on each feature.
|
28
32
|
|
@@ -40,7 +44,7 @@ Or install it yourself as:
|
|
40
44
|
|
41
45
|
$ gem install words_counted
|
42
46
|
|
43
|
-
##
|
47
|
+
## Quick usage
|
44
48
|
|
45
49
|
Pass in a string or a file path, and an optional filter and/or regexp.
|
46
50
|
|
@@ -53,7 +57,31 @@ counter = WordsCounted.count(
|
|
53
57
|
counter = WordsCounted.from_file("path/or/url/to/my/file.txt")
|
54
58
|
```
|
55
59
|
|
56
|
-
|
60
|
+
## API
|
61
|
+
|
62
|
+
### Class methods
|
63
|
+
|
64
|
+
#### `count(string, options = {})`
|
65
|
+
|
66
|
+
Initializes an analyser object.
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
counter = WordsCounted.count("Hello Beirut!")
|
70
|
+
````
|
71
|
+
|
72
|
+
Accepts two options: `exclude` and `regexp`. See [Excluding words from the analyser][5] and [Passing in a custom regexp][6] respectively.
|
73
|
+
|
74
|
+
#### `from_file(path, options = {})`
|
75
|
+
|
76
|
+
Initializes an analyser object from a file path.
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
counter = WordsCounted.from_file("path/or/url/to/my/file.txt")
|
80
|
+
````
|
81
|
+
|
82
|
+
Accepts the same options as `count()`.
|
83
|
+
|
84
|
+
### Instance methods
|
57
85
|
|
58
86
|
#### `.word_count`
|
59
87
|
|
@@ -74,22 +102,14 @@ counter.word_occurrences
|
|
74
102
|
"we" => 1,
|
75
103
|
"are" => 2,
|
76
104
|
"all" => 1,
|
77
|
-
|
78
|
-
"the" => 2,
|
79
|
-
"gutter" => 1,
|
80
|
-
"but" => 1,
|
81
|
-
"some" => 1,
|
82
|
-
"of" => 1,
|
83
|
-
"us" => 1,
|
84
|
-
"looking" => 1,
|
85
|
-
"at" => 1,
|
105
|
+
# ...
|
86
106
|
"stars" => 1
|
87
107
|
}
|
88
108
|
```
|
89
109
|
|
90
110
|
#### `.sorted_word_occurrences`
|
91
111
|
|
92
|
-
Returns a two
|
112
|
+
Returns a two-dimensional array of words and their number of occurrences sorted in descending order. Uppercase and lowercase words are counted as the same word.
|
93
113
|
|
94
114
|
```ruby
|
95
115
|
counter.sorted_word_occurrences
|
@@ -124,22 +144,14 @@ counter.word_lengths
|
|
124
144
|
"We" => 2,
|
125
145
|
"are" => 3,
|
126
146
|
"all" => 3,
|
127
|
-
|
128
|
-
"the" => 3,
|
129
|
-
"gutter" => 6,
|
130
|
-
"but" => 3,
|
131
|
-
"some" => 4,
|
132
|
-
"of" => 2,
|
133
|
-
"us" => 2,
|
134
|
-
"looking" => 7,
|
135
|
-
"at" => 2,
|
147
|
+
# ...
|
136
148
|
"stars" => 5
|
137
149
|
}
|
138
150
|
```
|
139
151
|
|
140
152
|
#### `.sorted_word_lengths`
|
141
153
|
|
142
|
-
Returns a two
|
154
|
+
Returns a two-dimensional array of words and their lengths sorted in descending order.
|
143
155
|
|
144
156
|
```ruby
|
145
157
|
counter.sorted_word_lengths
|
@@ -174,7 +186,7 @@ counter.words
|
|
174
186
|
|
175
187
|
#### `.word_density([ precision = 2 ])`
|
176
188
|
|
177
|
-
Returns a two-
|
189
|
+
Returns a two-dimensional array of words and their density to a precision of two. It accepts a precision argument which defaults to two.
|
178
190
|
|
179
191
|
```ruby
|
180
192
|
counter.word_density
|
@@ -183,15 +195,7 @@ counter.word_density
|
|
183
195
|
["are", 13.33],
|
184
196
|
["the", 13.33],
|
185
197
|
["but", 6.67 ],
|
186
|
-
|
187
|
-
["of", 6.67 ],
|
188
|
-
["some", 6.67 ],
|
189
|
-
["looking", 6.67 ],
|
190
|
-
["gutter", 6.67 ],
|
191
|
-
["at", 6.67 ],
|
192
|
-
["in", 6.67 ],
|
193
|
-
["all", 6.67 ],
|
194
|
-
["stars", 6.67 ],
|
198
|
+
# ...
|
195
199
|
["we", 6.67 ]
|
196
200
|
]
|
197
201
|
```
|
@@ -214,12 +218,20 @@ counter.average_chars_per_word #=> 4
|
|
214
218
|
|
215
219
|
#### `.unique_word_count`
|
216
220
|
|
217
|
-
Returns the count of unique words in the string.
|
221
|
+
Returns the count of unique words in the string. This is case insensitive.
|
218
222
|
|
219
223
|
```ruby
|
220
224
|
counter.unique_word_count #=> 13
|
221
225
|
```
|
222
226
|
|
227
|
+
#### `.count(word)`
|
228
|
+
|
229
|
+
Counts the occurrence of a word in the string.
|
230
|
+
|
231
|
+
```ruby
|
232
|
+
counter.count("are") #=> 2
|
233
|
+
```
|
234
|
+
|
223
235
|
## Excluding words from the analyser
|
224
236
|
|
225
237
|
You can exclude anything you want from the string you want to analyse by passing in the `exclude` option. The exclude option accepts a variety of filters.
|
@@ -310,17 +322,21 @@ In this example `-you` and `you` are counted as separate words. Writers should u
|
|
310
322
|
|
311
323
|
Another gotcha is that the default criteria does not include numbers in its analysis. Remember that you can pass your own regular expression if the default behaviour does not fit your needs.
|
312
324
|
|
325
|
+
### A note on case sensitivity
|
326
|
+
|
327
|
+
The program will downcase all incoming strings for consistency.
|
328
|
+
|
313
329
|
## Road Map
|
314
330
|
|
315
331
|
1. Add ability to open URLs.
|
316
332
|
2. Add paragraph, sentence, average words per sentence, and average sentence chars counters.
|
317
333
|
|
318
|
-
#### Ability to
|
334
|
+
#### Ability to read URLs
|
319
335
|
|
320
336
|
Something like...
|
321
337
|
|
322
338
|
```ruby
|
323
|
-
def self.
|
339
|
+
def self.from_url
|
324
340
|
# open url and send string here after removing html
|
325
341
|
end
|
326
342
|
```
|
@@ -335,7 +351,7 @@ end
|
|
335
351
|
|
336
352
|
Originally I wrote this program for a code challenge on Treehouse. You can find the original implementation on [Code Review][1].
|
337
353
|
|
338
|
-
##
|
354
|
+
## Contributors
|
339
355
|
|
340
356
|
Thanks to Dave Yarwood for helping me improve my code. Some of my code is based on his recommendations. You can find the original program implementation, as well as Dave's code review, on [Code Review][1].
|
341
357
|
|
@@ -353,3 +369,6 @@ Thanks to [Wayne Conrad][2] for providing [an excellent code review][3], and imp
|
|
353
369
|
[1]: http://codereview.stackexchange.com/questions/46105/a-ruby-string-analyser
|
354
370
|
[2]: https://github.com/wconrad
|
355
371
|
[3]: http://codereview.stackexchange.com/a/49476/1563
|
372
|
+
[4]: http://rubywordcount.com
|
373
|
+
[5]: https://github.com/abitdodgy/words_counted#excluding-words-from-the-analyser
|
374
|
+
[6]: https://github.com/abitdodgy/words_counted#passing-in-a-custom-regexp
|
@@ -13,9 +13,9 @@ module WordsCounted
|
|
13
13
|
def initialize(string, options = {})
|
14
14
|
@options = options
|
15
15
|
exclude = filter_proc(options[:exclude])
|
16
|
-
@words = string.scan(regexp).reject { |word| exclude.call(word) }
|
17
|
-
@char_count =
|
18
|
-
@word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word
|
16
|
+
@words = string.scan(regexp).map(&:downcase).reject { |word| exclude.call(word) }
|
17
|
+
@char_count = words.join.size
|
18
|
+
@word_occurrences = words.each_with_object(Hash.new(0)) { |word, hash| hash[word] += 1 }
|
19
19
|
@word_lengths = words.each_with_object({}) { |word, hash| hash[word] ||= word.length }
|
20
20
|
end
|
21
21
|
|
@@ -54,6 +54,10 @@ module WordsCounted
|
|
54
54
|
sort_by_descending_value word_lengths
|
55
55
|
end
|
56
56
|
|
57
|
+
def count(match)
|
58
|
+
words.select { |word| word == match.downcase }.size
|
59
|
+
end
|
60
|
+
|
57
61
|
private
|
58
62
|
|
59
63
|
def highest_ranking(entries)
|
@@ -77,14 +81,14 @@ module WordsCounted
|
|
77
81
|
elsif filter.respond_to?(:to_str)
|
78
82
|
exclusion_list = filter.split.collect(&:downcase)
|
79
83
|
->(word) {
|
80
|
-
exclusion_list.include?(word
|
84
|
+
exclusion_list.include?(word)
|
81
85
|
}
|
82
86
|
elsif regexp_filter = Regexp.try_convert(filter)
|
83
87
|
Proc.new { |word| word =~ regexp_filter }
|
84
88
|
elsif filter.respond_to?(:to_proc)
|
85
89
|
filter.to_proc
|
86
90
|
else
|
87
|
-
raise ArgumentError, "Filter must String, Array, Lambda, or Regexp"
|
91
|
+
raise ArgumentError, "Filter must String, Array, Lambda, or a Regexp"
|
88
92
|
end
|
89
93
|
end
|
90
94
|
end
|
@@ -33,62 +33,62 @@ module WordsCounted
|
|
33
33
|
end
|
34
34
|
|
35
35
|
it "splits words" do
|
36
|
-
expect(counter.words).to eq(%w[
|
36
|
+
expect(counter.words).to eq(%w[we are all in the gutter but some of us are looking at the stars])
|
37
37
|
end
|
38
38
|
|
39
39
|
it "removes special characters" do
|
40
40
|
counter = Counter.new("Hello! # $ % 12345 * & % How do you do?")
|
41
|
-
expect(counter.words).to eq(%w[
|
41
|
+
expect(counter.words).to eq(%w[hello how do you do])
|
42
42
|
end
|
43
43
|
|
44
44
|
it "counts hyphenated words as one" do
|
45
45
|
counter = Counter.new("I am twenty-two.")
|
46
|
-
expect(counter.words).to eq(%w[
|
46
|
+
expect(counter.words).to eq(%w[i am twenty-two])
|
47
47
|
end
|
48
48
|
|
49
49
|
it "does not split words on apostrophe" do
|
50
50
|
counter = Counter.new("Bust 'em! Them be Jim's bastards'.")
|
51
|
-
expect(counter.words).to eq(%w[
|
51
|
+
expect(counter.words).to eq(%w[bust 'em them be jim's bastards'])
|
52
52
|
end
|
53
53
|
|
54
54
|
it "does not split on unicode chars" do
|
55
55
|
counter = Counter.new("São Paulo")
|
56
|
-
expect(counter.words).to eq(%w[
|
56
|
+
expect(counter.words).to eq(%w[são paulo])
|
57
57
|
end
|
58
58
|
|
59
59
|
it "it accepts a string filter" do
|
60
60
|
counter = Counter.new("That was magnificent, Trevor.", exclude: "magnificent")
|
61
|
-
expect(counter.words).to eq(%w[
|
61
|
+
expect(counter.words).to eq(%w[that was trevor])
|
62
62
|
end
|
63
63
|
|
64
64
|
it "it accepts a string filter with multiple words" do
|
65
65
|
counter = Counter.new("That was magnificent, Trevor.", exclude: "was magnificent")
|
66
|
-
expect(counter.words).to eq(%w[
|
66
|
+
expect(counter.words).to eq(%w[that trevor])
|
67
67
|
end
|
68
68
|
|
69
69
|
it "filters words in uppercase when using a string filter" do
|
70
70
|
counter = Counter.new("That was magnificent, Trevor.", exclude: "Magnificent")
|
71
|
-
expect(counter.words).to eq(%w[
|
71
|
+
expect(counter.words).to eq(%w[that was trevor])
|
72
72
|
end
|
73
73
|
|
74
74
|
it "accepts a regexp filter" do
|
75
75
|
counter = Counter.new("That was magnificent, Trevor.", exclude: /magnificent/i)
|
76
|
-
expect(counter.words).to eq(%w[
|
76
|
+
expect(counter.words).to eq(%w[that was trevor])
|
77
77
|
end
|
78
78
|
|
79
79
|
it "accepts an array filter" do
|
80
80
|
counter = Counter.new("That was magnificent, Trevor.", exclude: ['That', 'was'])
|
81
|
-
expect(counter.words).to eq(%w[magnificent
|
81
|
+
expect(counter.words).to eq(%w[magnificent trevor])
|
82
82
|
end
|
83
83
|
|
84
84
|
it "accepts a lambda filter" do
|
85
|
-
counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) {w == '
|
86
|
-
expect(counter.words).to eq(%w[was magnificent
|
85
|
+
counter = Counter.new("That was magnificent, Trevor.", exclude: ->(w) { w == 'that' })
|
86
|
+
expect(counter.words).to eq(%w[was magnificent trevor])
|
87
87
|
end
|
88
88
|
|
89
89
|
it "accepts a custom regexp" do
|
90
90
|
counter = Counter.new("I am 007.", regexp: /[\p{Alnum}\-']+/)
|
91
|
-
expect(counter.words).to eq(["
|
91
|
+
expect(counter.words).to eq(["i", "am", "007"])
|
92
92
|
end
|
93
93
|
|
94
94
|
it "char_count should be calculated after the filter is applied" do
|
@@ -143,7 +143,7 @@ module WordsCounted
|
|
143
143
|
|
144
144
|
it "returns a hash of word lengths" do
|
145
145
|
counter = Counter.new("One two three.")
|
146
|
-
expect(counter.word_lengths).to eq({ "
|
146
|
+
expect(counter.word_lengths).to eq({ "one" => 3, "two" => 3, "three" => 5 })
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
@@ -154,7 +154,7 @@ module WordsCounted
|
|
154
154
|
|
155
155
|
it "returns a two dimensional array sorted by descending word length" do
|
156
156
|
counter = Counter.new("I am not certain of that")
|
157
|
-
expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["
|
157
|
+
expect(counter.sorted_word_lengths).to eq([ ["certain", 7], ["that", 4], ["not", 3], ["of", 2], ["am", 2], ["i", 1] ])
|
158
158
|
end
|
159
159
|
end
|
160
160
|
|
@@ -165,7 +165,7 @@ module WordsCounted
|
|
165
165
|
|
166
166
|
it "returns the longest words" do
|
167
167
|
counter = Counter.new("Those whom the gods love grow young.")
|
168
|
-
expect(counter.longest_words).to eq([["
|
168
|
+
expect(counter.longest_words).to eq([["those", 5],["young", 5]])
|
169
169
|
end
|
170
170
|
end
|
171
171
|
|
@@ -218,6 +218,18 @@ module WordsCounted
|
|
218
218
|
it "returns the number of unique words" do
|
219
219
|
expect(counter.unique_word_count).to eq(13)
|
220
220
|
end
|
221
|
+
|
222
|
+
it "is case insensitive" do
|
223
|
+
counter = Counter.new("Up down. Down up.")
|
224
|
+
expect(counter.unique_word_count).to eq(2)
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
|
229
|
+
describe "count" do
|
230
|
+
it "returns count for a single word" do
|
231
|
+
counter = Counter.new("I am so clever that sometimes I don't understand a single word of what I am saying.")
|
232
|
+
expect(counter.count("i")).to eq(3)
|
221
233
|
end
|
222
234
|
end
|
223
235
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: words_counted
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mohamad El-Husseini
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|