word_count_analyzer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +554 -0
  8. data/Rakefile +2 -0
  9. data/lib/word_count_analyzer.rb +14 -0
  10. data/lib/word_count_analyzer/analyzer.rb +34 -0
  11. data/lib/word_count_analyzer/contraction.rb +176 -0
  12. data/lib/word_count_analyzer/counter.rb +230 -0
  13. data/lib/word_count_analyzer/date.rb +149 -0
  14. data/lib/word_count_analyzer/ellipsis.rb +48 -0
  15. data/lib/word_count_analyzer/hyperlink.rb +53 -0
  16. data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
  17. data/lib/word_count_analyzer/number.rb +23 -0
  18. data/lib/word_count_analyzer/numbered_list.rb +61 -0
  19. data/lib/word_count_analyzer/punctuation.rb +52 -0
  20. data/lib/word_count_analyzer/slash.rb +84 -0
  21. data/lib/word_count_analyzer/version.rb +3 -0
  22. data/lib/word_count_analyzer/xhtml.rb +26 -0
  23. data/spec/spec_helper.rb +1 -0
  24. data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
  25. data/spec/word_count_analyzer/contraction_spec.rb +124 -0
  26. data/spec/word_count_analyzer/counter_spec.rb +647 -0
  27. data/spec/word_count_analyzer/date_spec.rb +257 -0
  28. data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
  29. data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
  30. data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
  31. data/spec/word_count_analyzer/number_spec.rb +63 -0
  32. data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
  33. data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
  34. data/spec/word_count_analyzer/slash_spec.rb +105 -0
  35. data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
  36. data/word_count_analyzer.gemspec +26 -0
  37. metadata +153 -0
@@ -0,0 +1,3 @@
1
module WordCountAnalyzer
  # Release version of the gem. Frozen so the constant cannot be mutated
  # in place at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,26 @@
1
module WordCountAnalyzer
  # Detects, strips, and counts XHTML-style tags (anything matching
  # XHTML_REGEX) inside a string.
  class Xhtml
    # Matches an opening or closing tag: "<", an optional "/", then everything
    # up to and including the next ">".
    # Rubular: http://rubular.com/r/ENrVFMdJ8v
    XHTML_REGEX = /<\/?[^>]*>/

    attr_reader :string

    # string - the text to inspect.
    def initialize(string:)
      @string = string
    end

    # Returns true when the string contains at least one tag, false otherwise.
    def includes_xhtml?
      !(string =~ XHTML_REGEX).nil?
    end

    # Returns a copy of the string with every tag replaced by a single space.
    def replace
      string.gsub(XHTML_REGEX, ' ')
    end

    # Returns the difference between the number of whitespace-separated tokens
    # in the raw string and in the tag-stripped (and trimmed) string.
    def count_difference_word_boundary
      raw_tokens = string.split(/\s+/)
      stripped_tokens = replace.strip.split(/\s+/)
      raw_tokens.size - stripped_tokens.size
    end

    # Returns the number of tags divided by two (integer division), i.e. the
    # number of open/close tag pairs.
    #
    # Tags are counted directly from the regex matches rather than by
    # substituting a sentinel word and counting that word, so text that
    # happens to contain the sentinel no longer inflates the result.
    def occurences
      string.scan(XHTML_REGEX).size / 2
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'word_count_analyzer'
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
# Verifies that Analyzer#analyze tallies each gray-area category found in a
# text that contains all of them.
RSpec.describe WordCountAnalyzer::Analyzer do
  # The context label names the method under test (#analyze).
  context '#analyze' do
    it 'should analyze the gray areas' do
      sample = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"

      # Expected number of occurrences per gray-area category.
      expected_tally = {
        "ellipsis" => 1,
        "hyperlink" => 2,
        "contraction" => 4,
        "hyphenated_word" => 2,
        "date" => 2,
        "number" => 1,
        "numbered_list" => 3,
        "xhtml" => 1,
        "forward_slash" => 1,
        "backslash" => 1,
        "dotted_line" => 1,
        "dashed_line" => 1,
        "underscore" => 1,
        "stray_punctuation" => 5
      }

      analyzer = described_class.new(text: sample)
      expect(analyzer.analyze).to eq(expected_tally)
    end
  end
end
@@ -0,0 +1,124 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Contraction: detection (#contraction?), expanded word count
# (#expanded_count) and textual expansion (#replace).
RSpec.describe WordCountAnalyzer::Contraction do
  # A fresh tagger per example, as with the original `before` hook.
  let(:tgr) { EngTagger.new }

  # Builds the object under test. Extra keyword options (e.g. hyphen:) are
  # forwarded only when given, so examples that omit `hyphen` still exercise
  # the constructor's own default.
  def contraction_for(token, following_token, **options)
    described_class.new(token: token, following_token: following_token, tgr: tgr, **options)
  end

  context '#contraction?' do
    it 'returns true if the token is a contraction' do
      subject_under_test = contraction_for("when'd", nil, hyphen: nil)
      expect(subject_under_test.contraction?).to eq(true)
    end

    it 'returns true if the token is an irregular contraction' do
      subject_under_test = contraction_for("o'clock", nil, hyphen: nil)
      expect(subject_under_test.contraction?).to eq(true)
    end

    it 'returns false if the token is a possessive and not a contraction' do
      subject_under_test = contraction_for("Bob's", "car", hyphen: nil)
      expect(subject_under_test.contraction?).to eq(false)
    end

    it "returns true if an 's token is followed by 'the'" do
      subject_under_test = contraction_for("Bob's", "the", hyphen: nil)
      expect(subject_under_test.contraction?).to eq(true)
    end

    it "returns true if an 's token is followed by 'open'" do
      subject_under_test = contraction_for("Bob's", "open", hyphen: nil)
      expect(subject_under_test.contraction?).to eq(true)
    end

    it 'returns true if the token is a capitalized contraction' do
      subject_under_test = contraction_for("Don't", "count", hyphen: nil)
      expect(subject_under_test.contraction?).to eq(true)
    end
  end

  context '#expanded_count' do
    it 'returns the count of the contraction expanded #001' do
      subject_under_test = contraction_for("when'd", nil, hyphen: nil)
      expect(subject_under_test.expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #002' do
      subject_under_test = contraction_for("o'clock", nil, hyphen: nil)
      expect(subject_under_test.expanded_count).to eq(3)
    end

    it 'returns the count of the contraction expanded #003' do
      subject_under_test = contraction_for("Bob's", "car", hyphen: nil)
      expect(subject_under_test.expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #004' do
      subject_under_test = contraction_for("Bob's", "the", hyphen: nil)
      expect(subject_under_test.expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #005' do
      subject_under_test = contraction_for("cat-o'-nine-tails", nil, hyphen: 'count_as_one')
      expect(subject_under_test.expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #006' do
      subject_under_test = contraction_for("cat-o'-nine-tails", nil, hyphen: 'count_as_multiple')
      expect(subject_under_test.expanded_count).to eq(4)
    end
  end

  context '#replace' do
    it 'replaces the token with the contraction expanded #001' do
      subject_under_test = contraction_for("cat-o'-nine-tails", nil)
      expect(subject_under_test.replace).to eq("cat-of-nine-tails")
    end

    it 'replaces the token with the contraction expanded #002' do
      subject_under_test = contraction_for("Bob's", "the")
      expect(subject_under_test.replace).to eq(" word word ")
    end

    it 'replaces the token with the contraction expanded #003' do
      subject_under_test = contraction_for("don't", nil)
      expect(subject_under_test.replace).to eq("do not")
    end

    it 'replaces the token with the contraction expanded #004' do
      subject_under_test = contraction_for("hello", nil)
      expect(subject_under_test.replace).to eq("hello")
    end
  end
end
@@ -0,0 +1,647 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Counter: one context per configurable option (invalid value,
# each accepted value, and the default), followed by combined-option examples
# and presets that mimic the Pages and Microsoft Word / wc counts.
RSpec.describe WordCountAnalyzer::Counter do
  # A single sentence block containing every gray area at once; shared by the
  # combined-option, Pages and Microsoft Word examples.
  let(:gray_area_text) do
    "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
  end

  context 'ellipsis' do
    it 'handles an invalid ellipsis argument value' do
      # The counter is built outside the expectation so that only #count is
      # allowed to raise.
      counter = described_class.new(text: 'hello world.', ellipsis: 'hello')
      expect { counter.count }.to raise_error('The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`')
    end

    it 'ignores ellipses in the word count' do
      counter = described_class.new(text: 'hello world ... what day is it.', ellipsis: 'ignore')
      expect(counter.count).to eq(6)
    end

    it 'does not ignore ellipses in the word count' do
      counter = described_class.new(text: 'hello world ... what day is it.', ellipsis: 'no_special_treatment')
      expect(counter.count).to eq(7)
    end

    it 'does not count an ellipsis attached to a word as a separate word' do
      counter = described_class.new(text: 'hello world... what day is it.', ellipsis: 'no_special_treatment')
      expect(counter.count).to eq(6)
    end

    it 'sets ignore as the default option' do
      counter = described_class.new(text: 'hello world ... what day is it.')
      expect(counter.count).to eq(6)
    end
  end

  context 'hyperlink' do
    let(:text) { 'The site address is http://www.example.com she said.' }

    it 'handles an invalid hyperlink argument value' do
      counter = described_class.new(text: 'hello world.', hyperlink: 'hello')
      expect { counter.count }.to raise_error('The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`')
    end

    it 'counts a hyperlink as one word in the word count' do
      counter = described_class.new(text: text, hyperlink: 'count_as_one')
      expect(counter.count).to eq(7)
    end

    it 'splits a hyperlink at its periods for the word count' do
      counter = described_class.new(text: text, hyperlink: 'split_at_period', forward_slash: 'count_as_one')
      expect(counter.count).to eq(9)
    end

    it 'does not search for hyperlinks' do
      counter = described_class.new(text: text, hyperlink: 'no_special_treatment')
      expect(counter.count).to eq(8)
    end

    it 'sets count_as_one as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(7)
    end
  end

  context 'contraction' do
    let(:text) { "Don't do that." }

    it 'handles an invalid contraction argument value' do
      counter = described_class.new(text: 'hello world.', contraction: 'hello')
      expect { counter.count }.to raise_error('The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
    end

    it 'counts a contraction as one word in the word count' do
      counter = described_class.new(text: text, contraction: 'count_as_one')
      expect(counter.count).to eq(3)
    end

    it 'splits a contraction into its separate words for the word count' do
      counter = described_class.new(text: text, contraction: 'count_as_multiple')
      expect(counter.count).to eq(4)
    end

    it 'sets count_as_one as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(3)
    end
  end

  context 'hyphenated_word' do
    let(:text) { 'He has a devil-may-care attitude.' }

    it 'handles an invalid hyphenated_word argument value' do
      counter = described_class.new(text: 'hello world.', hyphenated_word: 'hello')
      expect { counter.count }.to raise_error('The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
    end

    it 'counts a hyphenated word as one word in the word count' do
      counter = described_class.new(text: text, hyphenated_word: 'count_as_one')
      expect(counter.count).to eq(5)
    end

    it 'splits a hyphenated word into its separate words for the word count' do
      counter = described_class.new(text: text, hyphenated_word: 'count_as_multiple')
      expect(counter.count).to eq(7)
    end

    it 'sets count_as_one as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(5)
    end
  end

  context 'date' do
    let(:text) { 'Today is Tues. March 3rd, 2011.' }

    it 'handles an invalid date argument value' do
      counter = described_class.new(text: 'hello world.', date: 'hello')
      expect { counter.count }.to raise_error('The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`')
    end

    it 'ignores date placeables' do
      counter = described_class.new(text: text, date: 'no_special_treatment')
      expect(counter.count).to eq(6)
    end

    it 'counts a date placeable as one word in the word count' do
      counter = described_class.new(text: text, date: 'count_as_one')
      expect(counter.count).to eq(3)
    end

    # The expected count equals the `no_special_treatment` result above, and
    # the error message asserted in this context names it as the default.
    it 'sets no_special_treatment as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(6)
    end
  end

  context 'number' do
    let(:text) { 'There is $300 in the safe. The password is 1234.' }

    it 'handles an invalid number argument value' do
      counter = described_class.new(text: 'hello world.', number: 'hello')
      expect { counter.count }.to raise_error('The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
    end

    it 'counts a number as a word' do
      counter = described_class.new(text: text, number: 'count')
      expect(counter.count).to eq(10)
    end

    it 'ignores numbers in the word count' do
      counter = described_class.new(text: text, number: 'ignore')
      expect(counter.count).to eq(8)
    end

    it 'sets count as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(10)
    end
  end

  context 'numbered_list' do
    let(:text) { "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c." }

    it 'handles an invalid numbered_list argument value' do
      counter = described_class.new(text: 'hello world.', numbered_list: 'hello')
      expect { counter.count }.to raise_error('The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
    end

    it 'counts a numbered list number as a word' do
      counter = described_class.new(text: text, numbered_list: 'count')
      expect(counter.count).to eq(17)
    end

    it 'ignores numbered list numbers' do
      counter = described_class.new(text: text, numbered_list: 'ignore')
      expect(counter.count).to eq(14)
    end

    it 'sets count as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(17)
    end
  end

  context 'xhtml' do
    let(:text) { "<span class='orange-text'>Hello world</span>" }

    it 'handles an invalid xhtml argument value' do
      counter = described_class.new(text: 'hello world.', xhtml: 'hello')
      expect { counter.count }.to raise_error('The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`')
    end

    it 'removes all xhtml from the text' do
      counter = described_class.new(text: text, xhtml: 'remove')
      expect(counter.count).to eq(2)
    end

    it 'keeps xhtml in the text (forward slash counted as one)' do
      counter = described_class.new(text: text, xhtml: 'keep', forward_slash: 'count_as_one')
      expect(counter.count).to eq(3)
    end

    it 'keeps xhtml in the text' do
      counter = described_class.new(text: text, xhtml: 'keep')
      expect(counter.count).to eq(4)
    end

    it 'sets remove as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(2)
    end
  end

  context 'forward_slash' do
    let(:text) { "She/he/it said hello. 4/22/2013." }

    it 'handles an invalid forward_slash argument value' do
      counter = described_class.new(text: 'hello world.', forward_slash: 'hello')
      expect { counter.count }.to raise_error('The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`')
    end

    it 'counts a forward slash as multiple words (except dates) #001' do
      counter = described_class.new(text: text, forward_slash: 'count_as_multiple_except_dates')
      expect(counter.count).to eq(6)
    end

    it 'counts a forward slash as multiple words #002' do
      counter = described_class.new(text: text, forward_slash: 'count_as_multiple')
      expect(counter.count).to eq(8)
    end

    it 'counts a forward slash as multiple words #003' do
      counter = described_class.new(text: text, forward_slash: 'count_as_multiple', date: 'count_as_one')
      expect(counter.count).to eq(6)
    end

    it 'counts a forward slash as one word' do
      counter = described_class.new(text: "She/he/it said hello.", forward_slash: 'count_as_one')
      expect(counter.count).to eq(3)
    end

    it 'sets count_as_multiple_except_dates as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(6)
    end
  end

  context 'backslash' do
    # Single-quoted on purpose: the backslashes are literal path separators.
    let(:text) { 'The file location is c:\Users\johndoe' }

    it 'handles an invalid backslash argument value' do
      counter = described_class.new(text: 'hello world.', backslash: 'hello')
      expect { counter.count }.to raise_error('The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
    end

    it 'counts a token with backslashes as one word' do
      counter = described_class.new(text: text, backslash: 'count_as_one')
      expect(counter.count).to eq(5)
    end

    it 'counts a token with backslashes as multiple words' do
      counter = described_class.new(text: text, backslash: 'count_as_multiple')
      expect(counter.count).to eq(7)
    end

    it 'sets count_as_one as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(5)
    end
  end

  context 'dotted_line' do
    let(:text) { 'Here is one …………………………………………………………………… and another ......' }

    it 'handles an invalid dotted_line argument value' do
      counter = described_class.new(text: 'hello world.', dotted_line: 'hello')
      expect { counter.count }.to raise_error('The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores continuous strings of dots in the word count' do
      counter = described_class.new(text: text, dotted_line: 'ignore')
      expect(counter.count).to eq(5)
    end

    it 'counts a continuous string of dots as a word' do
      counter = described_class.new(text: text, dotted_line: 'count')
      expect(counter.count).to eq(7)
    end

    it 'sets ignore as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(5)
    end
  end

  context 'dashed_line' do
    let(:text) { 'Here is one ----- and another -----' }

    it 'handles an invalid dashed_line argument value' do
      counter = described_class.new(text: 'hello world.', dashed_line: 'hello')
      expect { counter.count }.to raise_error('The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores continuous strings of dashes in the word count' do
      counter = described_class.new(text: text, dashed_line: 'ignore')
      expect(counter.count).to eq(5)
    end

    it 'counts a continuous string of dashes as a word' do
      counter = described_class.new(text: text, dashed_line: 'count')
      expect(counter.count).to eq(7)
    end

    it 'sets ignore as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(5)
    end
  end

  context 'underscore' do
    let(:text) { 'Here is one ______ and another ______' }

    it 'handles an invalid underscore argument value' do
      counter = described_class.new(text: 'hello world.', underscore: 'hello')
      expect { counter.count }.to raise_error('The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores continuous strings of underscores in the word count' do
      counter = described_class.new(text: text, underscore: 'ignore')
      expect(counter.count).to eq(5)
    end

    it 'counts a continuous string of underscores as a word' do
      counter = described_class.new(text: text, underscore: 'count')
      expect(counter.count).to eq(7)
    end

    it 'sets ignore as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(5)
    end
  end

  context 'stray_punctuation' do
    let(:text) { 'Hello world ? This is another - sentence .' }

    it 'handles an invalid stray_punctuation argument value' do
      counter = described_class.new(text: 'hello world.', stray_punctuation: 'hello')
      expect { counter.count }.to raise_error('The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores stray punctuation in the word count' do
      counter = described_class.new(text: text, stray_punctuation: 'ignore')
      expect(counter.count).to eq(6)
    end

    it 'counts each stray punctuation mark as a word' do
      counter = described_class.new(text: text, stray_punctuation: 'count')
      expect(counter.count).to eq(9)
    end

    it 'sets ignore as the default option' do
      counter = described_class.new(text: text)
      expect(counter.count).to eq(6)
    end
  end

  it 'counts the words in a string #001' do
    counter = described_class.new(
      text: gray_area_text,
      ellipsis: 'ignore',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'remove',
      forward_slash: 'count_as_one',
      backslash: 'count_as_one',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    )
    expect(counter.count).to eq(62)
  end

  it 'counts the words in a string #002' do
    counter = described_class.new(
      text: gray_area_text,
      ellipsis: 'no_special_treatment',
      hyperlink: 'no_special_treatment',
      contraction: 'count_as_multiple',
      hyphenated_word: 'count_as_multiple',
      date: 'count_as_one',
      number: 'ignore',
      numbered_list: 'ignore',
      xhtml: 'keep',
      forward_slash: 'count_as_multiple',
      backslash: 'count_as_multiple',
      dotted_line: 'count',
      dashed_line: 'count',
      underscore: 'count',
      stray_punctuation: 'count'
    )
    expect(counter.count).to eq(77)
  end

  it 'counts the words in a string #003' do
    counter = described_class.new(text: gray_area_text)
    expect(counter.count).to eq(64)
  end

  it 'counts the words in a string #004' do
    counter = described_class.new(text: gray_area_text, forward_slash: 'count_as_multiple')
    expect(counter.count).to eq(66)
  end

  context 'Pages Word Count' do
    it 'reverse engineers Pages word count #001' do
      counter = described_class.new(
        text: gray_area_text,
        ellipsis: 'no_special_treatment',
        hyperlink: 'split_at_period',
        contraction: 'count_as_one',
        hyphenated_word: 'count_as_multiple',
        date: 'no_special_treatment',
        number: 'count',
        numbered_list: 'count',
        xhtml: 'keep',
        forward_slash: 'count_as_multiple',
        backslash: 'count_as_multiple',
        dotted_line: 'ignore',
        dashed_line: 'ignore',
        underscore: 'ignore',
        stray_punctuation: 'ignore'
      )
      expect(counter.count).to eq(79)
    end

    it 'reverse engineers Pages word count #002' do
      counter = described_class.new(text: gray_area_text)
      expect(counter.pages_count).to eq(79)
    end
  end

  context 'Microsoft Word Count' do
    it 'reverse engineers the Microsoft Word / wc (Unix) word count #001' do
      counter = described_class.new(
        text: gray_area_text,
        ellipsis: 'no_special_treatment',
        hyperlink: 'count_as_one',
        contraction: 'count_as_one',
        hyphenated_word: 'count_as_one',
        date: 'no_special_treatment',
        number: 'count',
        numbered_list: 'count',
        xhtml: 'keep',
        forward_slash: 'count_as_one',
        backslash: 'count_as_one',
        dotted_line: 'count',
        dashed_line: 'count',
        underscore: 'count',
        stray_punctuation: 'count'
      )
      expect(counter.count).to eq(71)
    end

    it 'reverse engineers the Microsoft Word / wc (Unix) word count #002' do
      counter = described_class.new(text: gray_area_text)
      expect(counter.mword_count).to eq(71)
    end
  end

  context 'example sentences' do
    it 'String with common words (no edge cases) #001' do
      counter = described_class.new(text: 'This sentence contains nothing crazy.')
      expect(counter.count).to eq(5)
    end

    it 'String with a number #002' do
      counter = described_class.new(text: 'This sentence contains 1 number.')
      expect(counter.count).to eq(5)
    end

    it 'String with a date #003' do
      counter = described_class.new(text: 'Today is Monday, April 4th, 2011.')
      expect(counter.count).to eq(6)
    end

    it 'does not split on unicode chars' do
      counter = described_class.new(text: 'São Paulo')
      expect(counter.count).to eq(2)
    end

    it 'should not count HTML tags' do
      counter = described_class.new(text: "<a href=\"http://thefamousfox.com\">the brown fox</a> jumped over the lazy dog")
      expect(counter.count).to eq(8)
    end

    it 'should handle special characters' do
      counter = described_class.new(text: "the \"brown\" fox 'jumped' | over \\ the / lazy dog")
      expect(counter.count).to eq(8)
    end
  end
end