word_count_analyzer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +554 -0
  8. data/Rakefile +2 -0
  9. data/lib/word_count_analyzer.rb +14 -0
  10. data/lib/word_count_analyzer/analyzer.rb +34 -0
  11. data/lib/word_count_analyzer/contraction.rb +176 -0
  12. data/lib/word_count_analyzer/counter.rb +230 -0
  13. data/lib/word_count_analyzer/date.rb +149 -0
  14. data/lib/word_count_analyzer/ellipsis.rb +48 -0
  15. data/lib/word_count_analyzer/hyperlink.rb +53 -0
  16. data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
  17. data/lib/word_count_analyzer/number.rb +23 -0
  18. data/lib/word_count_analyzer/numbered_list.rb +61 -0
  19. data/lib/word_count_analyzer/punctuation.rb +52 -0
  20. data/lib/word_count_analyzer/slash.rb +84 -0
  21. data/lib/word_count_analyzer/version.rb +3 -0
  22. data/lib/word_count_analyzer/xhtml.rb +26 -0
  23. data/spec/spec_helper.rb +1 -0
  24. data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
  25. data/spec/word_count_analyzer/contraction_spec.rb +124 -0
  26. data/spec/word_count_analyzer/counter_spec.rb +647 -0
  27. data/spec/word_count_analyzer/date_spec.rb +257 -0
  28. data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
  29. data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
  30. data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
  31. data/spec/word_count_analyzer/number_spec.rb +63 -0
  32. data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
  33. data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
  34. data/spec/word_count_analyzer/slash_spec.rb +105 -0
  35. data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
  36. data/word_count_analyzer.gemspec +26 -0
  37. metadata +153 -0
@@ -0,0 +1,3 @@
1
# Top-level namespace of the word_count_analyzer gem.
module WordCountAnalyzer
  # Release version of the gem. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,26 @@
1
module WordCountAnalyzer
  # Detects, strips and counts XHTML/HTML tags inside a piece of text.
  class Xhtml
    # Matches one tag: `<`, an optional `/`, then everything up to the next `>`.
    # Rubular: http://rubular.com/r/ENrVFMdJ8v
    XHTML_REGEX = /<\/?[^>]*>/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text contains at least one tag
    def includes_xhtml?
      !(string =~ XHTML_REGEX).nil?
    end

    # @return [String] the text with every tag replaced by a single space
    def replace
      string.gsub(XHTML_REGEX, ' ')
    end

    # Difference between the number of whitespace-separated tokens in the raw
    # text and in the tag-free text.
    #
    # Both sides are stripped before splitting: splitting on a regex keeps a
    # leading empty field, so leading whitespace in the raw text would
    # otherwise be counted as an extra token.
    #
    # @return [Integer]
    def count_difference_word_boundary
      string.strip.split(/\s+/).size - replace.strip.split(/\s+/).size
    end

    # Number of tag pairs: the total number of tag matches, integer-divided
    # by two (an odd, unpaired tag is rounded down).
    #
    # Tags are counted directly rather than by injecting a sentinel word and
    # counting that, so text that happens to contain the sentinel cannot
    # inflate the result.
    #
    # @return [Integer]
    def occurences
      string.scan(XHTML_REGEX).size / 2
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'word_count_analyzer'
@@ -0,0 +1,11 @@
1
require 'spec_helper'

# Spec for WordCountAnalyzer::Analyzer#analyze, which reports how many times
# each "gray area" occurs in a text as a Hash keyed by gray-area name.
RSpec.describe WordCountAnalyzer::Analyzer do
  describe '#analyze' do
    # Sample text for which every key of the expected Hash below is non-zero.
    let(:text) do
      "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
    end

    let(:expected_analysis) do
      {
        "ellipsis" => 1,
        "hyperlink" => 2,
        "contraction" => 4,
        "hyphenated_word" => 2,
        "date" => 2,
        "number" => 1,
        "numbered_list" => 3,
        "xhtml" => 1,
        "forward_slash" => 1,
        "backslash" => 1,
        "dotted_line" => 1,
        "dashed_line" => 1,
        "underscore" => 1,
        "stray_punctuation" => 5
      }
    end

    it 'analyzes the gray areas' do
      expect(described_class.new(text: text).analyze).to eq(expected_analysis)
    end
  end
end
@@ -0,0 +1,124 @@
1
require 'spec_helper'

# Spec for WordCountAnalyzer::Contraction: contraction detection, the word
# count of the expanded form, and replacement with the expanded form.
RSpec.describe WordCountAnalyzer::Contraction do
  # Every example builds its Contraction with a fresh tagger.
  let(:tgr) { EngTagger.new }

  # Builds the object under test. Extra keyword options (e.g. `hyphen:`) are
  # forwarded only when the example supplies them, so examples that omit
  # `hyphen:` exercise the constructor without that argument.
  def contraction_for(token, following_token, **options)
    described_class.new(token: token, following_token: following_token, tgr: tgr, **options)
  end

  describe '#contraction?' do
    it 'returns true if the token is a contraction' do
      expect(contraction_for("when'd", nil, hyphen: nil).contraction?).to eq(true)
    end

    it 'returns true if the token is an irregular contraction' do
      expect(contraction_for("o'clock", nil, hyphen: nil).contraction?).to eq(true)
    end

    it 'returns false if the token is a possessive and not a contraction' do
      expect(contraction_for("Bob's", "car", hyphen: nil).contraction?).to eq(false)
    end

    it %q(returns true for "Bob's" followed by "the") do
      expect(contraction_for("Bob's", "the", hyphen: nil).contraction?).to eq(true)
    end

    it %q(returns true for "Bob's" followed by "open") do
      expect(contraction_for("Bob's", "open", hyphen: nil).contraction?).to eq(true)
    end

    it %q(returns true for "Don't" followed by "count") do
      expect(contraction_for("Don't", "count", hyphen: nil).contraction?).to eq(true)
    end
  end

  describe '#expanded_count' do
    it 'returns the count of the contraction expanded #001' do
      expect(contraction_for("when'd", nil, hyphen: nil).expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #002' do
      expect(contraction_for("o'clock", nil, hyphen: nil).expanded_count).to eq(3)
    end

    it 'returns the count of the contraction expanded #003' do
      expect(contraction_for("Bob's", "car", hyphen: nil).expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #004' do
      expect(contraction_for("Bob's", "the", hyphen: nil).expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #005' do
      ws = contraction_for("cat-o'-nine-tails", nil, hyphen: 'count_as_one')
      expect(ws.expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #006' do
      ws = contraction_for("cat-o'-nine-tails", nil, hyphen: 'count_as_multiple')
      expect(ws.expanded_count).to eq(4)
    end
  end

  describe '#replace' do
    it 'replaces the token with the contraction expanded #001' do
      expect(contraction_for("cat-o'-nine-tails", nil).replace).to eq("cat-of-nine-tails")
    end

    it 'replaces the token with the contraction expanded #002' do
      expect(contraction_for("Bob's", "the").replace).to eq(" word word ")
    end

    it 'replaces the token with the contraction expanded #003' do
      expect(contraction_for("don't", nil).replace).to eq("do not")
    end

    it 'replaces the token with the contraction expanded #004' do
      expect(contraction_for("hello", nil).replace).to eq("hello")
    end
  end
end
@@ -0,0 +1,647 @@
1
require 'spec_helper'

# Spec for WordCountAnalyzer::Counter. Each option of the counter gets its own
# context covering: rejection of an invalid value, every accepted value, and
# the default. The remaining examples combine options on one shared paragraph.
RSpec.describe WordCountAnalyzer::Counter do
  # Paragraph shared by the combined-option, Pages and Microsoft Word examples.
  let(:gray_area_text) do
    "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
  end

  # Word count of `text` with the given counter options (none => defaults).
  def count_for(text, **options)
    described_class.new(text: text, **options).count
  end

  # Builds a counter with the value 'hello' for `option` and expects #count
  # to raise with exactly `message`. The counter is constructed outside the
  # expectation block, so only #count is allowed to raise.
  def expect_invalid_option(option, message)
    ws = described_class.new(text: 'hello world.', **{ option => 'hello' })
    expect { ws.count }.to raise_error(message)
  end

  context 'ellipsis' do
    let(:text) { 'hello world ... what day is it.' }

    it 'handles an invalid ellipsis argument value' do
      expect_invalid_option(:ellipsis, 'The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`')
    end

    it 'ignores ellipses in the word count' do
      expect(count_for(text, ellipsis: 'ignore')).to eq(6)
    end

    it 'does not ignore ellipses in the word count' do
      expect(count_for(text, ellipsis: 'no_special_treatment')).to eq(7)
    end

    it 'does not ignore an ellipsis attached to a word in the word count' do
      expect(count_for('hello world... what day is it.', ellipsis: 'no_special_treatment')).to eq(6)
    end

    it 'sets ignore as the default option' do
      expect(count_for(text)).to eq(6)
    end
  end

  context 'hyperlink' do
    let(:text) { 'The site address is http://www.example.com she said.' }

    it 'handles an invalid hyperlink argument value' do
      expect_invalid_option(:hyperlink, 'The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`')
    end

    it 'counts a hyperlink as one word in the word count' do
      expect(count_for(text, hyperlink: 'count_as_one')).to eq(7)
    end

    it 'splits a hyperlink at its periods for the word count' do
      expect(count_for(text, hyperlink: 'split_at_period', forward_slash: 'count_as_one')).to eq(9)
    end

    it 'does not search for hyperlinks' do
      expect(count_for(text, hyperlink: 'no_special_treatment')).to eq(8)
    end

    it 'sets count_as_one as the default option' do
      expect(count_for(text)).to eq(7)
    end
  end

  context 'contraction' do
    let(:text) { "Don't do that." }

    it 'handles an invalid contraction argument value' do
      expect_invalid_option(:contraction, 'The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
    end

    it 'counts a contraction as one word in the word count' do
      expect(count_for(text, contraction: 'count_as_one')).to eq(3)
    end

    it 'splits a contraction into its separate words for the word count' do
      expect(count_for(text, contraction: 'count_as_multiple')).to eq(4)
    end

    it 'sets count_as_one as the default option' do
      expect(count_for(text)).to eq(3)
    end
  end

  context 'hyphenated_word' do
    let(:text) { 'He has a devil-may-care attitude.' }

    it 'handles an invalid hyphenated_word argument value' do
      expect_invalid_option(:hyphenated_word, 'The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
    end

    it 'counts a hyphenated word as one word in the word count' do
      expect(count_for(text, hyphenated_word: 'count_as_one')).to eq(5)
    end

    it 'splits a hyphenated word into its separate words for the word count' do
      expect(count_for(text, hyphenated_word: 'count_as_multiple')).to eq(7)
    end

    it 'sets count_as_one as the default option' do
      expect(count_for(text)).to eq(5)
    end
  end

  context 'date' do
    let(:text) { 'Today is Tues. March 3rd, 2011.' }

    it 'handles an invalid date argument value' do
      expect_invalid_option(:date, 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`')
    end

    it 'ignores date placeables' do
      expect(count_for(text, date: 'no_special_treatment')).to eq(6)
    end

    it 'counts a date placeable as one word in the word count' do
      expect(count_for(text, date: 'count_as_one')).to eq(3)
    end

    it 'sets no_special_treatment as the default option' do
      expect(count_for(text)).to eq(6)
    end
  end

  context 'number' do
    let(:text) { 'There is $300 in the safe. The password is 1234.' }

    it 'handles an invalid number argument value' do
      expect_invalid_option(:number, 'The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
    end

    it 'counts a number as a word' do
      expect(count_for(text, number: 'count')).to eq(10)
    end

    it 'ignores numbers in the word count' do
      expect(count_for(text, number: 'ignore')).to eq(8)
    end

    it 'sets count as the default option' do
      expect(count_for(text)).to eq(10)
    end
  end

  context 'numbered_list' do
    let(:text) { "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c." }

    it 'handles an invalid numbered_list argument value' do
      expect_invalid_option(:numbered_list, 'The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
    end

    it 'counts a numbered list number as a word' do
      expect(count_for(text, numbered_list: 'count')).to eq(17)
    end

    it 'ignores numbered list numbers' do
      expect(count_for(text, numbered_list: 'ignore')).to eq(14)
    end

    it 'sets count as the default option' do
      expect(count_for(text)).to eq(17)
    end
  end

  context 'xhtml' do
    let(:text) { "<span class='orange-text'>Hello world</span>" }

    it 'handles an invalid xhtml argument value' do
      expect_invalid_option(:xhtml, 'The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`')
    end

    it 'removes all xhtml from the text' do
      expect(count_for(text, xhtml: 'remove')).to eq(2)
    end

    it 'keeps xhtml in the text (forward slashes counted as one word)' do
      expect(count_for(text, xhtml: 'keep', forward_slash: 'count_as_one')).to eq(3)
    end

    it 'keeps xhtml in the text (default forward slash handling)' do
      expect(count_for(text, xhtml: 'keep')).to eq(4)
    end

    it 'sets remove as the default option' do
      expect(count_for(text)).to eq(2)
    end
  end

  context 'forward_slash' do
    let(:text) { "She/he/it said hello. 4/22/2013." }

    it 'handles an invalid forward_slash argument value' do
      expect_invalid_option(:forward_slash, 'The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`')
    end

    it 'counts a forward slash as multiple words (except dates) #001' do
      expect(count_for(text, forward_slash: 'count_as_multiple_except_dates')).to eq(6)
    end

    it 'counts a forward slash as multiple words #002' do
      expect(count_for(text, forward_slash: 'count_as_multiple')).to eq(8)
    end

    it 'counts a forward slash as multiple words #003' do
      expect(count_for(text, forward_slash: 'count_as_multiple', date: 'count_as_one')).to eq(6)
    end

    it 'counts a forward slash as one word' do
      expect(count_for("She/he/it said hello.", forward_slash: 'count_as_one')).to eq(3)
    end

    it 'sets count_as_multiple_except_dates as the default option' do
      expect(count_for(text)).to eq(6)
    end
  end

  context 'backslash' do
    let(:text) { 'The file location is c:\Users\johndoe' }

    it 'handles an invalid backslash argument value' do
      expect_invalid_option(:backslash, 'The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
    end

    it 'counts a token with backslashes as one word' do
      expect(count_for(text, backslash: 'count_as_one')).to eq(5)
    end

    it 'counts a token with backslashes as multiple words' do
      expect(count_for(text, backslash: 'count_as_multiple')).to eq(7)
    end

    it 'sets count_as_one as the default option' do
      expect(count_for(text)).to eq(5)
    end
  end

  context 'dotted_line' do
    let(:text) { 'Here is one …………………………………………………………………… and another ......' }

    it 'handles an invalid dotted_line argument value' do
      expect_invalid_option(:dotted_line, 'The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores continuous strings of dots in the word count' do
      expect(count_for(text, dotted_line: 'ignore')).to eq(5)
    end

    it 'counts a continuous string of dots as a word' do
      expect(count_for(text, dotted_line: 'count')).to eq(7)
    end

    it 'sets ignore as the default option' do
      expect(count_for(text)).to eq(5)
    end
  end

  context 'dashed_line' do
    let(:text) { 'Here is one ----- and another -----' }

    it 'handles an invalid dashed_line argument value' do
      expect_invalid_option(:dashed_line, 'The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores continuous strings of dashes in the word count' do
      expect(count_for(text, dashed_line: 'ignore')).to eq(5)
    end

    it 'counts a continuous string of dashes as a word' do
      expect(count_for(text, dashed_line: 'count')).to eq(7)
    end

    it 'sets ignore as the default option' do
      expect(count_for(text)).to eq(5)
    end
  end

  context 'underscore' do
    let(:text) { 'Here is one ______ and another ______' }

    it 'handles an invalid underscore argument value' do
      expect_invalid_option(:underscore, 'The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores continuous strings of underscores in the word count' do
      expect(count_for(text, underscore: 'ignore')).to eq(5)
    end

    it 'counts a continuous string of underscores as a word' do
      expect(count_for(text, underscore: 'count')).to eq(7)
    end

    it 'sets ignore as the default option' do
      expect(count_for(text)).to eq(5)
    end
  end

  context 'stray_punctuation' do
    let(:text) { 'Hello world ? This is another - sentence .' }

    it 'handles an invalid stray_punctuation argument value' do
      expect_invalid_option(:stray_punctuation, 'The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
    end

    it 'ignores stray punctuation in the word count' do
      expect(count_for(text, stray_punctuation: 'ignore')).to eq(6)
    end

    it 'counts each stray punctuation mark as a word' do
      expect(count_for(text, stray_punctuation: 'count')).to eq(9)
    end

    it 'sets ignore as the default option' do
      expect(count_for(text)).to eq(6)
    end
  end

  it 'counts the words in a string #001' do
    count = count_for(
      gray_area_text,
      ellipsis: 'ignore',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'remove',
      forward_slash: 'count_as_one',
      backslash: 'count_as_one',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    )
    expect(count).to eq(62)
  end

  it 'counts the words in a string #002' do
    count = count_for(
      gray_area_text,
      ellipsis: 'no_special_treatment',
      hyperlink: 'no_special_treatment',
      contraction: 'count_as_multiple',
      hyphenated_word: 'count_as_multiple',
      date: 'count_as_one',
      number: 'ignore',
      numbered_list: 'ignore',
      xhtml: 'keep',
      forward_slash: 'count_as_multiple',
      backslash: 'count_as_multiple',
      dotted_line: 'count',
      dashed_line: 'count',
      underscore: 'count',
      stray_punctuation: 'count'
    )
    expect(count).to eq(77)
  end

  it 'counts the words in a string #003' do
    expect(count_for(gray_area_text)).to eq(64)
  end

  it 'counts the words in a string #004' do
    expect(count_for(gray_area_text, forward_slash: 'count_as_multiple')).to eq(66)
  end

  context 'Pages Word Count' do
    it 'reverse engineers Pages word count #001' do
      count = count_for(
        gray_area_text,
        ellipsis: 'no_special_treatment',
        hyperlink: 'split_at_period',
        contraction: 'count_as_one',
        hyphenated_word: 'count_as_multiple',
        date: 'no_special_treatment',
        number: 'count',
        numbered_list: 'count',
        xhtml: 'keep',
        forward_slash: 'count_as_multiple',
        backslash: 'count_as_multiple',
        dotted_line: 'ignore',
        dashed_line: 'ignore',
        underscore: 'ignore',
        stray_punctuation: 'ignore'
      )
      expect(count).to eq(79)
    end

    it 'reverse engineers Pages word count #002' do
      expect(described_class.new(text: gray_area_text).pages_count).to eq(79)
    end
  end

  context 'Microsoft Word Count' do
    it 'reverse engineers the Microsoft Word / wc (Unix) word count #001' do
      count = count_for(
        gray_area_text,
        ellipsis: 'no_special_treatment',
        hyperlink: 'count_as_one',
        contraction: 'count_as_one',
        hyphenated_word: 'count_as_one',
        date: 'no_special_treatment',
        number: 'count',
        numbered_list: 'count',
        xhtml: 'keep',
        forward_slash: 'count_as_one',
        backslash: 'count_as_one',
        dotted_line: 'count',
        dashed_line: 'count',
        underscore: 'count',
        stray_punctuation: 'count'
      )
      expect(count).to eq(71)
    end

    it 'reverse engineers the Microsoft Word / wc (Unix) word count #002' do
      expect(described_class.new(text: gray_area_text).mword_count).to eq(71)
    end
  end

  context 'example sentences' do
    it 'String with common words (no edge cases) #001' do
      expect(count_for('This sentence contains nothing crazy.')).to eq(5)
    end

    it 'String with a number #002' do
      expect(count_for('This sentence contains 1 number.')).to eq(5)
    end

    it 'String with a date #003' do
      expect(count_for('Today is Monday, April 4th, 2011.')).to eq(6)
    end

    it 'does not split on unicode chars' do
      expect(count_for('São Paulo')).to eq(2)
    end

    it 'should not count HTML tags' do
      expect(count_for("<a href=\"http://thefamousfox.com\">the brown fox</a> jumped over the lazy dog")).to eq(8)
    end

    it 'should handle special characters' do
      expect(count_for("the \"brown\" fox 'jumped' | over \\ the / lazy dog")).to eq(8)
    end
  end
end