word_count_analyzer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +554 -0
- data/Rakefile +2 -0
- data/lib/word_count_analyzer.rb +14 -0
- data/lib/word_count_analyzer/analyzer.rb +34 -0
- data/lib/word_count_analyzer/contraction.rb +176 -0
- data/lib/word_count_analyzer/counter.rb +230 -0
- data/lib/word_count_analyzer/date.rb +149 -0
- data/lib/word_count_analyzer/ellipsis.rb +48 -0
- data/lib/word_count_analyzer/hyperlink.rb +53 -0
- data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
- data/lib/word_count_analyzer/number.rb +23 -0
- data/lib/word_count_analyzer/numbered_list.rb +61 -0
- data/lib/word_count_analyzer/punctuation.rb +52 -0
- data/lib/word_count_analyzer/slash.rb +84 -0
- data/lib/word_count_analyzer/version.rb +3 -0
- data/lib/word_count_analyzer/xhtml.rb +26 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
- data/spec/word_count_analyzer/contraction_spec.rb +124 -0
- data/spec/word_count_analyzer/counter_spec.rb +647 -0
- data/spec/word_count_analyzer/date_spec.rb +257 -0
- data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
- data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
- data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
- data/spec/word_count_analyzer/number_spec.rb +63 -0
- data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
- data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
- data/spec/word_count_analyzer/slash_spec.rb +105 -0
- data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
- data/word_count_analyzer.gemspec +26 -0
- metadata +153 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
module WordCountAnalyzer
  # Locates XHTML-style tags in a string and reports how they affect a
  # whitespace-based word count.
  class Xhtml
    # Matches a single opening or closing tag: `<`, an optional `/`, any run
    # of characters other than `>`, then `>`.
    # Rubular: http://rubular.com/r/ENrVFMdJ8v
    XHTML_REGEX = /<\/?[^>]*>/

    # @return [String] the text under inspection
    attr_reader :string

    # @param string [String] the text to inspect for tags
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text contains at least one tag
    def includes_xhtml?
      # `=~` (rather than `match?`) keeps the original tolerance of a nil
      # string, for which this simply answers false.
      !(string =~ XHTML_REGEX).nil?
    end

    # @return [String] a copy of the text with every tag replaced by a
    #   single space
    def replace
      string.gsub(XHTML_REGEX, ' ')
    end

    # Difference between the number of whitespace-separated tokens in the
    # raw text and in the tag-stripped (and trimmed) text.
    #
    # @return [Integer]
    def count_difference_word_boundary
      whitespace = /\s+/
      raw_tokens = string.split(whitespace)
      stripped_tokens = replace.strip.split(whitespace)
      raw_tokens.size - stripped_tokens.size
    end

    # Number of tag pairs: the count of tag matches, integer-divided by two
    # (so a lone, unpaired tag rounds down).
    #
    # Tags are counted directly rather than by substituting a sentinel word
    # and counting that word, so text that happens to contain the sentinel
    # can no longer inflate the result.
    #
    # @return [Integer]
    def occurences
      string.scan(XHTML_REGEX).size / 2
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'word_count_analyzer'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe WordCountAnalyzer::Analyzer do
  context '#analysis' do
    # Sample text containing at least one instance of every gray area named
    # in the expected tally below.
    let(:text) do
      "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
    end

    # Expected number of occurrences per gray-area category.
    let(:expected_tally) do
      {
        "ellipsis" => 1,
        "hyperlink" => 2,
        "contraction" => 4,
        "hyphenated_word" => 2,
        "date" => 2,
        "number" => 1,
        "numbered_list" => 3,
        "xhtml" => 1,
        "forward_slash" => 1,
        "backslash" => 1,
        "dotted_line" => 1,
        "dashed_line" => 1,
        "underscore" => 1,
        "stray_punctuation" => 5
      }
    end

    it 'should analyze the gray areas' do
      analyzer = WordCountAnalyzer::Analyzer.new(text: text)
      expect(analyzer.analyze).to eq(expected_tally)
    end
  end
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe WordCountAnalyzer::Contraction do
  # A fresh tagger for each example.
  let(:tagger) { EngTagger.new }

  # Builds the object under test. Extra keyword options (e.g. `hyphen:`) are
  # forwarded only when an example supplies them.
  def build_contraction(token, following_token, **options)
    WordCountAnalyzer::Contraction.new(
      token: token,
      following_token: following_token,
      tgr: tagger,
      **options
    )
  end

  context '#contraction?' do
    it 'returns true if the token is a contraction' do
      contraction = build_contraction("when'd", nil, hyphen: nil)
      expect(contraction.contraction?).to eq(true)
    end

    it 'returns true if the token is an irregular contraction' do
      contraction = build_contraction("o'clock", nil, hyphen: nil)
      expect(contraction.contraction?).to eq(true)
    end

    it 'returns false if the token is a possesive and not a contraction' do
      contraction = build_contraction("Bob's", "car", hyphen: nil)
      expect(contraction.contraction?).to eq(false)
    end

    it 'returns true if the token is a contraction' do
      contraction = build_contraction("Bob's", "the", hyphen: nil)
      expect(contraction.contraction?).to eq(true)
    end

    it 'returns true if the token is a contraction' do
      contraction = build_contraction("Bob's", "open", hyphen: nil)
      expect(contraction.contraction?).to eq(true)
    end

    it 'returns true if the token is a contraction' do
      contraction = build_contraction("Don't", "count", hyphen: nil)
      expect(contraction.contraction?).to eq(true)
    end
  end

  context '#expanded_count' do
    it 'returns the count of the contraction expanded #001' do
      contraction = build_contraction("when'd", nil, hyphen: nil)
      expect(contraction.expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #002' do
      contraction = build_contraction("o'clock", nil, hyphen: nil)
      expect(contraction.expanded_count).to eq(3)
    end

    it 'returns the count of the contraction expanded #003' do
      contraction = build_contraction("Bob's", "car", hyphen: nil)
      expect(contraction.expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #004' do
      contraction = build_contraction("Bob's", "the", hyphen: nil)
      expect(contraction.expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #005' do
      contraction = build_contraction("cat-o'-nine-tails", nil, hyphen: 'count_as_one')
      expect(contraction.expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #006' do
      contraction = build_contraction("cat-o'-nine-tails", nil, hyphen: 'count_as_multiple')
      expect(contraction.expanded_count).to eq(4)
    end
  end

  context '#replace' do
    # These examples deliberately omit `hyphen:` so the class default applies.
    it 'replaces the token with the contraction expanded #001' do
      contraction = build_contraction("cat-o'-nine-tails", nil)
      expect(contraction.replace).to eq("cat-of-nine-tails")
    end

    it 'replaces the token with the contraction expanded #002' do
      contraction = build_contraction("Bob's", "the")
      expect(contraction.replace).to eq(" word word ")
    end

    it 'replaces the token with the contraction expanded #003' do
      contraction = build_contraction("don't", nil)
      expect(contraction.replace).to eq("do not")
    end

    it 'replaces the token with the contraction expanded #004' do
      contraction = build_contraction("hello", nil)
      expect(contraction.replace).to eq("hello")
    end
  end
end
|
@@ -0,0 +1,647 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe WordCountAnalyzer::Counter do
|
4
|
+
context 'ellipsis' do
|
5
|
+
it 'handles an invalid ellipsis argument value' do
|
6
|
+
text = 'hello world.'
|
7
|
+
ws = WordCountAnalyzer::Counter.new(text: text, ellipsis: 'hello')
|
8
|
+
expect { ws.count }.to raise_error('The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`')
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'ignores ellipses in the word count' do
|
12
|
+
text = 'hello world ... what day is it.'
|
13
|
+
ws = WordCountAnalyzer::Counter.new(
|
14
|
+
text: text,
|
15
|
+
ellipsis: 'ignore'
|
16
|
+
)
|
17
|
+
expect(ws.count).to eq(6)
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'does not ignore ellipses in the word count' do
|
21
|
+
text = 'hello world ... what day is it.'
|
22
|
+
ws = WordCountAnalyzer::Counter.new(
|
23
|
+
text: text,
|
24
|
+
ellipsis: 'no_special_treatment'
|
25
|
+
)
|
26
|
+
expect(ws.count).to eq(7)
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'does not ignore ellipses in the word count' do
|
30
|
+
text = 'hello world... what day is it.'
|
31
|
+
ws = WordCountAnalyzer::Counter.new(
|
32
|
+
text: text,
|
33
|
+
ellipsis: 'no_special_treatment'
|
34
|
+
)
|
35
|
+
expect(ws.count).to eq(6)
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'sets ignore as the default option' do
|
39
|
+
text = 'hello world ... what day is it.'
|
40
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
41
|
+
expect(ws.count).to eq(6)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
context 'hyperlink' do
|
46
|
+
it 'handles an invalid hyperlink argument value' do
|
47
|
+
text = 'hello world.'
|
48
|
+
ws = WordCountAnalyzer::Counter.new(text: text, hyperlink: 'hello')
|
49
|
+
expect { ws.count }.to raise_error('The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`')
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'counts a hyperlink as one word in the word count' do
|
53
|
+
text = 'The site address is http://www.example.com she said.'
|
54
|
+
ws = WordCountAnalyzer::Counter.new(
|
55
|
+
text: text,
|
56
|
+
hyperlink: 'count_as_one'
|
57
|
+
)
|
58
|
+
expect(ws.count).to eq(7)
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'counts a hyperlink as one word in the word count' do
|
62
|
+
text = 'The site address is http://www.example.com she said.'
|
63
|
+
ws = WordCountAnalyzer::Counter.new(
|
64
|
+
text: text,
|
65
|
+
hyperlink: 'split_at_period',
|
66
|
+
forward_slash: 'count_as_one'
|
67
|
+
)
|
68
|
+
expect(ws.count).to eq(9)
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'does not search for hyperlinks' do
|
72
|
+
text = 'The site address is http://www.example.com she said.'
|
73
|
+
ws = WordCountAnalyzer::Counter.new(
|
74
|
+
text: text,
|
75
|
+
hyperlink: 'no_special_treatment'
|
76
|
+
)
|
77
|
+
expect(ws.count).to eq(8)
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'sets count_as_one as the default option' do
|
81
|
+
text = 'The site address is http://www.example.com she said.'
|
82
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
83
|
+
expect(ws.count).to eq(7)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
context 'contraction' do
|
88
|
+
it 'handles an invalid contraction argument value' do
|
89
|
+
text = 'hello world.'
|
90
|
+
ws = WordCountAnalyzer::Counter.new(text: text, contraction: 'hello')
|
91
|
+
expect { ws.count }.to raise_error('The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'counts a contraction as one word in the word count' do
|
95
|
+
text = "Don't do that."
|
96
|
+
ws = WordCountAnalyzer::Counter.new(
|
97
|
+
text: text,
|
98
|
+
contraction: 'count_as_one'
|
99
|
+
)
|
100
|
+
expect(ws.count).to eq(3)
|
101
|
+
end
|
102
|
+
|
103
|
+
it 'splits a contraction into its separate words for the word count' do
|
104
|
+
text = "Don't do that."
|
105
|
+
ws = WordCountAnalyzer::Counter.new(
|
106
|
+
text: text,
|
107
|
+
contraction: 'count_as_multiple'
|
108
|
+
)
|
109
|
+
expect(ws.count).to eq(4)
|
110
|
+
end
|
111
|
+
|
112
|
+
it 'sets count_as_one as the default option' do
|
113
|
+
text = "Don't do that."
|
114
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
115
|
+
expect(ws.count).to eq(3)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
context 'hyphenated_word' do
|
120
|
+
it 'handles an invalid hyphenated_word argument value' do
|
121
|
+
text = 'hello world.'
|
122
|
+
ws = WordCountAnalyzer::Counter.new(text: text, hyphenated_word: 'hello')
|
123
|
+
expect { ws.count }.to raise_error('The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'counts a hyphenated word as one word in the word count' do
|
127
|
+
text = 'He has a devil-may-care attitude.'
|
128
|
+
ws = WordCountAnalyzer::Counter.new(
|
129
|
+
text: text,
|
130
|
+
hyphenated_word: 'count_as_one'
|
131
|
+
)
|
132
|
+
expect(ws.count).to eq(5)
|
133
|
+
end
|
134
|
+
|
135
|
+
it 'splits a hyphenated word into its separate words for the word count' do
|
136
|
+
text = 'He has a devil-may-care attitude.'
|
137
|
+
ws = WordCountAnalyzer::Counter.new(
|
138
|
+
text: text,
|
139
|
+
hyphenated_word: 'count_as_multiple'
|
140
|
+
)
|
141
|
+
expect(ws.count).to eq(7)
|
142
|
+
end
|
143
|
+
|
144
|
+
it 'sets count_as_one as the default option' do
|
145
|
+
text = 'He has a devil-may-care attitude.'
|
146
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
147
|
+
expect(ws.count).to eq(5)
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
context 'date' do
|
152
|
+
it 'handles an invalid date argument value' do
|
153
|
+
text = 'hello world.'
|
154
|
+
ws = WordCountAnalyzer::Counter.new(text: text, date: 'hello')
|
155
|
+
expect { ws.count }.to raise_error('The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`')
|
156
|
+
end
|
157
|
+
|
158
|
+
it 'ignores date placeables' do
|
159
|
+
text = 'Today is Tues. March 3rd, 2011.'
|
160
|
+
ws = WordCountAnalyzer::Counter.new(
|
161
|
+
text: text,
|
162
|
+
date: 'no_special_treatment'
|
163
|
+
)
|
164
|
+
expect(ws.count).to eq(6)
|
165
|
+
end
|
166
|
+
|
167
|
+
it 'counts a date placeable as one word in the word count' do
|
168
|
+
text = 'Today is Tues. March 3rd, 2011.'
|
169
|
+
ws = WordCountAnalyzer::Counter.new(
|
170
|
+
text: text,
|
171
|
+
date: 'count_as_one'
|
172
|
+
)
|
173
|
+
expect(ws.count).to eq(3)
|
174
|
+
end
|
175
|
+
|
176
|
+
it 'sets count_as_one as the default option' do
|
177
|
+
text = 'Today is Tues. March 3rd, 2011.'
|
178
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
179
|
+
expect(ws.count).to eq(6)
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
context 'number' do
|
184
|
+
it 'handles an invalid number argument value' do
|
185
|
+
text = 'hello world.'
|
186
|
+
ws = WordCountAnalyzer::Counter.new(text: text, number: 'hello')
|
187
|
+
expect { ws.count }.to raise_error('The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
|
188
|
+
end
|
189
|
+
|
190
|
+
it 'counts a number as a word' do
|
191
|
+
text = 'There is $300 in the safe. The password is 1234.'
|
192
|
+
ws = WordCountAnalyzer::Counter.new(
|
193
|
+
text: text,
|
194
|
+
number: 'count'
|
195
|
+
)
|
196
|
+
expect(ws.count).to eq(10)
|
197
|
+
end
|
198
|
+
|
199
|
+
it 'ignores numbers in the word count' do
|
200
|
+
text = 'There is $300 in the safe. The password is 1234.'
|
201
|
+
ws = WordCountAnalyzer::Counter.new(
|
202
|
+
text: text,
|
203
|
+
number: 'ignore'
|
204
|
+
)
|
205
|
+
expect(ws.count).to eq(8)
|
206
|
+
end
|
207
|
+
|
208
|
+
it 'sets count as the default option' do
|
209
|
+
text = 'There is $300 in the safe. The password is 1234.'
|
210
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
211
|
+
expect(ws.count).to eq(10)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
context 'number_list' do
|
216
|
+
it 'handles an invalid number argument value' do
|
217
|
+
text = 'hello world.'
|
218
|
+
ws = WordCountAnalyzer::Counter.new(text: text, numbered_list: 'hello')
|
219
|
+
expect { ws.count }.to raise_error('The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'counts a numbered list number as a word' do
|
223
|
+
text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c."
|
224
|
+
ws = WordCountAnalyzer::Counter.new(
|
225
|
+
text: text,
|
226
|
+
numbered_list: 'count'
|
227
|
+
)
|
228
|
+
expect(ws.count).to eq(17)
|
229
|
+
end
|
230
|
+
|
231
|
+
it 'ignores numbered list numbers' do
|
232
|
+
text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c."
|
233
|
+
ws = WordCountAnalyzer::Counter.new(
|
234
|
+
text: text,
|
235
|
+
numbered_list: 'ignore'
|
236
|
+
)
|
237
|
+
expect(ws.count).to eq(14)
|
238
|
+
end
|
239
|
+
|
240
|
+
it 'sets count as the default option' do
|
241
|
+
text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c."
|
242
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
243
|
+
expect(ws.count).to eq(17)
|
244
|
+
end
|
245
|
+
end
|
246
|
+
|
247
|
+
context 'xhtml' do
|
248
|
+
it 'handles an invalid number argument value' do
|
249
|
+
text = 'hello world.'
|
250
|
+
ws = WordCountAnalyzer::Counter.new(text: text, xhtml: 'hello')
|
251
|
+
expect { ws.count }.to raise_error('The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`')
|
252
|
+
end
|
253
|
+
|
254
|
+
it 'removes all xhtml from the text' do
|
255
|
+
text = "<span class='orange-text'>Hello world</span>"
|
256
|
+
ws = WordCountAnalyzer::Counter.new(
|
257
|
+
text: text,
|
258
|
+
xhtml: 'remove'
|
259
|
+
)
|
260
|
+
expect(ws.count).to eq(2)
|
261
|
+
end
|
262
|
+
|
263
|
+
it 'keeps xhtml in the text' do
|
264
|
+
text = "<span class='orange-text'>Hello world</span>"
|
265
|
+
ws = WordCountAnalyzer::Counter.new(
|
266
|
+
text: text,
|
267
|
+
xhtml: 'keep',
|
268
|
+
forward_slash: 'count_as_one'
|
269
|
+
)
|
270
|
+
expect(ws.count).to eq(3)
|
271
|
+
end
|
272
|
+
|
273
|
+
it 'keeps xhtml in the text' do
|
274
|
+
text = "<span class='orange-text'>Hello world</span>"
|
275
|
+
ws = WordCountAnalyzer::Counter.new(
|
276
|
+
text: text,
|
277
|
+
xhtml: 'keep'
|
278
|
+
)
|
279
|
+
expect(ws.count).to eq(4)
|
280
|
+
end
|
281
|
+
|
282
|
+
it 'sets remove as the default option' do
|
283
|
+
text = "<span class='orange-text'>Hello world</span>"
|
284
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
285
|
+
expect(ws.count).to eq(2)
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
context 'forward_slash' do
|
290
|
+
it 'handles an invalid number argument value' do
|
291
|
+
text = 'hello world.'
|
292
|
+
ws = WordCountAnalyzer::Counter.new(text: text, forward_slash: 'hello')
|
293
|
+
expect { ws.count }.to raise_error('The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`')
|
294
|
+
end
|
295
|
+
|
296
|
+
it 'counts a forward slash as multiple words (except dates) #001' do
|
297
|
+
text = "She/he/it said hello. 4/22/2013."
|
298
|
+
ws = WordCountAnalyzer::Counter.new(
|
299
|
+
text: text,
|
300
|
+
forward_slash: 'count_as_multiple_except_dates'
|
301
|
+
)
|
302
|
+
expect(ws.count).to eq(6)
|
303
|
+
end
|
304
|
+
|
305
|
+
it 'counts a forward slash as multiple words #002' do
|
306
|
+
text = "She/he/it said hello. 4/22/2013."
|
307
|
+
ws = WordCountAnalyzer::Counter.new(
|
308
|
+
text: text,
|
309
|
+
forward_slash: 'count_as_multiple'
|
310
|
+
)
|
311
|
+
expect(ws.count).to eq(8)
|
312
|
+
end
|
313
|
+
|
314
|
+
it 'counts a forward slash as multiple words #003' do
|
315
|
+
text = "She/he/it said hello. 4/22/2013."
|
316
|
+
ws = WordCountAnalyzer::Counter.new(
|
317
|
+
text: text,
|
318
|
+
forward_slash: 'count_as_multiple',
|
319
|
+
date: 'count_as_one'
|
320
|
+
)
|
321
|
+
expect(ws.count).to eq(6)
|
322
|
+
end
|
323
|
+
|
324
|
+
it 'counts a forward slash as one word' do
|
325
|
+
text = "She/he/it said hello."
|
326
|
+
ws = WordCountAnalyzer::Counter.new(
|
327
|
+
text: text,
|
328
|
+
forward_slash: 'count_as_one'
|
329
|
+
)
|
330
|
+
expect(ws.count).to eq(3)
|
331
|
+
end
|
332
|
+
|
333
|
+
it 'sets count_as_multiple_except_dates as the default option' do
|
334
|
+
text = "She/he/it said hello. 4/22/2013."
|
335
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
336
|
+
expect(ws.count).to eq(6)
|
337
|
+
end
|
338
|
+
end
|
339
|
+
|
340
|
+
context 'backslash' do
|
341
|
+
it 'handles an invalid number argument value' do
|
342
|
+
text = 'hello world.'
|
343
|
+
ws = WordCountAnalyzer::Counter.new(text: text, backslash: 'hello')
|
344
|
+
expect { ws.count }.to raise_error('The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
|
345
|
+
end
|
346
|
+
|
347
|
+
it 'counts a token with backslashes as one word' do
|
348
|
+
text = 'The file location is c:\Users\johndoe'
|
349
|
+
ws = WordCountAnalyzer::Counter.new(
|
350
|
+
text: text,
|
351
|
+
backslash: 'count_as_one'
|
352
|
+
)
|
353
|
+
expect(ws.count).to eq(5)
|
354
|
+
end
|
355
|
+
|
356
|
+
it 'counts a token with backslashes as multiple words' do
|
357
|
+
text = 'The file location is c:\Users\johndoe'
|
358
|
+
ws = WordCountAnalyzer::Counter.new(
|
359
|
+
text: text,
|
360
|
+
backslash: 'count_as_multiple'
|
361
|
+
)
|
362
|
+
expect(ws.count).to eq(7)
|
363
|
+
end
|
364
|
+
|
365
|
+
it 'sets count_as_one as the default option' do
|
366
|
+
text = 'The file location is c:\Users\johndoe'
|
367
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
368
|
+
expect(ws.count).to eq(5)
|
369
|
+
end
|
370
|
+
end
|
371
|
+
|
372
|
+
context 'dotted_line' do
|
373
|
+
it 'handles an invalid number argument value' do
|
374
|
+
text = 'hello world.'
|
375
|
+
ws = WordCountAnalyzer::Counter.new(text: text, dotted_line: 'hello')
|
376
|
+
expect { ws.count }.to raise_error('The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
|
377
|
+
end
|
378
|
+
|
379
|
+
it 'ignores continuous strings of dots in the word count' do
|
380
|
+
text = 'Here is one …………………………………………………………………… and another ......'
|
381
|
+
ws = WordCountAnalyzer::Counter.new(
|
382
|
+
text: text,
|
383
|
+
dotted_line: 'ignore'
|
384
|
+
)
|
385
|
+
expect(ws.count).to eq(5)
|
386
|
+
end
|
387
|
+
|
388
|
+
it 'counts a continuous string of dots as a word' do
|
389
|
+
text = 'Here is one …………………………………………………………………… and another ......'
|
390
|
+
ws = WordCountAnalyzer::Counter.new(
|
391
|
+
text: text,
|
392
|
+
dotted_line: 'count'
|
393
|
+
)
|
394
|
+
expect(ws.count).to eq(7)
|
395
|
+
end
|
396
|
+
|
397
|
+
it 'sets ignore as the default option' do
|
398
|
+
text = 'Here is one …………………………………………………………………… and another ......'
|
399
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
400
|
+
expect(ws.count).to eq(5)
|
401
|
+
end
|
402
|
+
end
|
403
|
+
|
404
|
+
context 'dashed_line' do
|
405
|
+
it 'handles an invalid number argument value' do
|
406
|
+
text = 'hello world.'
|
407
|
+
ws = WordCountAnalyzer::Counter.new(text: text, dashed_line: 'hello')
|
408
|
+
expect { ws.count }.to raise_error('The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
|
409
|
+
end
|
410
|
+
|
411
|
+
it 'ignores continuous strings of dashes in the word count' do
|
412
|
+
text = 'Here is one ----- and another -----'
|
413
|
+
ws = WordCountAnalyzer::Counter.new(
|
414
|
+
text: text,
|
415
|
+
dashed_line: 'ignore'
|
416
|
+
)
|
417
|
+
expect(ws.count).to eq(5)
|
418
|
+
end
|
419
|
+
|
420
|
+
it 'counts a continuous string of dashes as a word' do
|
421
|
+
text = 'Here is one ----- and another -----'
|
422
|
+
ws = WordCountAnalyzer::Counter.new(
|
423
|
+
text: text,
|
424
|
+
dashed_line: 'count'
|
425
|
+
)
|
426
|
+
expect(ws.count).to eq(7)
|
427
|
+
end
|
428
|
+
|
429
|
+
it 'sets ignore as the default option' do
|
430
|
+
text = 'Here is one ----- and another -----'
|
431
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
432
|
+
expect(ws.count).to eq(5)
|
433
|
+
end
|
434
|
+
end
|
435
|
+
|
436
|
+
context 'underscore' do
|
437
|
+
it 'handles an invalid number argument value' do
|
438
|
+
text = 'hello world.'
|
439
|
+
ws = WordCountAnalyzer::Counter.new(text: text, underscore: 'hello')
|
440
|
+
expect { ws.count }.to raise_error('The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
|
441
|
+
end
|
442
|
+
|
443
|
+
it 'ignores continuous strings of underscores in the word count' do
|
444
|
+
text = "Here is one ______ and another ______"
|
445
|
+
ws = WordCountAnalyzer::Counter.new(
|
446
|
+
text: text,
|
447
|
+
underscore: 'ignore'
|
448
|
+
)
|
449
|
+
expect(ws.count).to eq(5)
|
450
|
+
end
|
451
|
+
|
452
|
+
it 'counts a continuous string of underscores as a word' do
|
453
|
+
text = 'Here is one ______ and another ______'
|
454
|
+
ws = WordCountAnalyzer::Counter.new(
|
455
|
+
text: text,
|
456
|
+
underscore: 'count'
|
457
|
+
)
|
458
|
+
expect(ws.count).to eq(7)
|
459
|
+
end
|
460
|
+
|
461
|
+
it 'sets ignore as the default option' do
|
462
|
+
text = 'Here is one ______ and another ______'
|
463
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
464
|
+
expect(ws.count).to eq(5)
|
465
|
+
end
|
466
|
+
end
|
467
|
+
|
468
|
+
context 'stray_punctuation' do
|
469
|
+
it 'handles an invalid number argument value' do
|
470
|
+
text = 'hello world.'
|
471
|
+
ws = WordCountAnalyzer::Counter.new(text: text, stray_punctuation: 'hello')
|
472
|
+
expect { ws.count }.to raise_error('The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
|
473
|
+
end
|
474
|
+
|
475
|
+
it 'ignores continuous strings of underscores in the word count' do
|
476
|
+
text = 'Hello world ? This is another - sentence .'
|
477
|
+
ws = WordCountAnalyzer::Counter.new(
|
478
|
+
text: text,
|
479
|
+
stray_punctuation: 'ignore'
|
480
|
+
)
|
481
|
+
expect(ws.count).to eq(6)
|
482
|
+
end
|
483
|
+
|
484
|
+
it 'counts a continuous string of underscores as a word' do
|
485
|
+
text = 'Hello world ? This is another - sentence .'
|
486
|
+
ws = WordCountAnalyzer::Counter.new(
|
487
|
+
text: text,
|
488
|
+
stray_punctuation: 'count'
|
489
|
+
)
|
490
|
+
expect(ws.count).to eq(9)
|
491
|
+
end
|
492
|
+
|
493
|
+
it 'sets ignore as the default option' do
|
494
|
+
text = 'Hello world ? This is another - sentence .'
|
495
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
496
|
+
expect(ws.count).to eq(6)
|
497
|
+
end
|
498
|
+
end
|
499
|
+
|
500
|
+
it 'counts the words in a string #001' do
|
501
|
+
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
502
|
+
ws = WordCountAnalyzer::Counter.new(
|
503
|
+
text: text,
|
504
|
+
ellipsis: 'ignore',
|
505
|
+
hyperlink: 'count_as_one',
|
506
|
+
contraction: 'count_as_one',
|
507
|
+
hyphenated_word: 'count_as_one',
|
508
|
+
date: 'no_special_treatment',
|
509
|
+
number: 'count',
|
510
|
+
numbered_list: 'count',
|
511
|
+
xhtml: 'remove',
|
512
|
+
forward_slash: 'count_as_one',
|
513
|
+
backslash: 'count_as_one',
|
514
|
+
dotted_line: 'ignore',
|
515
|
+
dashed_line: 'ignore',
|
516
|
+
underscore: 'ignore',
|
517
|
+
stray_punctuation: 'ignore'
|
518
|
+
)
|
519
|
+
expect(ws.count).to eq(62)
|
520
|
+
end
|
521
|
+
|
522
|
+
it 'counts the words in a string #002' do
|
523
|
+
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
524
|
+
ws = WordCountAnalyzer::Counter.new(
|
525
|
+
text: text,
|
526
|
+
ellipsis: 'no_special_treatment',
|
527
|
+
hyperlink: 'no_special_treatment',
|
528
|
+
contraction: 'count_as_multiple',
|
529
|
+
hyphenated_word: 'count_as_multiple',
|
530
|
+
date: 'count_as_one',
|
531
|
+
number: 'ignore',
|
532
|
+
numbered_list: 'ignore',
|
533
|
+
xhtml: 'keep',
|
534
|
+
forward_slash: 'count_as_multiple',
|
535
|
+
backslash: 'count_as_multiple',
|
536
|
+
dotted_line: 'count',
|
537
|
+
dashed_line: 'count',
|
538
|
+
underscore: 'count',
|
539
|
+
stray_punctuation: 'count'
|
540
|
+
)
|
541
|
+
expect(ws.count).to eq(77)
|
542
|
+
end
|
543
|
+
|
544
|
+
it 'counts the words in a string #003' do
|
545
|
+
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
546
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
547
|
+
expect(ws.count).to eq(64)
|
548
|
+
end
|
549
|
+
|
550
|
+
# Same sample text as the neighbouring examples, with forward_slash set to
# 'count_as_multiple' as the only explicit option.
it 'counts the words in a string #004' do
  sample = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
  counter = WordCountAnalyzer::Counter.new(
    text: sample,
    forward_slash: 'count_as_multiple'
  )

  expect(counter.count).to eq(66)
end
|
555
|
+
|
556
|
+
# Two examples over the same sample text: #001 passes an explicit option set
# and checks #count, #002 passes only the text and checks #pages_count.
# Both expect 79.
context 'Pages Word Count' do
  let(:text) do
    "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
  end

  it 'reverse engineers Pages word count #001' do
    pages_settings = {
      ellipsis: 'no_special_treatment',
      hyperlink: 'split_at_period',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_multiple',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'keep',
      forward_slash: 'count_as_multiple',
      backslash: 'count_as_multiple',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    }
    counter = WordCountAnalyzer::Counter.new(text: text, **pages_settings)

    expect(counter.count).to eq(79)
  end

  it 'reverse engineers Pages word count #002' do
    counter = WordCountAnalyzer::Counter.new(text: text)

    expect(counter.pages_count).to eq(79)
  end
end
|
585
|
+
|
586
|
+
# Two examples over the same sample text: #001 passes an explicit option set
# and checks #count, #002 passes only the text and checks #mword_count.
# Both expect 71.
context 'Microsoft Word Count' do
  let(:text) do
    "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
  end

  it 'reverse engineers the Microsoft Word / wc (Unix) word count #001' do
    mword_settings = {
      ellipsis: 'no_special_treatment',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'keep',
      forward_slash: 'count_as_one',
      backslash: 'count_as_one',
      dotted_line: 'count',
      dashed_line: 'count',
      underscore: 'count',
      stray_punctuation: 'count'
    }
    counter = WordCountAnalyzer::Counter.new(text: text, **mword_settings)

    expect(counter.count).to eq(71)
  end

  it 'reverse engineers the Microsoft Word / wc (Unix) word count #002' do
    counter = WordCountAnalyzer::Counter.new(text: text)

    expect(counter.mword_count).to eq(71)
  end
end
|
615
|
+
|
616
|
+
# Short sample sentences, each checked against an expected word count.
# Every example passes only the text to Counter.new.
context 'example sentences' do
  # Builds a counter for +sentence+ and returns the result of #count.
  def count_for(sentence)
    WordCountAnalyzer::Counter.new(text: sentence).count
  end

  it 'String with common words (no edge cases) #001' do
    expect(count_for('This sentence contains nothing crazy.')).to eq(5)
  end

  it 'String with a number #002' do
    expect(count_for('This sentence contains 1 number.')).to eq(5)
  end

  it 'String with a date #003' do
    expect(count_for('Today is Monday, April 4th, 2011.')).to eq(6)
  end

  it 'does not split on unicode chars' do
    expect(count_for('São Paulo')).to eq(2)
  end

  it 'should not count HTML tags' do
    markup = "<a href=\"http://thefamousfox.com\">the brown fox</a> jumped over the lazy dog"
    expect(count_for(markup)).to eq(8)
  end

  it 'should handle special characters' do
    noisy = "the \"brown\" fox 'jumped' | over \\ the / lazy dog"
    expect(count_for(noisy)).to eq(8)
  end
end
|
647
|
+
end
|