word_count_analyzer 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/lib/word_count_analyzer/ellipsis.rb +1 -1
- data/lib/word_count_analyzer/slash.rb +3 -0
- data/lib/word_count_analyzer/version.rb +1 -1
- data/spec/word_count_analyzer/counter_spec.rb +6 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b715b4c2304ae956644121693d584b0b37638845
|
4
|
+
data.tar.gz: 3b6d753132d5b9e511378ec06f990efa21fcc86b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 504f3e98b336ebcce4129b137e5293f3fbe38593fd935d55b56d9807f77c17a7e5ae955c4fba41fde8fe8a9e725232253038e5b912ff892eed8ece89f1819b34
|
7
|
+
data.tar.gz: 2ae63d08ea641d00f60f4d83f07e1a376cd51cdc632bafcf96a1c3e6b7b173c5e970625fa51749b879893e0386a202bebf799448703373cccd6c855bf8c56040
|
data/README.md
CHANGED
@@ -151,6 +151,8 @@ WordCountAnalyzer::Counter.new(
|
|
151
151
|
|
152
152
|
##### `date`
|
153
153
|
**default** = `'no_special_treatment'`
|
154
|
+
- `'no_special_treatment'`
|
155
|
+
Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
|
154
156
|
- `'count_as_one'`
|
155
157
|
Counts a date as one word. This is more commonly seen in translation CAT tools where a date is thought of as a *placeable* that can usually be automatically translated. Examples:
|
156
158
|
- Monday, April 4th, 2011 (1 word)
|
@@ -163,8 +165,6 @@ WordCountAnalyzer::Counter.new(
|
|
163
165
|
- 2003 November 9 (1 word)
|
164
166
|
- 2003-Nov-9 (1 word)
|
165
167
|
- and others...
|
166
|
-
- `'no_special_treatment'`
|
167
|
-
Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
|
168
168
|
|
169
169
|
<hr>
|
170
170
|
|
@@ -197,15 +197,15 @@ WordCountAnalyzer::Counter.new(
|
|
197
197
|
|
198
198
|
##### `forward_slash`
|
199
199
|
**default** = `'count_as_multiple_except_dates'`
|
200
|
-
- `'count_as_one'`
|
201
|
-
Counts any tokens that include a forward slash as one word. Example:
|
202
|
-
- she/he/it (1 word)
|
203
|
-
- `'count_as_multiple'`
|
204
|
-
Separates any tokens that include a forward slash at the slash(es) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
|
205
|
-
- she/he/it (3 words)
|
206
200
|
- `'count_as_multiple_except_dates'`
|
207
201
|
Separates any tokens that include a forward slash (except dates) at the slash(es) and counts each token individually. Example:
|
208
202
|
- she/he/it 4/25/2014 (4 words)
|
203
|
+
- `'count_as_multiple'`
|
204
|
+
Separates any tokens that include a forward slash at the slash(es) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
|
205
|
+
- she/he/it (3 words)
|
206
|
+
- `'count_as_one'`
|
207
|
+
Counts any tokens that include a forward slash as one word. Example:
|
208
|
+
- she/he/it (1 word)
|
209
209
|
|
210
210
|
<hr>
|
211
211
|
|
@@ -222,37 +222,37 @@ WordCountAnalyzer::Counter.new(
|
|
222
222
|
|
223
223
|
##### `dotted_line`
|
224
224
|
**default** = `'ignore'`
|
225
|
-
- `'count'`
|
226
|
-
Counts a dotted line as one word.
|
227
225
|
- `'ignore'`
|
228
226
|
Ignores any dotted lines in the string and does not count them towards the word count.
|
227
|
+
- `'count'`
|
228
|
+
Counts a dotted line as one word.
|
229
229
|
|
230
230
|
<hr>
|
231
231
|
|
232
232
|
##### `dashed_line`
|
233
233
|
**default** = `'ignore'`
|
234
|
+
- `'ignore'`
|
235
|
+
Ignores any dashed lines in the string and does not count them towards the word count.
|
234
236
|
- `'count'`
|
235
237
|
Counts a dashed line as one word.
|
236
|
-
- `'ignore'`
|
237
|
-
Ignores any dashed lines in the string and does not count them towards the word count.
|
238
238
|
|
239
239
|
<hr>
|
240
240
|
|
241
241
|
##### `underscore`
|
242
242
|
**default** = `'ignore'`
|
243
|
+
- `'ignore'`
|
244
|
+
Ignores any series of underscores in the string and does not count them towards the word count.
|
243
245
|
- `'count'`
|
244
246
|
Counts a series of underscores as one word.
|
245
|
-
- `'ignore'`
|
246
|
-
Ignores any series of underscores in the string and does not count them towards the word count.
|
247
247
|
|
248
248
|
<hr>
|
249
249
|
|
250
250
|
##### `stray_punctuation`
|
251
251
|
**default** = `'ignore'`
|
252
|
+
- `'ignore'`
|
253
|
+
Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count.
|
252
254
|
- `'count'`
|
253
255
|
Counts a punctuation mark surrounded on both sides by a whitespace as one word.
|
254
|
-
- `'ignore'`
|
255
|
-
Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count.
|
256
256
|
|
257
257
|
### Gray Area Details
|
258
258
|
|
data/lib/word_count_analyzer/ellipsis.rb
CHANGED
@@ -9,7 +9,7 @@ module WordCountAnalyzer
|
|
9
9
|
# Rubular: http://rubular.com/r/2VvZ8wRbd8
|
10
10
|
FOUR_SPACE_REGEX = /(?<=[a-z])(\.\s){3}\.(\z|$|\n)/
|
11
11
|
|
12
|
-
OTHER_THREE_PERIOD_REGEX = /[^\.]\.{3}([^\.]|$)/
|
12
|
+
OTHER_THREE_PERIOD_REGEX = /(?<=[^\.])\.{3}(?=([^\.]|$))/
|
13
13
|
|
14
14
|
UNICODE_ELLIPSIS = /(?<=[^…])…{1}(?=[^…])/
|
15
15
|
|
data/lib/word_count_analyzer/slash.rb
CHANGED
@@ -60,6 +60,7 @@ module WordCountAnalyzer
|
|
60
60
|
processed_string.gsub!(FORWARD_SLASH_REGEX).each do |match|
|
61
61
|
match.split(/\/+/).join(' ')
|
62
62
|
end
|
63
|
+
processed_string
|
63
64
|
end
|
64
65
|
|
65
66
|
def replace_forward_slashes_except_dates
|
@@ -68,6 +69,7 @@ module WordCountAnalyzer
|
|
68
69
|
except_date_string.gsub!(FORWARD_SLASH_REGEX).each do |match|
|
69
70
|
match.split(/\/+/).join(' ')
|
70
71
|
end
|
72
|
+
except_date_string
|
71
73
|
end
|
72
74
|
|
73
75
|
def backslash_occurences
|
@@ -79,6 +81,7 @@ module WordCountAnalyzer
|
|
79
81
|
processed_string.gsub!(BACKSLASH_REGEX).each do |match|
|
80
82
|
' word ' * match.split(/\\+/).length
|
81
83
|
end
|
84
|
+
processed_string
|
82
85
|
end
|
83
86
|
end
|
84
87
|
end
|
data/spec/word_count_analyzer/counter_spec.rb
CHANGED
@@ -553,6 +553,12 @@ RSpec.describe WordCountAnalyzer::Counter do
|
|
553
553
|
expect(ws.count).to eq(66)
|
554
554
|
end
|
555
555
|
|
556
|
+
it 'counts the words in a string #005' do
|
557
|
+
text = "Hello world... 11/22/2013"
|
558
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
559
|
+
expect(ws.count).to eq(3)
|
560
|
+
end
|
561
|
+
|
556
562
|
context 'Pages Word Count' do
|
557
563
|
it 'reverse engineers Pages word count #001' do
|
558
564
|
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|