word_count_analyzer 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/lib/word_count_analyzer/ellipsis.rb +1 -1
- data/lib/word_count_analyzer/slash.rb +3 -0
- data/lib/word_count_analyzer/version.rb +1 -1
- data/spec/word_count_analyzer/counter_spec.rb +6 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b715b4c2304ae956644121693d584b0b37638845
|
4
|
+
data.tar.gz: 3b6d753132d5b9e511378ec06f990efa21fcc86b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 504f3e98b336ebcce4129b137e5293f3fbe38593fd935d55b56d9807f77c17a7e5ae955c4fba41fde8fe8a9e725232253038e5b912ff892eed8ece89f1819b34
|
7
|
+
data.tar.gz: 2ae63d08ea641d00f60f4d83f07e1a376cd51cdc632bafcf96a1c3e6b7b173c5e970625fa51749b879893e0386a202bebf799448703373cccd6c855bf8c56040
|
data/README.md
CHANGED
@@ -151,6 +151,8 @@ WordCountAnalyzer::Counter.new(
|
|
151
151
|
|
152
152
|
##### `date`
|
153
153
|
**default** = `'no_special_treatment'`
|
154
|
+
- `'no_special_treatment'`
|
155
|
+
Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
|
154
156
|
- `'count_as_one'`
|
155
157
|
Counts a date as one word. This is more commonly seen in translation CAT tools where a date is thought of as a *placeable* that can usually be automatically translated. Examples:
|
156
158
|
- Monday, April 4th, 2011 (1 word)
|
@@ -163,8 +165,6 @@ WordCountAnalyzer::Counter.new(
|
|
163
165
|
- 2003 November 9 (1 word)
|
164
166
|
- 2003-Nov-9 (1 word)
|
165
167
|
- and others...
|
166
|
-
- `'no_special_treatment'`
|
167
|
-
Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
|
168
168
|
|
169
169
|
<hr>
|
170
170
|
|
@@ -197,15 +197,15 @@ WordCountAnalyzer::Counter.new(
|
|
197
197
|
|
198
198
|
##### `forward_slash`
|
199
199
|
**default** = `'count_as_multiple_except_dates'`
|
200
|
-
- `'count_as_one'`
|
201
|
-
Counts any tokens that include a forward slash as one word. Example:
|
202
|
-
- she/he/it (1 word)
|
203
|
-
- `'count_as_multiple'`
|
204
|
-
Separates any tokens that include a forward slash at the slash(s) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
|
205
|
-
- she/he/it (3 words)
|
206
200
|
- `'count_as_multiple_except_dates'`
|
207
201
|
Separates any tokens that include a forward slash (except dates) at the slash(s) and counts each token individually. Example:
|
208
202
|
- she/he/it 4/25/2014 (4 words)
|
203
|
+
- `'count_as_multiple'`
|
204
|
+
Separates any tokens that include a forward slash at the slash(s) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
|
205
|
+
- she/he/it (3 words)
|
206
|
+
- `'count_as_one'`
|
207
|
+
Counts any tokens that include a forward slash as one word. Example:
|
208
|
+
- she/he/it (1 word)
|
209
209
|
|
210
210
|
<hr>
|
211
211
|
|
@@ -222,37 +222,37 @@ WordCountAnalyzer::Counter.new(
|
|
222
222
|
|
223
223
|
##### `dotted_line`
|
224
224
|
**default** = `'ignore'`
|
225
|
-
- `'count'`
|
226
|
-
Counts a dotted line as one word.
|
227
225
|
- `'ignore'`
|
228
226
|
Ignores any dotted lines in the string and does not count them towards the word count.
|
227
|
+
- `'count'`
|
228
|
+
Counts a dotted line as one word.
|
229
229
|
|
230
230
|
<hr>
|
231
231
|
|
232
232
|
##### `dashed_line`
|
233
233
|
**default** = `'ignore'`
|
234
|
+
- `'ignore'`
|
235
|
+
Ignores any dashed lines in the string and does not count them towards the word count.
|
234
236
|
- `'count'`
|
235
237
|
Counts a dashed line as one word.
|
236
|
-
- `'ignore'`
|
237
|
-
Ignores any dashed lines in the string and does not count them towards the word count.
|
238
238
|
|
239
239
|
<hr>
|
240
240
|
|
241
241
|
##### `underscore`
|
242
242
|
**default** = `'ignore'`
|
243
|
+
- `'ignore'`
|
244
|
+
Ignores any series of underscores in the string and does not count them towards the word count.
|
243
245
|
- `'count'`
|
244
246
|
Counts a series of underscores as one word.
|
245
|
-
- `'ignore'`
|
246
|
-
Ignores any series of underscores in the string and does not count them towards the word count.
|
247
247
|
|
248
248
|
<hr>
|
249
249
|
|
250
250
|
##### `stray_punctuation`
|
251
251
|
**default** = `'ignore'`
|
252
|
+
- `'ignore'`
|
253
|
+
Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count.
|
252
254
|
- `'count'`
|
253
255
|
Counts a punctuation mark surrounded on both sides by a whitespace as one word.
|
254
|
-
- `'ignore'`
|
255
|
-
Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count.
|
256
256
|
|
257
257
|
### Gray Area Details
|
258
258
|
|
@@ -9,7 +9,7 @@ module WordCountAnalyzer
|
|
9
9
|
# Rubular: http://rubular.com/r/2VvZ8wRbd8
|
10
10
|
FOUR_SPACE_REGEX = /(?<=[a-z])(\.\s){3}\.(\z|$|\n)/
|
11
11
|
|
12
|
-
OTHER_THREE_PERIOD_REGEX = /[^\.]\.{3}([^\.]|$)/
|
12
|
+
OTHER_THREE_PERIOD_REGEX = /(?<=[^\.])\.{3}(?=([^\.]|$))/
|
13
13
|
|
14
14
|
UNICODE_ELLIPSIS = /(?<=[^…])…{1}(?=[^…])/
|
15
15
|
|
@@ -60,6 +60,7 @@ module WordCountAnalyzer
|
|
60
60
|
processed_string.gsub!(FORWARD_SLASH_REGEX).each do |match|
|
61
61
|
match.split(/\/+/).join(' ')
|
62
62
|
end
|
63
|
+
processed_string
|
63
64
|
end
|
64
65
|
|
65
66
|
def replace_forward_slashes_except_dates
|
@@ -68,6 +69,7 @@ module WordCountAnalyzer
|
|
68
69
|
except_date_string.gsub!(FORWARD_SLASH_REGEX).each do |match|
|
69
70
|
match.split(/\/+/).join(' ')
|
70
71
|
end
|
72
|
+
except_date_string
|
71
73
|
end
|
72
74
|
|
73
75
|
def backslash_occurences
|
@@ -79,6 +81,7 @@ module WordCountAnalyzer
|
|
79
81
|
processed_string.gsub!(BACKSLASH_REGEX).each do |match|
|
80
82
|
' word ' * match.split(/\\+/).length
|
81
83
|
end
|
84
|
+
processed_string
|
82
85
|
end
|
83
86
|
end
|
84
87
|
end
|
@@ -553,6 +553,12 @@ RSpec.describe WordCountAnalyzer::Counter do
|
|
553
553
|
expect(ws.count).to eq(66)
|
554
554
|
end
|
555
555
|
|
556
|
+
it 'counts the words in a string #005' do
|
557
|
+
text = "Hello world... 11/22/2013"
|
558
|
+
ws = WordCountAnalyzer::Counter.new(text: text)
|
559
|
+
expect(ws.count).to eq(3)
|
560
|
+
end
|
561
|
+
|
556
562
|
context 'Pages Word Count' do
|
557
563
|
it 'reverse engineers Pages word count #001' do
|
558
564
|
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|