word_count_analyzer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +554 -0
  8. data/Rakefile +2 -0
  9. data/lib/word_count_analyzer.rb +14 -0
  10. data/lib/word_count_analyzer/analyzer.rb +34 -0
  11. data/lib/word_count_analyzer/contraction.rb +176 -0
  12. data/lib/word_count_analyzer/counter.rb +230 -0
  13. data/lib/word_count_analyzer/date.rb +149 -0
  14. data/lib/word_count_analyzer/ellipsis.rb +48 -0
  15. data/lib/word_count_analyzer/hyperlink.rb +53 -0
  16. data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
  17. data/lib/word_count_analyzer/number.rb +23 -0
  18. data/lib/word_count_analyzer/numbered_list.rb +61 -0
  19. data/lib/word_count_analyzer/punctuation.rb +52 -0
  20. data/lib/word_count_analyzer/slash.rb +84 -0
  21. data/lib/word_count_analyzer/version.rb +3 -0
  22. data/lib/word_count_analyzer/xhtml.rb +26 -0
  23. data/spec/spec_helper.rb +1 -0
  24. data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
  25. data/spec/word_count_analyzer/contraction_spec.rb +124 -0
  26. data/spec/word_count_analyzer/counter_spec.rb +647 -0
  27. data/spec/word_count_analyzer/date_spec.rb +257 -0
  28. data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
  29. data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
  30. data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
  31. data/spec/word_count_analyzer/number_spec.rb +63 -0
  32. data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
  33. data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
  34. data/spec/word_count_analyzer/slash_spec.rb +105 -0
  35. data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
  36. data/word_count_analyzer.gemspec +26 -0
  37. metadata +153 -0
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,14 @@
1
+ require "word_count_analyzer/version"
2
+ require "word_count_analyzer/analyzer"
3
+ require "word_count_analyzer/counter"
4
+ require "word_count_analyzer/contraction"
5
+ require "word_count_analyzer/hyperlink"
6
+ require "word_count_analyzer/hyphenated_word"
7
+ require "word_count_analyzer/date"
8
+ require "word_count_analyzer/ellipsis"
9
+ require "word_count_analyzer/numbered_list"
10
+ require "word_count_analyzer/xhtml"
11
+ require "word_count_analyzer/number"
12
+ require "word_count_analyzer/slash"
13
+ require "word_count_analyzer/punctuation"
14
+ require "engtagger"
@@ -0,0 +1,34 @@
1
module WordCountAnalyzer
  # Reports how often each special-case construct (ellipses, hyperlinks,
  # contractions, hyphenated words, dates, ...) occurs in a text.
  #
  # Every count except the contraction and hyphenated-word counts is delegated
  # to a dedicated sibling class (Ellipsis, Hyperlink, Date, ...), which is
  # expected to be loaded alongside this file.
  class Analyzer
    attr_reader :text, :tgr

    # @param text [String] the text to analyze
    def initialize(text:)
      @text = text
      @tgr = EngTagger.new
    end

    # Builds the analysis report.
    #
    # @return [Hash{String => Object}] one entry per construct, keyed by the
    #   construct name ('ellipsis', 'hyperlink', 'contraction', ...). Values
    #   are whatever the delegate classes return (presumably integer counts -
    #   confirm against those classes).
    def analyze
      analysis = {}
      analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new(string: text).occurences
      contraction_count, hyphenated_word_count = count_token_features
      analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new(string: text).occurences
      analysis['contraction'] = contraction_count
      analysis['hyphenated_word'] = hyphenated_word_count
      analysis['date'] = WordCountAnalyzer::Date.new(string: text).occurences
      analysis['number'] = WordCountAnalyzer::Number.new(string: text).occurences
      analysis['numbered_list'] = WordCountAnalyzer::NumberedList.new(string: text).occurences
      analysis['xhtml'] = WordCountAnalyzer::Xhtml.new(string: text).occurences
      analysis['forward_slash'] = WordCountAnalyzer::Slash.new(string: text).forward_slash_occurences
      analysis['backslash'] = WordCountAnalyzer::Slash.new(string: text).backslash_occurences
      # The misspelled method names below belong to the Punctuation API and
      # must be called exactly as that class defines them.
      analysis['dotted_line'] = WordCountAnalyzer::Punctuation.new(string: text).dotted_line_ocurrances
      analysis['dashed_line'] = WordCountAnalyzer::Punctuation.new(string: text).dashed_line_ocurrances
      analysis['underscore'] = WordCountAnalyzer::Punctuation.new(string: text).underscore_ocurrances
      analysis['stray_punctuation'] = WordCountAnalyzer::Punctuation.new(string: text).stray_punctuation_occurences
      analysis
    end

    private

    # Counts contractions and hyphenated words token by token.
    #
    # The text is tokenized once, after Xhtml#replace has been applied, and
    # both the current and the following token are taken from that same list.
    # Taking the following token from a split of the raw text would pair a
    # token with the wrong neighbour whenever markup was removed, and would
    # re-split the whole text for every token.
    #
    # @return [Array(Integer, Integer)] contraction count, hyphenated-word count
    def count_token_features
      tokens = WordCountAnalyzer::Xhtml.new(string: text).replace.split(/\s+/)
      contractions = 0
      hyphenated_words = 0
      tokens.each_with_index do |token, index|
        contraction = WordCountAnalyzer::Contraction.new(
          token: token,
          following_token: tokens[index + 1], # nil for the last token
          tgr: tgr,
          hyphen: 'single'
        )
        contractions += 1 if contraction.contraction?
        hyphenated_words += 1 if WordCountAnalyzer::HyphenatedWord.new(token: token).hyphenated_word?
      end
      [contractions, hyphenated_words]
    end
  end
end
@@ -0,0 +1,176 @@
1
+ module WordCountAnalyzer
2
+ class Contraction
3
+ CONTRACTIONS = {
4
+ "i'm" => "I am",
5
+ "i'll" => "I will",
6
+ "i'd" => "I would",
7
+ "i've" => "I have",
8
+ "i'd" => "I had",
9
+ "you're" => "you are",
10
+ "you'll" => "you will",
11
+ "you'd" => "you would",
12
+ "you've" => "you have",
13
+ "you'd" => "you had",
14
+ "he's" => "he is",
15
+ "he'll" => "he will",
16
+ "he'd" => "he would",
17
+ "he's" => "he has",
18
+ "he'd" => "he had",
19
+ "she's" => "she is",
20
+ "she'll" => "she will",
21
+ "she'd" => "she would",
22
+ "she's" => "she has",
23
+ "she'd" => "she had",
24
+ "it's" => "it is",
25
+ "'tis" => "it is",
26
+ "it'll" => "it will",
27
+ "it'd" => "it would",
28
+ "it's" => "it has",
29
+ "it'd" => "it had",
30
+ "we're" => "we are",
31
+ "we'll" => "we will",
32
+ "we'd" => "we would",
33
+ "we've" => "we have",
34
+ "we'd" => "we had",
35
+ "they're" => "they are",
36
+ "they'll" => "they will",
37
+ "they'd" => "they would",
38
+ "they've" => "they have",
39
+ "they'd" => "they had",
40
+ "that's" => "that is",
41
+ "that'll" => "that will",
42
+ "that'd" => "that would",
43
+ "that's" => "that has",
44
+ "that'd" => "that had",
45
+ "who's" => "who is",
46
+ "who'll" => "who will",
47
+ "who'd" => "who would",
48
+ "who's" => "who has",
49
+ "who'd" => "who had",
50
+ "what's" => "what is",
51
+ "what're" => "what are",
52
+ "what'll" => "what will",
53
+ "what'd" => "what would",
54
+ "what's" => "what has",
55
+ "what'd" => "what had",
56
+ "where's" => "where is",
57
+ "where'll" => "where will",
58
+ "where'd" => "where would",
59
+ "where's" => "where has",
60
+ "where'd" => "where had",
61
+ "when's" => "when is",
62
+ "when'll" => "when will",
63
+ "when'd" => "when would",
64
+ "when's" => "when has",
65
+ "when'd" => "when had",
66
+ "why's" => "why is",
67
+ "why'll" => "why will",
68
+ "why'd" => "why would",
69
+ "why's" => "why has",
70
+ "why'd" => "why had",
71
+ "how's" => "how is",
72
+ "how'll" => "how will",
73
+ "how'd" => "how would",
74
+ "how's" => "how has",
75
+ "how'd" => "how had",
76
+ "she'd've" => "she would have",
77
+ "'tisn't" => "it is not",
78
+ "isn't" => "is not",
79
+ "aren't" => "are not",
80
+ "wasn't" => "was not",
81
+ "weren't" => "were not",
82
+ "haven't" => "have not",
83
+ "hasn't" => "has not",
84
+ "hadn't" => "had not",
85
+ "won't" => "will not",
86
+ "wouldn't" => "would not",
87
+ "don't" => "do not",
88
+ "doesn't" => "does not",
89
+ "didn't" => "did not",
90
+ "can't" => "cannot",
91
+ "couldn't" => "could not",
92
+ "shouldn't" => "should not",
93
+ "mightn't" => "might not",
94
+ "mustn't" => "must not",
95
+ "would've" => "would have",
96
+ "should've" => "should have",
97
+ "could've" => "could have",
98
+ "might've" => "might have",
99
+ "must've" => "must have",
100
+ "o'" => "of",
101
+ "o'clock" => "of the clock",
102
+ "ma'am" => "madam",
103
+ "ne'er-do-well" => "never-do-well",
104
+ "cat-o'-nine-tails" => "cat-of-nine-tails",
105
+ "jack-o'-lantern" => "jack-of-the-lantern",
106
+ "will-o'-the-wisp" => "will-of-the-wisp",
107
+ "'twas" => "it was"
108
+ }
109
+
110
+ attr_reader :token, :following_token, :tgr, :hyphen
111
+ def initialize(token:, following_token:, tgr:, **args)
112
+ @token = token
113
+ @following_token = following_token
114
+ @tgr = tgr
115
+ @hyphen = args[:hyphen] || 'count_as_one'
116
+ end
117
+
118
+ def contraction?
119
+ common_contraction? ||
120
+ (apostrophe_s_token? &&
121
+ following_is_not_a_noun?)
122
+ end
123
+
124
+ def expanded_count
125
+ if self.contraction?
126
+ if common_contraction?
127
+ calculate_contraction_length
128
+ else
129
+ 2
130
+ end
131
+ else
132
+ 1
133
+ end
134
+ end
135
+
136
+ def replace
137
+ if CONTRACTIONS.has_key?(token.downcase)
138
+ CONTRACTIONS[token.downcase]
139
+ elsif apostrophe_s_token? && following_is_not_a_noun?
140
+ ' word word '
141
+ else
142
+ token
143
+ end
144
+ end
145
+
146
+ private
147
+
148
+ def calculate_contraction_length
149
+ if hyphen.eql?('count_as_one') && hyphen
150
+ contraction_length
151
+ else
152
+ contraction_length_hyphen
153
+ end
154
+ end
155
+
156
+ def contraction_length
157
+ CONTRACTIONS[token.downcase].split(' ').length
158
+ end
159
+
160
+ def contraction_length_hyphen
161
+ CONTRACTIONS[token.downcase].split(' ').map { |token| token.split('-') }.flatten.length
162
+ end
163
+
164
+ def common_contraction?
165
+ CONTRACTIONS.has_key?(token.downcase)
166
+ end
167
+
168
+ def following_is_not_a_noun?
169
+ !tgr.add_tags(following_token)[1].downcase.eql?('n')
170
+ end
171
+
172
+ def apostrophe_s_token?
173
+ token.include?("'s")
174
+ end
175
+ end
176
+ end
@@ -0,0 +1,230 @@
1
module WordCountAnalyzer
  # Counts the words of a text under a configurable set of rules.
  #
  # Each rule (how to treat ellipses, hyperlinks, contractions, ...) is a
  # string option. The text is passed through one processing step per rule
  # and the whitespace-separated, non-empty tokens that remain are counted.
  # The actual text rewriting is delegated to sibling classes (Ellipsis,
  # Hyperlink, Slash, Punctuation, ...), expected to be loaded with this file.
  #
  # Option values are validated lazily: an unsupported value raises a
  # RuntimeError from the counting methods, not from the constructor.
  class Counter
    # Rule name => value used when the caller does not supply one.
    DEFAULT_SETTINGS = {
      ellipsis: 'ignore',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'remove',
      forward_slash: 'count_as_multiple_except_dates',
      backslash: 'count_as_one',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    }.freeze

    # Fixed rule set applied by #pages_count.
    PAGES_SETTINGS = {
      ellipsis: 'no_special_treatment',
      hyperlink: 'split_at_period',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_multiple',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'keep',
      forward_slash: 'count_as_multiple',
      backslash: 'count_as_multiple',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    }.freeze

    # Fixed rule set applied by #mword_count.
    MWORD_SETTINGS = {
      ellipsis: 'no_special_treatment',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'keep',
      forward_slash: 'count_as_one',
      backslash: 'count_as_one',
      dotted_line: 'count',
      dashed_line: 'count',
      underscore: 'count',
      stray_punctuation: 'count'
    }.freeze

    # Processing steps in the order they are applied to the text.
    PROCESSING_STEPS = %i[
      process_ellipsis
      process_hyperlink
      process_contraction
      process_date
      process_number
      process_number_list
      process_xhtml
      process_forward_slash
      process_backslash
      process_hyphenated_word
      process_dotted_line
      process_dashed_line
      process_underscore
      process_stray_punctuation
    ].freeze

    attr_reader :text, :ellipsis, :hyperlink, :contraction, :hyphenated_word, :date, :number, :numbered_list, :xhtml, :forward_slash, :backslash, :dotted_line, :dashed_line, :underscore, :stray_punctuation, :tgr

    # @param text [String] the text whose words are counted
    # @param args [Hash] rule overrides keyed by the names in
    #   DEFAULT_SETTINGS; a missing, nil or false value falls back to the
    #   default and unknown keys are ignored
    def initialize(text:, **args)
      @text = text
      DEFAULT_SETTINGS.each do |rule, default|
        instance_variable_set("@#{rule}", args[rule] || default)
      end
      @tgr = EngTagger.new
    end

    # @return [Integer] word count under this instance's own settings
    # @raise [RuntimeError] if any setting holds an unsupported value
    def count
      word_count
    end

    # Word count under the PAGES_SETTINGS preset (presumably mirroring the
    # count of a word processor - confirm with the gem's README).
    #
    # The preset is applied only for the duration of the call. The
    # instance's own settings are restored afterwards, so a later #count is
    # not silently computed with the preset.
    #
    # @return [Integer]
    def pages_count
      with_settings(PAGES_SETTINGS) { word_count }
    end

    # Word count under the MWORD_SETTINGS preset (presumably mirroring the
    # count of a word processor - confirm with the gem's README).
    #
    # As with #pages_count, the instance's own settings are restored after
    # the call.
    #
    # @return [Integer]
    def mword_count
      with_settings(MWORD_SETTINGS) { word_count }
    end

    private

    # Applies +preset+, yields, then restores the previous settings even if
    # the block raises.
    def with_settings(preset)
      previous = current_settings
      apply_settings(preset)
      yield
    ensure
      apply_settings(previous) if previous
    end

    # Snapshot of the current value of every rule.
    def current_settings
      DEFAULT_SETTINGS.keys.each_with_object({}) do |rule, snapshot|
        snapshot[rule] = instance_variable_get("@#{rule}")
      end
    end

    def apply_settings(settings)
      settings.each { |rule, value| instance_variable_set("@#{rule}", value) }
    end

    # Runs every processing step in order and counts the remaining tokens.
    def word_count
      processed_text = PROCESSING_STEPS.reduce(text) { |txt, step| send(step, txt) }
      processed_text.split(/\s+/).reject(&:empty?).size
    end

    def process_ellipsis(txt)
      case ellipsis
      when 'ignore'
        # Ellipsis#replace output is stripped of the 'wseword' marker here;
        # presumably that marker is what replace substitutes for an ellipsis.
        WordCountAnalyzer::Ellipsis.new(string: txt).replace.gsub(/wseword/, '')
      when 'no_special_treatment'
        txt
      else
        raise 'The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`'
      end
    end

    def process_hyperlink(txt)
      case hyperlink
      when 'count_as_one'
        WordCountAnalyzer::Hyperlink.new(string: txt).replace
      when 'split_at_period'
        WordCountAnalyzer::Hyperlink.new(string: txt).replace_split_at_period
      when 'no_special_treatment'
        txt
      else
        raise 'The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`'
      end
    end

    def process_contraction(txt)
      case contraction
      when 'count_as_one'
        txt
      when 'count_as_multiple'
        tokens = txt.split(/\s+/)
        expanded = tokens.each_with_index.map do |token, index|
          WordCountAnalyzer::Contraction.new(token: token, following_token: tokens[index + 1], tgr: tgr).replace
        end
        expanded.join(' ')
      else
        raise 'The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_hyphenated_word(txt)
      case hyphenated_word
      when 'count_as_one'
        txt
      when 'count_as_multiple'
        txt.split(/\s+/).map { |token| WordCountAnalyzer::HyphenatedWord.new(token: token).replace }.join(' ')
      else
        raise 'The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_date(txt)
      case date
      when 'no_special_treatment'
        txt
      when 'count_as_one'
        WordCountAnalyzer::Date.new(string: txt).replace
      else
        raise 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`'
      end
    end

    def process_number(txt)
      case number
      when 'ignore'
        # Same marker-stripping scheme as for ellipses, with 'wsnumword'.
        WordCountAnalyzer::Number.new(string: txt).replace.gsub(/wsnumword/, '')
      when 'count'
        txt
      else
        raise 'The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`'
      end
    end

    def process_number_list(txt)
      case numbered_list
      when 'ignore'
        WordCountAnalyzer::NumberedList.new(string: txt).replace
      when 'count'
        txt
      else
        raise 'The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`'
      end
    end

    def process_xhtml(txt)
      case xhtml
      when 'remove'
        WordCountAnalyzer::Xhtml.new(string: txt).replace
      when 'keep'
        txt
      else
        raise 'The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`'
      end
    end

    def process_forward_slash(txt)
      case forward_slash
      when 'count_as_multiple'
        slash_processor(txt).replace_forward_slashes
      when 'count_as_multiple_except_dates'
        slash_processor(txt).replace_forward_slashes_except_dates
      when 'count_as_one'
        txt
      else
        raise 'The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`'
      end
    end

    def process_backslash(txt)
      case backslash
      when 'count_as_multiple'
        slash_processor(txt).replace_backslashes
      when 'count_as_one'
        txt
      else
        raise 'The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    # Slash handler for +txt+, given the date, xhtml and hyperlink settings
    # currently in effect.
    def slash_processor(txt)
      WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink)
    end

    def process_dotted_line(txt)
      case dotted_line
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_dotted_line
      when 'count'
        txt
      else
        raise 'The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_dashed_line(txt)
      case dashed_line
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_dashed_line
      when 'count'
        txt
      else
        raise 'The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_underscore(txt)
      case underscore
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_underscore
      when 'count'
        txt
      else
        raise 'The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_stray_punctuation(txt)
      case stray_punctuation
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_stray_punctuation
      when 'count'
        txt
      else
        raise 'The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end
  end
end