word_count_analyzer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +554 -0
  8. data/Rakefile +2 -0
  9. data/lib/word_count_analyzer.rb +14 -0
  10. data/lib/word_count_analyzer/analyzer.rb +34 -0
  11. data/lib/word_count_analyzer/contraction.rb +176 -0
  12. data/lib/word_count_analyzer/counter.rb +230 -0
  13. data/lib/word_count_analyzer/date.rb +149 -0
  14. data/lib/word_count_analyzer/ellipsis.rb +48 -0
  15. data/lib/word_count_analyzer/hyperlink.rb +53 -0
  16. data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
  17. data/lib/word_count_analyzer/number.rb +23 -0
  18. data/lib/word_count_analyzer/numbered_list.rb +61 -0
  19. data/lib/word_count_analyzer/punctuation.rb +52 -0
  20. data/lib/word_count_analyzer/slash.rb +84 -0
  21. data/lib/word_count_analyzer/version.rb +3 -0
  22. data/lib/word_count_analyzer/xhtml.rb +26 -0
  23. data/spec/spec_helper.rb +1 -0
  24. data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
  25. data/spec/word_count_analyzer/contraction_spec.rb +124 -0
  26. data/spec/word_count_analyzer/counter_spec.rb +647 -0
  27. data/spec/word_count_analyzer/date_spec.rb +257 -0
  28. data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
  29. data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
  30. data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
  31. data/spec/word_count_analyzer/number_spec.rb +63 -0
  32. data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
  33. data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
  34. data/spec/word_count_analyzer/slash_spec.rb +105 -0
  35. data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
  36. data/word_count_analyzer.gemspec +26 -0
  37. metadata +153 -0
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,14 @@
1
+ require "word_count_analyzer/version"
2
+ require "word_count_analyzer/analyzer"
3
+ require "word_count_analyzer/counter"
4
+ require "word_count_analyzer/contraction"
5
+ require "word_count_analyzer/hyperlink"
6
+ require "word_count_analyzer/hyphenated_word"
7
+ require "word_count_analyzer/date"
8
+ require "word_count_analyzer/ellipsis"
9
+ require "word_count_analyzer/numbered_list"
10
+ require "word_count_analyzer/xhtml"
11
+ require "word_count_analyzer/number"
12
+ require "word_count_analyzer/slash"
13
+ require "word_count_analyzer/punctuation"
14
+ require "engtagger"
@@ -0,0 +1,34 @@
module WordCountAnalyzer
  # Reports how often each special-case construct (ellipses, hyperlinks,
  # contractions, hyphenated words, dates, numbers, ...) occurs in a text by
  # delegating to the per-feature classes of this gem.
  class Analyzer
    attr_reader :text, :tgr

    # @param text [String] the text to analyze
    def initialize(text:)
      @text = text
      @tgr = EngTagger.new
    end

    # Runs every feature detector against the text.
    #
    # @return [Hash] feature name (String) => whatever the matching detector
    #   reports; 'contraction' and 'hyphenated_word' are Integer token counts
    def analyze
      analysis = {}
      analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new(string: text).occurences
      contraction_count, hyphenated_word_count = token_level_counts
      analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new(string: text).occurences
      analysis['contraction'] = contraction_count
      analysis['hyphenated_word'] = hyphenated_word_count
      analysis['date'] = WordCountAnalyzer::Date.new(string: text).occurences
      analysis['number'] = WordCountAnalyzer::Number.new(string: text).occurences
      analysis['numbered_list'] = WordCountAnalyzer::NumberedList.new(string: text).occurences
      analysis['xhtml'] = WordCountAnalyzer::Xhtml.new(string: text).occurences
      analysis['forward_slash'] = WordCountAnalyzer::Slash.new(string: text).forward_slash_occurences
      analysis['backslash'] = WordCountAnalyzer::Slash.new(string: text).backslash_occurences
      analysis['dotted_line'] = WordCountAnalyzer::Punctuation.new(string: text).dotted_line_ocurrances
      analysis['dashed_line'] = WordCountAnalyzer::Punctuation.new(string: text).dashed_line_ocurrances
      analysis['underscore'] = WordCountAnalyzer::Punctuation.new(string: text).underscore_ocurrances
      analysis['stray_punctuation'] = WordCountAnalyzer::Punctuation.new(string: text).stray_punctuation_occurences
      analysis
    end

    private

    # Counts contractions and hyphenated words among the whitespace-separated
    # tokens of the XHTML-processed text.
    #
    # The token that follows each candidate is taken from the SAME token list
    # that is being iterated. Looking it up in the raw text instead would pair
    # a token with the wrong neighbour whenever markup processing changes the
    # tokenisation, and would re-split the whole text for every token.
    #
    # @return [Array(Integer, Integer)] contraction count, hyphenated word count
    def token_level_counts
      tokens = WordCountAnalyzer::Xhtml.new(string: text).replace.split(/\s+/)
      contractions = 0
      hyphenated_words = 0
      tokens.each_with_index do |token, index|
        candidate = WordCountAnalyzer::Contraction.new(
          token: token,
          following_token: tokens[index + 1], # nil for the last token
          tgr: tgr,
          hyphen: 'single'
        )
        contractions += 1 if candidate.contraction?
        hyphenated_words += 1 if WordCountAnalyzer::HyphenatedWord.new(token: token).hyphenated_word?
      end
      [contractions, hyphenated_words]
    end
  end
end
@@ -0,0 +1,176 @@
module WordCountAnalyzer
  # Decides whether a single token is a contraction and, if so, how many words
  # its expanded form contains.
  class Contraction
    # Lower-cased contraction => expanded form.
    #
    # Each key appears exactly once. Ambiguous forms ("'s" = is/has,
    # "'d" = would/had) carry a single expansion; both readings have the same
    # number of words, so the word count is unaffected by the choice.
    CONTRACTIONS = {
      "i'm" => "I am",
      "i'll" => "I will",
      "i've" => "I have",
      "i'd" => "I had",
      "you're" => "you are",
      "you'll" => "you will",
      "you've" => "you have",
      "you'd" => "you had",
      "he'll" => "he will",
      "he's" => "he has",
      "he'd" => "he had",
      "she'll" => "she will",
      "she's" => "she has",
      "she'd" => "she had",
      "'tis" => "it is",
      "it'll" => "it will",
      "it's" => "it has",
      "it'd" => "it had",
      "we're" => "we are",
      "we'll" => "we will",
      "we've" => "we have",
      "we'd" => "we had",
      "they're" => "they are",
      "they'll" => "they will",
      "they've" => "they have",
      "they'd" => "they had",
      "that'll" => "that will",
      "that's" => "that has",
      "that'd" => "that had",
      "who'll" => "who will",
      "who's" => "who has",
      "who'd" => "who had",
      "what're" => "what are",
      "what'll" => "what will",
      "what's" => "what has",
      "what'd" => "what had",
      "where'll" => "where will",
      "where's" => "where has",
      "where'd" => "where had",
      "when'll" => "when will",
      "when's" => "when has",
      "when'd" => "when had",
      "why'll" => "why will",
      "why's" => "why has",
      "why'd" => "why had",
      "how'll" => "how will",
      "how's" => "how has",
      "how'd" => "how had",
      "she'd've" => "she would have",
      "'tisn't" => "it is not",
      "isn't" => "is not",
      "aren't" => "are not",
      "wasn't" => "was not",
      "weren't" => "were not",
      "haven't" => "have not",
      "hasn't" => "has not",
      "hadn't" => "had not",
      "won't" => "will not",
      "wouldn't" => "would not",
      "don't" => "do not",
      "doesn't" => "does not",
      "didn't" => "did not",
      "can't" => "cannot",
      "couldn't" => "could not",
      "shouldn't" => "should not",
      "mightn't" => "might not",
      "mustn't" => "must not",
      "would've" => "would have",
      "should've" => "should have",
      "could've" => "could have",
      "might've" => "might have",
      "must've" => "must have",
      "o'" => "of",
      "o'clock" => "of the clock",
      "ma'am" => "madam",
      "ne'er-do-well" => "never-do-well",
      "cat-o'-nine-tails" => "cat-of-nine-tails",
      "jack-o'-lantern" => "jack-of-the-lantern",
      "will-o'-the-wisp" => "will-of-the-wisp",
      "'twas" => "it was"
    }.freeze

    attr_reader :token, :following_token, :tgr, :hyphen

    # @param token [String] the token to inspect
    # @param following_token [String, nil] the token after +token+; nil when
    #   +token+ is the last one
    # @param tgr [#add_tags] part-of-speech tagger (an EngTagger instance in
    #   this gem's callers)
    # @param args [Hash] optional +:hyphen+ setting; defaults to 'count_as_one'
    def initialize(token:, following_token:, tgr:, **args)
      @token = token
      @following_token = following_token
      @tgr = tgr
      @hyphen = args[:hyphen] || 'count_as_one'
    end

    # @return [Boolean] true for a listed contraction, or for a token that
    #   contains "'s" and is followed by something not tagged as a noun
    def contraction?
      common_contraction? || possessive_like_contraction?
    end

    # @return [Integer] number of words the token stands for: the length of
    #   the listed expansion, 2 for an unlisted "'s" contraction, otherwise 1
    def expanded_count
      return 1 unless contraction?

      common_contraction? ? calculate_contraction_length : 2
    end

    # @return [String] the listed expansion, the placeholder ' word word ' for
    #   an unlisted "'s" contraction, or the token itself
    def replace
      if common_contraction?
        CONTRACTIONS[token.downcase]
      elsif possessive_like_contraction?
        ' word word '
      else
        token
      end
    end

    private

    # Word count of the listed expansion; hyphenated parts are counted
    # separately unless +hyphen+ is 'count_as_one'.
    def calculate_contraction_length
      hyphen.eql?('count_as_one') ? contraction_length : contraction_length_hyphen
    end

    def contraction_length
      CONTRACTIONS[token.downcase].split(' ').length
    end

    def contraction_length_hyphen
      CONTRACTIONS[token.downcase].split(' ').flat_map { |word| word.split('-') }.length
    end

    def common_contraction?
      CONTRACTIONS.key?(token.downcase)
    end

    def possessive_like_contraction?
      apostrophe_s_token? && following_is_not_a_noun?
    end

    # Checks the second character of the tagger output for the following
    # token (presumably the first letter of a tag such as `<nn>` - confirm
    # against the tagger's output format).
    #
    # When there is no following token, or the tagger returns nothing usable,
    # the token cannot be confirmed as a contraction, so this returns false
    # instead of raising NoMethodError on nil.
    def following_is_not_a_noun?
      return false if following_token.nil?

      tagged = tgr.add_tags(following_token)
      tag_initial = tagged && tagged[1]
      return false unless tag_initial

      !tag_initial.downcase.eql?('n')
    end

    def apostrophe_s_token?
      token.include?("'s")
    end
  end
end
@@ -0,0 +1,230 @@
module WordCountAnalyzer
  # Counts the words of a text according to configurable rules for "gray
  # area" constructs (ellipses, hyperlinks, contractions, slashes, ...).
  #
  # Each option is exposed through a reader of the same name (+ellipsis+,
  # +hyperlink+, ... +stray_punctuation+).
  class Counter
    # Option name => value used when the caller does not supply one.
    OPTION_DEFAULTS = {
      ellipsis: 'ignore',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'remove',
      forward_slash: 'count_as_multiple_except_dates',
      backslash: 'count_as_one',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    }.freeze

    # Option values applied by #pages_count.
    PAGES_PRESET = {
      ellipsis: 'no_special_treatment',
      hyperlink: 'split_at_period',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_multiple',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'keep',
      forward_slash: 'count_as_multiple',
      backslash: 'count_as_multiple',
      dotted_line: 'ignore',
      dashed_line: 'ignore',
      underscore: 'ignore',
      stray_punctuation: 'ignore'
    }.freeze

    # Option values applied by #mword_count.
    MWORD_PRESET = {
      ellipsis: 'no_special_treatment',
      hyperlink: 'count_as_one',
      contraction: 'count_as_one',
      hyphenated_word: 'count_as_one',
      date: 'no_special_treatment',
      number: 'count',
      numbered_list: 'count',
      xhtml: 'keep',
      forward_slash: 'count_as_one',
      backslash: 'count_as_one',
      dotted_line: 'count',
      dashed_line: 'count',
      underscore: 'count',
      stray_punctuation: 'count'
    }.freeze

    # Processing steps in the order they are applied to the text.
    PIPELINE = %i[
      process_ellipsis
      process_hyperlink
      process_contraction
      process_date
      process_number
      process_number_list
      process_xhtml
      process_forward_slash
      process_backslash
      process_hyphenated_word
      process_dotted_line
      process_dashed_line
      process_underscore
      process_stray_punctuation
    ].freeze

    attr_reader :text, :tgr

    # One reader per option, returning the currently effective value.
    OPTION_DEFAULTS.each_key do |option|
      define_method(option) { @options[option] }
    end

    # @param text [String] the text whose words are counted
    # @param args [Hash] any of the OPTION_DEFAULTS keys; a missing, nil or
    #   false value falls back to the default
    def initialize(text:, **args)
      @text = text
      @options = OPTION_DEFAULTS.each_with_object({}) do |(name, default), chosen|
        chosen[name] = args[name] || default
      end
      @tgr = EngTagger.new
    end

    # @return [Integer] word count using the options given to the constructor
    # @raise [RuntimeError] when an option holds an unsupported value
    def count
      word_count
    end

    # Word count under the PAGES_PRESET option values. The preset applies to
    # this call only; the constructor options are left intact.
    #
    # @return [Integer]
    def pages_count
      count_with(PAGES_PRESET)
    end

    # Word count under the MWORD_PRESET option values. The preset applies to
    # this call only; the constructor options are left intact.
    #
    # @return [Integer]
    def mword_count
      count_with(MWORD_PRESET)
    end

    private

    # Counts with +preset+ temporarily layered over the configured options and
    # restores them afterwards, even when counting raises. Without the restore,
    # a later #count on the same object would silently use the preset.
    def count_with(preset)
      configured = @options
      @options = configured.merge(preset)
      word_count
    ensure
      @options = configured
    end

    # Feeds the text through every step of PIPELINE and counts the remaining
    # whitespace-separated, non-empty tokens.
    def word_count
      processed_text = PIPELINE.reduce(text) { |txt, step| send(step, txt) }
      processed_text.split(/\s+/).reject(&:empty?).size
    end

    def process_ellipsis(txt)
      case ellipsis
      when 'ignore'
        WordCountAnalyzer::Ellipsis.new(string: txt).replace.gsub(/wseword/, '')
      when 'no_special_treatment'
        txt
      else
        raise 'The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`'
      end
    end

    def process_hyperlink(txt)
      case hyperlink
      when 'count_as_one'
        WordCountAnalyzer::Hyperlink.new(string: txt).replace
      when 'split_at_period'
        WordCountAnalyzer::Hyperlink.new(string: txt).replace_split_at_period
      when 'no_special_treatment'
        txt
      else
        raise 'The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`'
      end
    end

    def process_contraction(txt)
      case contraction
      when 'count_as_one'
        txt
      when 'count_as_multiple'
        tokens = txt.split(/\s+/)
        expanded = tokens.each_with_index.map do |token, i|
          WordCountAnalyzer::Contraction.new(token: token, following_token: tokens[i + 1], tgr: tgr).replace
        end
        expanded.join(' ')
      else
        raise 'The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_hyphenated_word(txt)
      case hyphenated_word
      when 'count_as_one'
        txt
      when 'count_as_multiple'
        txt.split(/\s+/).map { |token| WordCountAnalyzer::HyphenatedWord.new(token: token).replace }.join(' ')
      else
        raise 'The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_date(txt)
      case date
      when 'no_special_treatment'
        txt
      when 'count_as_one'
        WordCountAnalyzer::Date.new(string: txt).replace
      else
        raise 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`'
      end
    end

    def process_number(txt)
      case number
      when 'ignore'
        WordCountAnalyzer::Number.new(string: txt).replace.gsub(/wsnumword/, '')
      when 'count'
        txt
      else
        raise 'The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`'
      end
    end

    def process_number_list(txt)
      case numbered_list
      when 'ignore'
        WordCountAnalyzer::NumberedList.new(string: txt).replace
      when 'count'
        txt
      else
        raise 'The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`'
      end
    end

    def process_xhtml(txt)
      case xhtml
      when 'remove'
        WordCountAnalyzer::Xhtml.new(string: txt).replace
      when 'keep'
        txt
      else
        raise 'The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`'
      end
    end

    def process_forward_slash(txt)
      case forward_slash
      when 'count_as_multiple'
        WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_forward_slashes
      when 'count_as_multiple_except_dates'
        WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_forward_slashes_except_dates
      when 'count_as_one'
        txt
      else
        raise 'The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`'
      end
    end

    def process_backslash(txt)
      case backslash
      when 'count_as_multiple'
        WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_backslashes
      when 'count_as_one'
        txt
      else
        raise 'The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_dotted_line(txt)
      case dotted_line
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_dotted_line
      when 'count'
        txt
      else
        raise 'The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_dashed_line(txt)
      case dashed_line
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_dashed_line
      when 'count'
        txt
      else
        raise 'The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_underscore(txt)
      case underscore
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_underscore
      when 'count'
        txt
      else
        raise 'The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_stray_punctuation(txt)
      case stray_punctuation
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_stray_punctuation
      when 'count'
        txt
      else
        raise 'The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end
  end
end