word_count_analyzer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +554 -0
- data/Rakefile +2 -0
- data/lib/word_count_analyzer.rb +14 -0
- data/lib/word_count_analyzer/analyzer.rb +34 -0
- data/lib/word_count_analyzer/contraction.rb +176 -0
- data/lib/word_count_analyzer/counter.rb +230 -0
- data/lib/word_count_analyzer/date.rb +149 -0
- data/lib/word_count_analyzer/ellipsis.rb +48 -0
- data/lib/word_count_analyzer/hyperlink.rb +53 -0
- data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
- data/lib/word_count_analyzer/number.rb +23 -0
- data/lib/word_count_analyzer/numbered_list.rb +61 -0
- data/lib/word_count_analyzer/punctuation.rb +52 -0
- data/lib/word_count_analyzer/slash.rb +84 -0
- data/lib/word_count_analyzer/version.rb +3 -0
- data/lib/word_count_analyzer/xhtml.rb +26 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
- data/spec/word_count_analyzer/contraction_spec.rb +124 -0
- data/spec/word_count_analyzer/counter_spec.rb +647 -0
- data/spec/word_count_analyzer/date_spec.rb +257 -0
- data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
- data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
- data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
- data/spec/word_count_analyzer/number_spec.rb +63 -0
- data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
- data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
- data/spec/word_count_analyzer/slash_spec.rb +105 -0
- data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
- data/word_count_analyzer.gemspec +26 -0
- metadata +153 -0
@@ -0,0 +1,149 @@
|
|
1
|
+
module WordCountAnalyzer
  # Finds calendar dates in free text and masks each one with the
  # placeholder token "wsdateword" so that a date can be counted as a
  # single word by later processing steps.
  class Date
    DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
    DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
    MONTHS = %w(january february march april may june july august september october november december).freeze
    MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze

    # Rubular: http://rubular.com/r/73CZ2HU0q6
    DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}\.?/

    # Rubular: http://rubular.com/r/GWbuWXw4t0
    YMD_YDM_REGEX = /\d{4}(\/|\.|-)(\d{1,2}(\/|\.|-)){2}\.?/

    # Rubular: http://rubular.com/r/SRZ27XNlvR
    DIGIT_ONLY_YEAR_FIRST_REGEX = /[12]\d{7}\D\.?/

    # Rubular: http://rubular.com/r/mpVSeaKwdY
    DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D\.?/

    # Purely numeric date formats, in the order they are applied.
    NUMBER_ONLY_REGEXES = [
      DMY_MDY_REGEX,
      YMD_YDM_REGEX,
      DIGIT_ONLY_YEAR_FIRST_REGEX,
      DIGIT_ONLY_YEAR_LAST_REGEX
    ].freeze

    # Text substituted for every recognised date.
    PLACEHOLDER = ' wsdateword '.freeze

    # Optional ordinal suffix after a day number ("1st", "2nd", "3rd", "4th").
    ORDINAL = '(st|nd|rd|th)*'.freeze

    # "<abbreviated weekday> <month> <day>, <year>" patterns. For each weekday
    # abbreviation the full month names come first, then the abbreviated ones
    # (which may carry a trailing period).
    ABBREVIATED_DAY_PATTERNS = DOW_ABBR.flat_map do |day|
      d = Regexp.escape(day)
      full = MONTHS.map do |month|
        /#{d}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+#{ORDINAL}(,)*\s\d{4}\.?/i
      end
      abbreviated = MONTH_ABBR.map do |month|
        /#{d}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}\.?/i
      end
      full + abbreviated
    end.freeze

    # Patterns applied for every full weekday name and every month (full
    # names first, then abbreviations). Only the first pattern of each group
    # mentions the weekday; the remaining five cover month-based formats.
    # The list is kept in application order because each substitution runs
    # on the output of the previous one.
    FULL_DAY_PATTERNS = DOW.flat_map do |day|
      d = Regexp.escape(day)
      (MONTHS + MONTH_ABBR).flat_map do |month|
        m = Regexp.escape(month)
        [
          /#{d}(,)*\s#{m}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}\.?/i,
          /#{m}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}\.?/i,
          /\d{4}\.*\s#{m}\s\d+#{ORDINAL}\.?/i,
          /\d{4}(\.|-|\/)*#{m}(\.|-|\/)*\d+\.?/i,
          /#{m}(\.)*\s\d+#{ORDINAL}\.?/i,
          /\d{2}(\.|-|\/)*#{m}(\.|-|\/)*(\d{4}|\d{2})\.?/i
        ]
      end
    end.freeze

    # De-duplicated set used when only the presence of a date matters.
    LONG_DATE_PATTERNS = (FULL_DAY_PATTERNS + ABBREVIATED_DAY_PATTERNS).uniq.freeze

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text contains a written-out or a
    #   numeric date
    def includes_date?
      long_date || number_only_date
    end

    # Masks every recognised date with the placeholder.
    #
    # When the text contains the substring "day" the abbreviated-weekday
    # patterns run before the full-weekday ones; otherwise the order is
    # reversed. Numeric formats are always handled last.
    #
    # @return [String] a new string; the receiver's text is not modified
    def replace
      date_patterns =
        if string.include?('day')
          ABBREVIATED_DAY_PATTERNS + FULL_DAY_PATTERNS
        else
          FULL_DAY_PATTERNS + ABBREVIATED_DAY_PATTERNS
        end
      mask(date_patterns + NUMBER_ONLY_REGEXES)
    end

    # @return [Integer] number of placeholders produced by #replace
    def occurences
      replace.scan(/wsdateword/).size
    end

    # Masks only the purely numeric date formats.
    #
    # @return [String] a new string with numeric dates replaced
    def replace_number_only_date
      mask(NUMBER_ONLY_REGEXES)
    end

    private

    # Applies each pattern in turn, feeding every substitution the output of
    # the previous one.
    def mask(patterns)
      patterns.reduce(string.dup) { |masked, pattern| masked.gsub(pattern, PLACEHOLDER) }
    end

    # True when any weekday/month based pattern matches the text.
    def long_date
      LONG_DATE_PATTERNS.any? { |pattern| string =~ pattern }
    end

    # True when any purely numeric date pattern matches the text.
    def number_only_date
      NUMBER_ONLY_REGEXES.any? { |pattern| string =~ pattern }
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects ellipses (three or four periods, spaced periods, or the Unicode
  # ellipsis character) and masks each one with the placeholder token
  # "wseword".
  class Ellipsis
    # Rubular: http://rubular.com/r/i60hCK81fz
    THREE_CONSECUTIVE_REGEX = /\.{3}(?=\s+[A-Z])/

    # Rubular: http://rubular.com/r/mfdtSeuIf2
    FOUR_CONSECUTIVE_REGEX = /(?<=[^\.])\.{3}\.(?=[^\.])/

    # Rubular: http://rubular.com/r/YBG1dIHTRu
    THREE_SPACE_REGEX = /(\s\.){3}\s/

    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FOUR_SPACE_REGEX = /(?<=[a-z])(\.\s){3}\.(\z|$|\n)/

    # Exactly three periods with a non-period character on each side. The
    # neighbours are asserted with lookarounds so they are not consumed:
    # replacing the match must not delete surrounding text, and two ellipses
    # that share a neighbour ("a...b...c") must both be found.
    OTHER_THREE_PERIOD_REGEX = /(?<=[^\.])\.{3}(?=[^\.])/

    UNICODE_ELLIPSIS = /(?<=[^…])…{1}(?=[^…])/

    # All ellipsis patterns in the order #replace applies them.
    ELLIPSIS_REGEXES = [
      THREE_CONSECUTIVE_REGEX,
      FOUR_CONSECUTIVE_REGEX,
      THREE_SPACE_REGEX,
      FOUR_SPACE_REGEX,
      OTHER_THREE_PERIOD_REGEX,
      UNICODE_ELLIPSIS
    ].freeze

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when any ellipsis pattern matches the text
    def includes_ellipsis?
      ELLIPSIS_REGEXES.any? { |pattern| string =~ pattern }
    end

    # @return [String] a new string with every ellipsis replaced by
    #   " wseword "
    def replace
      ELLIPSIS_REGEXES.reduce(string) { |text, pattern| text.gsub(pattern, ' wseword ') }
    end

    # @return [Integer] number of stand-alone "wseword" tokens in #replace
    def occurences
      replace.split(' ').count { |token| token.eql?('wseword') }
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects hyperlinks in text and either masks them with the placeholder
  # token "wslinkword" or splits them at their periods.
  class Hyperlink
    # A bare "word:" token, which the generic URI pattern would otherwise
    # accept as a scheme.
    NON_HYPERLINK_REGEX = /\A\w+:$/

    # Rubular: http://rubular.com/r/fXa4lp0gfS
    HYPERLINK_REGEX = /(http|https|www)(\.|:)/

    attr_reader :string

    # @param string [String] the text (or single token) to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the whole text looks like a hyperlink
    def hyperlink?
      link?(string)
    end

    # @return [Integer] number of URI matches for which more than one
    #   component of the URI pattern was captured
    def occurences
      string.scan(URI.regexp).count { |components| components.compact.size > 1 }
    end

    # @return [String] a new string with each link replaced by " wslinkword "
    def replace
      link_targets.reduce(string.dup) do |text, target|
        text.gsub(target, ' wslinkword ')
      end
    end

    # @return [String] a new string in which the periods inside each link
    #   are replaced by spaces
    def replace_split_at_period
      link_targets.each_with_object(string.dup) do |target, text|
        text.gsub!(target) { |match| match.split('.').join(' ') }
      end
    end

    private

    # True when +text+ matches the URI pattern, is not a bare "word:" token
    # and contains an http/https/www marker.
    def link?(text)
      return false if text !~ URI.regexp
      return false if text =~ NON_HYPERLINK_REGEX
      !(text =~ HYPERLINK_REGEX).nil?
    end

    # Literal substrings to substitute, one per whitespace-separated token
    # that qualifies as a link, in order of appearance.
    def link_targets
      string.split(/\s+/).select { |token| link?(token) }.map { |token| link_portion(token) }
    end

    # For a token containing '">' only the part before the first '">' is
    # targeted. If that part is empty (the token starts with '">') the whole
    # token is used instead: an empty search pattern would match between
    # every character and corrupt the entire string.
    def link_portion(token)
      return token unless token.include?('">')
      prefix = token.split('">').first
      prefix.nil? || prefix.empty? ? token : prefix
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module WordCountAnalyzer
  # Inspects a single token for hyphenation and reports how many words it
  # represents when split at its hyphens.
  class HyphenatedWord
    # Rubular: http://rubular.com/r/RjZ7qi0uFf
    DASHED_LINE_REGEX = /\s-{2,}(\s|$)|\A-{2,}(\s|$)/

    # Characters a token is split on.
    # NOTE(review): the class also contains ',' - confirm that splitting on
    # commas is intended and not a leftover list separator.
    SEPARATOR_REGEX = /[﹘,-]/

    attr_reader :token

    # @param token [String] the token to inspect; runs of two or more dashes
    #   standing alone are removed before it is stored
    def initialize(token:)
      @token = token.gsub(DASHED_LINE_REGEX, '')
    end

    # @return [Boolean] true when the token contains a hyphen and is not a
    #   hyperlink
    def hyphenated_word?
      (token.include?('-') || token.include?('﹘')) && !WordCountAnalyzer::Hyperlink.new(string: token).hyperlink?
    end

    # Number of words the token contributes when split at its separators.
    # Empty segments, produced by a leading separator or by consecutive
    # separators ("-foo", "foo--bar"), are not words and are left out.
    #
    # @return [Integer]
    def count_as_multiple
      token.split(SEPARATOR_REGEX).reject(&:empty?).length
    end

    # @return [String] the token with every separator replaced by a space
    def replace
      token.split(SEPARATOR_REGEX).join(' ')
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module WordCountAnalyzer
  # Locates numbers in text once dates and numbered-list markers have been
  # dealt with, and masks each number with the placeholder "wsnumword".
  class Number
    # Rubular: http://rubular.com/r/OGj82uEu8d
    # A match must begin at the start of the text or after whitespace and
    # must be followed by whitespace or by a period at the end of a line.
    NUMBER_REGEX = /(?<=\A)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)/

    attr_reader :string

    # Stores the text after dates have been masked and numbered-list markers
    # removed, so neither is later mistaken for a plain number.
    #
    # @param string [String] the raw text
    def initialize(string:)
      dates_masked = WordCountAnalyzer::Date.new(string: string).replace
      list_markers_removed = WordCountAnalyzer::NumberedList.new(string: dates_masked).replace
      @string = list_markers_removed
    end

    # @return [Boolean] true when the prepared text contains a number
    def includes_number?
      !(string =~ NUMBER_REGEX).nil?
    end

    # @return [String] the prepared text with each number replaced by
    #   " wsnumword "
    def replace
      string.gsub(NUMBER_REGEX) { ' wsnumword ' }
    end

    # @return [Integer] how many placeholders #replace produces
    def occurences
      masked = replace
      masked.scan('wsnumword').length
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module WordCountAnalyzer
  # Recognises numbered-list markers ("1.", "2.", ...) that form a running
  # sequence and can strip them from the text.
  class NumberedList
    # Rubular: http://rubular.com/r/RKmRH9Y4oO
    NUMBERED_LIST_REGEX = /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.\)|^\d{1,2}\.\)/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text has list-style markers and at
    #   least two of them belong to a sequence
    def includes_numbered_list?
      !(string !~ NUMBERED_LIST_REGEX) && has_at_least_two_items?
    end

    # Removes every marker that is part of a sequence and leaves the other
    # marker-like matches untouched.
    #
    # @return [String] a new string; the receiver's text is not modified
    def replace
      new_string = string.dup
      numbers = marker_numbers
      skips = 0
      numbers.each_index do |i|
        unless sequential_item?(numbers, i)
          skips += 1
          next
        end
        label = numbers[i].to_s
        # Markers removed on earlier iterations no longer match, so the
        # current marker's position among the remaining matches equals the
        # number of non-sequence markers skipped so far.
        new_string.gsub!(NUMBERED_LIST_REGEX).with_index do |match, index|
          (index.eql?(skips) && match.chomp('.').eql?(label)) ? '' : match
        end
      end
      new_string
    end

    # @return [Integer] number of markers that belong to a sequence
    def occurences
      count_list_items_in_array
    end

    private

    def has_at_least_two_items?
      count_list_items_in_array >= 2
    end

    def count_list_items_in_array
      numbers = marker_numbers
      numbers.each_index.count { |i| sequential_item?(numbers, i) }
    end

    # Integer value of every marker-like match, in order of appearance.
    def marker_numbers
      string.scan(NUMBERED_LIST_REGEX).map(&:to_i)
    end

    # True when the number at +index+ continues a sequence with a neighbour:
    # it is one less than the next number, one more than the previous one,
    # or part of a 9 -> 0 roll-over. The first element has no predecessor;
    # it is looked up explicitly because numbers[-1] would silently wrap
    # around to the LAST element and produce a false positive.
    def sequential_item?(numbers, index)
      current = numbers[index]
      following = numbers[index + 1]
      preceding = index.zero? ? nil : numbers[index - 1]
      (current + 1).eql?(following) ||
        (current - 1).eql?(preceding) ||
        (current.eql?(0) && preceding.eql?(9)) ||
        (current.eql?(9) && following.eql?(0))
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module WordCountAnalyzer
  # Counts and removes decorative punctuation: dotted lines, dashed lines,
  # underscore rules and isolated punctuation marks.
  class Punctuation
    # Rubular: http://rubular.com/r/ZVBsZVkiqC
    DOTTED_LINE_REGEX = /…{2,}|\.{5,}/

    # Rubular: http://rubular.com/r/RjZ7qi0uFf
    DASHED_LINE_REGEX = /(?<=\s)-{2,}(\s|$)|\A-{2,}(?=(\s|$))/

    # Rubular: http://rubular.com/r/hNofimZwdh
    UNDERSCORE_REGEX = /(?<=\s)_{2,}(\s|$)|\A_{2,}(?=(\s|$))/

    # Rubular: http://rubular.com/r/FexKxGUuIe
    STRAY_PUNCTUATION_REGEX = /(?<=\s)[[:punct:]](?=(\s|$))|(?<=\s)\|(?=(\s|$))/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Integer] number of dotted lines in the text
    def dotted_line_ocurrances
      tally(DOTTED_LINE_REGEX)
    end

    # @return [Integer] number of dashed lines in the text
    def dashed_line_ocurrances
      tally(DASHED_LINE_REGEX)
    end

    # @return [Integer] number of underscore rules in the text
    def underscore_ocurrances
      tally(UNDERSCORE_REGEX)
    end

    # @return [Integer] number of isolated punctuation marks in the text
    def stray_punctuation_occurences
      tally(STRAY_PUNCTUATION_REGEX)
    end

    # @return [String] the text with dotted lines removed
    def replace_dotted_line
      erase(DOTTED_LINE_REGEX)
    end

    # @return [String] the text with dashed lines removed
    def replace_dashed_line
      erase(DASHED_LINE_REGEX)
    end

    # @return [String] the text with underscore rules removed
    def replace_underscore
      erase(UNDERSCORE_REGEX)
    end

    # @return [String] the text with isolated punctuation marks removed
    def replace_stray_punctuation
      erase(STRAY_PUNCTUATION_REGEX)
    end

    private

    # Number of matches of +pattern+ in the text.
    def tally(pattern)
      string.scan(pattern).length
    end

    # Copy of the text with every match of +pattern+ deleted.
    def erase(pattern)
      string.gsub(pattern, '')
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects tokens containing forward slashes or backslashes and rewrites
  # them, after optionally masking hyperlinks, XHTML and dates first.
  class Slash
    # Rubular: http://rubular.com/r/AqvcH29sgg
    FORWARD_SLASH_REGEX = /(?<=\s)(\S+\/)+\S+|(?<=\A)(\S+\/)+\S+/

    # Rubular: http://rubular.com/r/tuFWtdMs4G
    BACKSLASH_REGEX = /\S+\\\S+/

    # Values of the :hyperlink option for which links are left in place.
    HYPERLINK_PASSTHROUGH = %w(no_special_treatment split_at_period).freeze

    attr_reader :string, :processed_string, :date, :xhtml, :hyperlink

    # @param string [String] the text to inspect
    # @param args [Hash] optional settings:
    #   :date      - 'no_special_treatment' leaves dates in place
    #   :xhtml     - 'keep' leaves XHTML in place
    #   :hyperlink - 'no_special_treatment' or 'split_at_period' leaves
    #                hyperlinks in place
    #   For any other value (including none) the corresponding class's
    #   #replace is applied before slashes are examined.
    def initialize(string:, **args)
      @string = string
      @date = args[:date] || nil
      @xhtml = args[:xhtml] || nil
      @hyperlink = args[:hyperlink] || nil
      @processed_string = preprocess(string)
    end

    # @return [Boolean] true when the processed text has a forward-slash token
    def includes_forward_slash?
      !(processed_string !~ FORWARD_SLASH_REGEX)
    end

    # @return [Boolean] true when the processed text has a backslash token
    def includes_backslash?
      !(processed_string !~ BACKSLASH_REGEX)
    end

    # @return [Integer] number of forward-slash tokens
    def forward_slash_occurences
      processed_string.scan(FORWARD_SLASH_REGEX).size
    end

    # Replaces the slashes inside each forward-slash token with spaces.
    # The substitution is made in place on #processed_string.
    #
    # @return [String] the processed text
    def replace_forward_slashes
      return processed_string if processed_string !~ FORWARD_SLASH_REGEX
      processed_string.gsub!(FORWARD_SLASH_REGEX) { |match| match.split(/\/+/).join(' ') }
    end

    # Like #replace_forward_slashes, but numeric dates are masked first so
    # their slashes are preserved inside the date placeholder.
    #
    # @return [String] always a string, even when the only slashes in the
    #   text belonged to dates
    def replace_forward_slashes_except_dates
      return processed_string if processed_string !~ FORWARD_SLASH_REGEX
      except_date_string = WordCountAnalyzer::Date.new(string: processed_string).replace_number_only_date
      # Non-destructive gsub: gsub! would return nil when nothing is left to
      # substitute after the dates have been masked.
      except_date_string.gsub(FORWARD_SLASH_REGEX) { |match| match.split(/\/+/).join(' ') }
    end

    # @return [Integer] number of backslash tokens
    def backslash_occurences
      processed_string.scan(BACKSLASH_REGEX).size
    end

    # Replaces each backslash token with one " word " per backslash-separated
    # segment. The substitution is made in place on #processed_string.
    #
    # @return [String] the processed text
    def replace_backslashes
      return processed_string if processed_string !~ BACKSLASH_REGEX
      processed_string.gsub!(BACKSLASH_REGEX) { |match| ' word ' * match.split(/\\+/).length }
    end

    private

    # Applies the enabled masking steps in a fixed order: hyperlinks, then
    # XHTML, then dates. Work starts from a copy of the input because the
    # replace_* methods modify #processed_string in place; without the copy
    # the caller's own string would be altered (or a frozen one would raise).
    def preprocess(text)
      result = text && text.dup
      result = WordCountAnalyzer::Hyperlink.new(string: result).replace unless HYPERLINK_PASSTHROUGH.include?(hyperlink)
      result = WordCountAnalyzer::Xhtml.new(string: result).replace unless xhtml.eql?('keep')
      result = WordCountAnalyzer::Date.new(string: result).replace unless date.eql?('no_special_treatment')
      result
    end
  end
end
|