word_count_analyzer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +554 -0
- data/Rakefile +2 -0
- data/lib/word_count_analyzer.rb +14 -0
- data/lib/word_count_analyzer/analyzer.rb +34 -0
- data/lib/word_count_analyzer/contraction.rb +176 -0
- data/lib/word_count_analyzer/counter.rb +230 -0
- data/lib/word_count_analyzer/date.rb +149 -0
- data/lib/word_count_analyzer/ellipsis.rb +48 -0
- data/lib/word_count_analyzer/hyperlink.rb +53 -0
- data/lib/word_count_analyzer/hyphenated_word.rb +23 -0
- data/lib/word_count_analyzer/number.rb +23 -0
- data/lib/word_count_analyzer/numbered_list.rb +61 -0
- data/lib/word_count_analyzer/punctuation.rb +52 -0
- data/lib/word_count_analyzer/slash.rb +84 -0
- data/lib/word_count_analyzer/version.rb +3 -0
- data/lib/word_count_analyzer/xhtml.rb +26 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/word_count_analyzer/analyzer_spec.rb +11 -0
- data/spec/word_count_analyzer/contraction_spec.rb +124 -0
- data/spec/word_count_analyzer/counter_spec.rb +647 -0
- data/spec/word_count_analyzer/date_spec.rb +257 -0
- data/spec/word_count_analyzer/ellipsis_spec.rb +69 -0
- data/spec/word_count_analyzer/hyperlink_spec.rb +77 -0
- data/spec/word_count_analyzer/hyphenated_word_spec.rb +81 -0
- data/spec/word_count_analyzer/number_spec.rb +63 -0
- data/spec/word_count_analyzer/numbered_list_spec.rb +69 -0
- data/spec/word_count_analyzer/punctuation_spec.rb +91 -0
- data/spec/word_count_analyzer/slash_spec.rb +105 -0
- data/spec/word_count_analyzer/xhtml_spec.rb +65 -0
- data/word_count_analyzer.gemspec +26 -0
- metadata +153 -0
@@ -0,0 +1,149 @@
|
|
1
|
+
module WordCountAnalyzer
  # Finds calendar dates in free text and swaps each one for the
  # ' wsdateword ' placeholder, so a date can be treated as a single token.
  #
  # Two families of dates are recognised:
  # * "long" dates built around an English month name (full or abbreviated),
  #   optionally preceded by a weekday name, e.g. "Monday, April 4th, 2011";
  # * numeric dates such as "04/04/2011", "2011-04-04" or "20110404".
  class Date
    DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
    DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
    MONTHS = %w(january february march april may june july august september october november december).freeze
    MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze

    # Rubular: http://rubular.com/r/73CZ2HU0q6
    DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}\.?/

    # Rubular: http://rubular.com/r/GWbuWXw4t0
    YMD_YDM_REGEX = /\d{4}(\/|\.|-)(\d{1,2}(\/|\.|-)){2}\.?/

    # Rubular: http://rubular.com/r/SRZ27XNlvR
    DIGIT_ONLY_YEAR_FIRST_REGEX = /[12]\d{7}\D\.?/

    # Rubular: http://rubular.com/r/mpVSeaKwdY
    DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D\.?/

    # Numeric patterns, in the order they are applied.
    NUMERIC_DATE_REGEXES = [
      DMY_MDY_REGEX,
      YMD_YDM_REGEX,
      DIGIT_ONLY_YEAR_FIRST_REGEX,
      DIGIT_ONLY_YEAR_LAST_REGEX
    ].freeze

    # Token substituted for every recognised date (padding spaces included).
    PLACEHOLDER = ' wsdateword '.freeze

    # Optional ordinal suffix after a day number ("1st", "2nd", "3rd", "4th").
    ORDINAL = '(?:st|nd|rd|th)*'.freeze

    # Optional run of separators between the parts of a compact date.
    SEPARATOR = '[-./]*'.freeze

    DAY_NAMES = (DOW + DOW_ABBR).join('|').freeze
    MONTH_NAMES = (MONTHS + MONTH_ABBR).join('|').freeze

    # Weekday + month + day + four-digit year, e.g. "Tues. Sept. 3rd, 2011."
    WEEKDAY_DATE_REGEX =
      /(?:#{DAY_NAMES})\.*,*\s(?:#{MONTH_NAMES})\.*\s\d+#{ORDINAL},*\s\d{4}\.?/i

    # Month-based patterns without a weekday. For every month name (full names
    # first, then abbreviations) the five shapes below are tried in order:
    #   "April 4th, 2011" / "2011 April 4th" / "2011-April-4" /
    #   "April 4th" / "04-April-2011" (or a two-digit year).
    MONTH_DATE_REGEXES = (MONTHS + MONTH_ABBR).flat_map do |month|
      [
        /#{month}\.*\s\d+#{ORDINAL},*\s\d{4}\.?/i,
        /\d{4}\.*\s#{month}\s\d+#{ORDINAL}\.?/i,
        /\d{4}#{SEPARATOR}#{month}#{SEPARATOR}\d+\.?/i,
        /#{month}\.*\s\d+#{ORDINAL}\.?/i,
        /\d{2}#{SEPARATOR}#{month}#{SEPARATOR}(?:\d{4}|\d{2})\.?/i
      ]
    end.freeze

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text contains a long or a numeric date
    def includes_date?
      long_date || number_only_date
    end

    # Returns a copy of the text with every recognised date replaced by the
    # placeholder. Weekday-qualified dates are handled first so the weekday is
    # consumed together with the rest of the date; the month-only and numeric
    # patterns then pick up whatever is left.
    #
    # @return [String]
    def replace
      without_weekday_dates = string.gsub(WEEKDAY_DATE_REGEX, PLACEHOLDER)
      without_long_dates = substitute_all(without_weekday_dates, MONTH_DATE_REGEXES)
      substitute_all(without_long_dates, NUMERIC_DATE_REGEXES)
    end

    # @return [Integer] number of placeholders present after #replace
    def occurences
      replace.scan(/wsdateword/).size
    end

    # Like #replace, but only the numeric date patterns are applied.
    #
    # @return [String]
    def replace_number_only_date
      substitute_all(string, NUMERIC_DATE_REGEXES)
    end

    private

    # Applies each regex in turn, replacing every match with the placeholder.
    def substitute_all(text, regexes)
      regexes.reduce(text) { |memo, regex| memo.gsub(regex, PLACEHOLDER) }
    end

    # True when any month-based pattern matches. Every weekday-qualified date
    # necessarily contains a "month day" fragment, so checking the month
    # patterns alone is sufficient here.
    def long_date
      MONTH_DATE_REGEXES.any? { |regex| string =~ regex }
    end

    # True when any numeric date pattern matches.
    def number_only_date
      NUMERIC_DATE_REGEXES.any? { |regex| string =~ regex }
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects ellipses ("...", "....", ". . .", "…") in a string and can swap
  # them for the ' wseword ' placeholder.
  class Ellipsis
    # Rubular: http://rubular.com/r/i60hCK81fz
    THREE_CONSECUTIVE_REGEX = /\.{3}(?=\s+[A-Z])/

    # Rubular: http://rubular.com/r/mfdtSeuIf2
    FOUR_CONSECUTIVE_REGEX = /(?<=[^\.])\.{3}\.(?=[^\.])/

    # Rubular: http://rubular.com/r/YBG1dIHTRu
    THREE_SPACE_REGEX = /(\s\.){3}\s/

    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FOUR_SPACE_REGEX = /(?<=[a-z])(\.\s){3}\.(\z|$|\n)/

    OTHER_THREE_PERIOD_REGEX = /[^\.]\.{3}[^\.]/

    UNICODE_ELLIPSIS = /(?<=[^…])…{1}(?=[^…])/

    # Every ellipsis pattern, in the order #replace applies them.
    ELLIPSIS_REGEXES = [
      THREE_CONSECUTIVE_REGEX,
      FOUR_CONSECUTIVE_REGEX,
      THREE_SPACE_REGEX,
      FOUR_SPACE_REGEX,
      OTHER_THREE_PERIOD_REGEX,
      UNICODE_ELLIPSIS
    ].freeze

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when at least one ellipsis pattern matches
    def includes_ellipsis?
      ELLIPSIS_REGEXES.any? { |regex| string =~ regex }
    end

    # Returns a copy of the text with each matched ellipsis replaced by
    # ' wseword '. Note that OTHER_THREE_PERIOD_REGEX also consumes the
    # character on either side of the three dots.
    #
    # @return [String]
    def replace
      ELLIPSIS_REGEXES.reduce(string) { |text, regex| text.gsub(regex, ' wseword ') }
    end

    # @return [Integer] number of stand-alone 'wseword' tokens after #replace
    def occurences
      replace.split(' ').count { |token| token.strip.eql?('wseword') }
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects hyperlinks in a string and can either replace them with the
  # ' wslinkword ' placeholder or break them apart at their periods.
  #
  # Relies on the standard library's URI module being loaded.
  class Hyperlink
    # A bare "word:" token (e.g. "Note:") looks like a URI scheme but is not a link.
    NON_HYPERLINK_REGEX = /\A\w+:$/

    # Rubular: http://rubular.com/r/fXa4lp0gfS
    HYPERLINK_REGEX = /(http|https|www)(\.|:)/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the whole string looks like it contains a link
    def hyperlink?
      link?(string)
    end

    # Counts URI matches that captured more than one component.
    #
    # @return [Integer]
    def occurences
      string.scan(uri_regex).count { |captures| captures.compact.size > 1 }
    end

    # Returns a copy of the text with every link token replaced by
    # ' wslinkword '.
    #
    # @return [String]
    def replace
      link_targets.reduce(string.dup) do |text, target|
        text.gsub(target, ' wslinkword ')
      end
    end

    # Returns a copy of the text in which the periods inside every link token
    # are turned into spaces (e.g. "www.example.com" -> "www example com").
    #
    # @return [String]
    def replace_split_at_period
      link_targets.reduce(string.dup) do |text, target|
        text.gsub(target) { |match| match.split('.').join(' ') }
      end
    end

    private

    # Generic absolute-URI pattern from the standard library. Uses the parser
    # directly because URI.regexp is marked obsolete.
    def uri_regex
      URI::DEFAULT_PARSER.make_regexp
    end

    # The three-part test shared by every public method: matches the generic
    # URI pattern, is not a bare "word:" token, and mentions http/https/www.
    def link?(text)
      !(text !~ uri_regex) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
    end

    # Literal substrings to substitute: one per whitespace-separated token that
    # passes #link?. When a token contains '">' only the part in front of it is
    # used. Empty pieces are skipped so that a token starting with '">' can
    # never produce an empty pattern (which would match between every
    # character of the text).
    def link_targets
      string.split(/\s+/).select { |token| link?(token) }.map do |token|
        next token unless token.include?('">')
        token.split('">').find { |part| !part.empty? }
      end.compact
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module WordCountAnalyzer
  # Wraps a single token and answers questions about the hyphens inside it.
  # Dashed lines ("--", "---", ...) are stripped from the token on creation.
  class HyphenatedWord
    # Rubular: http://rubular.com/r/RjZ7qi0uFf
    DASHED_LINE_REGEX = /\s-{2,}(\s|$)|\A-{2,}(\s|$)/

    attr_reader :token

    # @param token [String] the raw token; dashed lines are removed from it
    def initialize(token:)
      @token = token.gsub(DASHED_LINE_REGEX, '')
    end

    # @return [Boolean] true when the token contains '-' or '﹘' and is not a hyperlink
    def hyphenated_word?
      return false unless token.include?('-') || token.include?('﹘')
      !WordCountAnalyzer::Hyperlink.new(string: token).hyperlink?
    end

    # @return [Integer] how many pieces the token splits into
    def count_as_multiple
      pieces.length
    end

    # @return [String] the pieces of the token joined by single spaces
    def replace
      pieces.join(' ')
    end

    private

    # Splits the token at every '﹘', ',' or '-'.
    # NOTE(review): the comma is part of the character class, so commas split
    # the token too — confirm whether that is intended.
    def pieces
      token.split(/[﹘,-]/)
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects stand-alone numbers in a string. Dates and numbered-list markers
  # are removed from the text first (via Date#replace and
  # NumberedList#replace), so only what remains is examined.
  class Number
    # Rubular: http://rubular.com/r/OGj82uEu8d
    NUMBER_REGEX = /(?<=\A)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)/

    attr_reader :string

    # @param string [String] the raw text; stored after date and list-marker removal
    def initialize(string:)
      without_dates = WordCountAnalyzer::Date.new(string: string).replace
      @string = WordCountAnalyzer::NumberedList.new(string: without_dates).replace
    end

    # @return [Boolean] true when the pre-processed text contains a number
    def includes_number?
      !(string =~ NUMBER_REGEX).nil?
    end

    # @return [String] the pre-processed text with each number replaced by ' wsnumword '
    def replace
      string.gsub(NUMBER_REGEX, ' wsnumword ')
    end

    # @return [Integer] number of 'wsnumword' placeholders after #replace
    def occurences
      replaced = replace
      replaced.scan(/wsnumword/).length
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects numbered-list markers ("1.", "2.", ... or "1.)", "2.)", ...) in a
  # string. A number only counts as a list item when one of its neighbours in
  # the sequence of matched numbers continues the count.
  class NumberedList
    # Rubular: http://rubular.com/r/RKmRH9Y4oO
    NUMBERED_LIST_REGEX = /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.\)|^\d{1,2}\.\)/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text holds at least two list items
    def includes_numbered_list?
      !(string !~ NUMBERED_LIST_REGEX) && has_at_least_two_items?
    end

    # Returns a copy of the text with list-item markers removed. Markers that
    # are not part of a sequence are left in place.
    #
    # @return [String]
    def replace
      new_string = string.dup
      numbers = list_numbers
      skips = 0
      numbers.each_with_index do |number, position|
        unless sequential_item?(numbers, position)
          # Non-items stay in the text, shifting the match index of later items.
          skips += 1
          next
        end
        # Earlier items have already been deleted, so the current one is the
        # match whose index equals the number of markers skipped so far.
        new_string.gsub!(NUMBERED_LIST_REGEX).with_index do |match, index|
          if index.eql?(skips) && match.chomp('.').eql?(number.to_s)
            ''
          else
            match
          end
        end
      end
      new_string
    end

    # @return [Integer] how many matched numbers qualify as list items
    def occurences
      count_list_items_in_array
    end

    private

    def has_at_least_two_items?
      count_list_items_in_array >= 2
    end

    def count_list_items_in_array
      numbers = list_numbers
      numbers.each_index.count { |position| sequential_item?(numbers, position) }
    end

    # Integer value of every marker matched in the text, in order of appearance.
    def list_numbers
      string.scan(NUMBERED_LIST_REGEX).map(&:to_i)
    end

    # True when the number at +position+ continues a sequence with the number
    # after it or before it (including the 9 -> 0 roll-over). The first number
    # has no predecessor; a negative array index must not be used here because
    # it would wrap around to the last element.
    def sequential_item?(numbers, position)
      current = numbers[position]
      following = numbers[position + 1]
      preceding = position.zero? ? nil : numbers[position - 1]
      (current + 1).eql?(following) ||
        (current - 1).eql?(preceding) ||
        (current.eql?(0) && preceding.eql?(9)) ||
        (current.eql?(9) && following.eql?(0))
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module WordCountAnalyzer
  # Counts and removes punctuation-only runs in a string: dotted lines,
  # dashed lines, underscore lines and stray single punctuation marks.
  class Punctuation
    # Rubular: http://rubular.com/r/ZVBsZVkiqC
    DOTTED_LINE_REGEX = /…{2,}|\.{5,}/

    # Rubular: http://rubular.com/r/RjZ7qi0uFf
    DASHED_LINE_REGEX = /(?<=\s)-{2,}(\s|$)|\A-{2,}(?=(\s|$))/

    # Rubular: http://rubular.com/r/hNofimZwdh
    UNDERSCORE_REGEX = /(?<=\s)_{2,}(\s|$)|\A_{2,}(?=(\s|$))/

    # Rubular: http://rubular.com/r/FexKxGUuIe
    STRAY_PUNCTUATION_REGEX = /(?<=\s)[[:punct:]](?=(\s|$))|(?<=\s)\|(?=(\s|$))/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Integer] number of dotted-line matches
    def dotted_line_ocurrances
      tally(DOTTED_LINE_REGEX)
    end

    # @return [Integer] number of dashed-line matches
    def dashed_line_ocurrances
      tally(DASHED_LINE_REGEX)
    end

    # @return [Integer] number of underscore-line matches
    def underscore_ocurrances
      tally(UNDERSCORE_REGEX)
    end

    # @return [Integer] number of stray punctuation matches
    def stray_punctuation_occurences
      tally(STRAY_PUNCTUATION_REGEX)
    end

    # @return [String] the text with dotted lines removed
    def replace_dotted_line
      erase(DOTTED_LINE_REGEX)
    end

    # @return [String] the text with dashed lines removed
    def replace_dashed_line
      erase(DASHED_LINE_REGEX)
    end

    # @return [String] the text with underscore lines removed
    def replace_underscore
      erase(UNDERSCORE_REGEX)
    end

    # @return [String] the text with stray punctuation removed
    def replace_stray_punctuation
      erase(STRAY_PUNCTUATION_REGEX)
    end

    private

    # Number of matches of +pattern+ in the text.
    def tally(pattern)
      string.scan(pattern).size
    end

    # Copy of the text with every match of +pattern+ deleted.
    def erase(pattern)
      string.gsub(pattern, '')
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module WordCountAnalyzer
  # Detects tokens joined by forward slashes or backslashes and rewrites them.
  # Before any slash handling, the text can be pre-processed so that
  # hyperlinks, XHTML and dates are replaced first; the result is exposed as
  # #processed_string.
  class Slash
    # Rubular: http://rubular.com/r/AqvcH29sgg
    FORWARD_SLASH_REGEX = /(?<=\s)(\S+\/)+\S+|(?<=\A)(\S+\/)+\S+/

    # Rubular: http://rubular.com/r/tuFWtdMs4G
    BACKSLASH_REGEX = /\S+\\\S+/

    attr_reader :string, :processed_string, :date, :xhtml, :hyperlink

    # @param string [String] the text to inspect
    # @param args [Hash] optional :date, :xhtml and :hyperlink settings:
    #   * hyperlink: unless 'no_special_treatment' or 'split_at_period',
    #     Hyperlink#replace is applied;
    #   * xhtml: unless 'keep', Xhtml#replace is applied;
    #   * date: unless 'no_special_treatment', Date#replace is applied.
    #   The steps run in that order, each on the previous step's output.
    def initialize(string:, **args)
      @string = string
      @date = args[:date]
      @xhtml = args[:xhtml]
      @hyperlink = args[:hyperlink]
      @processed_string = preprocess(string)
    end

    # @return [Boolean] true when the processed text has a forward-slash token
    def includes_forward_slash?
      !(processed_string !~ FORWARD_SLASH_REGEX)
    end

    # @return [Boolean] true when the processed text has a backslash token
    def includes_backslash?
      !(processed_string !~ BACKSLASH_REGEX)
    end

    # @return [Integer] number of forward-slash tokens in the processed text
    def forward_slash_occurences
      processed_string.scan(FORWARD_SLASH_REGEX).size
    end

    # Returns the processed text with every forward-slash token split into
    # space-separated words. The processed text itself is left untouched.
    #
    # @return [String]
    def replace_forward_slashes
      split_forward_slashes(processed_string)
    end

    # Same as #replace_forward_slashes, except that numeric dates are replaced
    # (via Date#replace_number_only_date) before the slashes are split. Always
    # returns a String, even when the only slashes were inside dates.
    #
    # @return [String]
    def replace_forward_slashes_except_dates
      return processed_string if processed_string !~ FORWARD_SLASH_REGEX
      except_date_string = WordCountAnalyzer::Date.new(string: processed_string).replace_number_only_date
      split_forward_slashes(except_date_string)
    end

    # @return [Integer] number of backslash tokens in the processed text
    def backslash_occurences
      processed_string.scan(BACKSLASH_REGEX).size
    end

    # Returns the processed text with every backslash token replaced by one
    # ' word ' per backslash-separated part. The processed text itself is left
    # untouched.
    #
    # @return [String]
    def replace_backslashes
      processed_string.gsub(BACKSLASH_REGEX) do |match|
        ' word ' * match.split(/\\+/).length
      end
    end

    private

    # Applies the hyperlink, XHTML and date replacements selected by the
    # constructor options, in that order.
    def preprocess(text)
      text = WordCountAnalyzer::Hyperlink.new(string: text).replace unless hyperlink_left_alone?
      text = WordCountAnalyzer::Xhtml.new(string: text).replace unless xhtml.eql?('keep')
      text = WordCountAnalyzer::Date.new(string: text).replace unless date.eql?('no_special_treatment')
      text
    end

    def hyperlink_left_alone?
      hyperlink.eql?('no_special_treatment') || hyperlink.eql?('split_at_period')
    end

    # Non-mutating substitution: gsub (unlike gsub!) never returns nil and
    # never alters the receiver, which may be the caller's own string.
    def split_forward_slashes(text)
      text.gsub(FORWARD_SLASH_REGEX) { |match| match.split(/\/+/).join(' ') }
    end
  end
end
|