word_count_analyzer 0.0.14 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -6
- data/lib/word_count_analyzer/analyzer.rb +6 -5
- data/lib/word_count_analyzer/contraction.rb +1 -1
- data/lib/word_count_analyzer/counter.rb +15 -16
- data/lib/word_count_analyzer/date.rb +79 -106
- data/lib/word_count_analyzer/ellipsis.rb +10 -15
- data/lib/word_count_analyzer/hyperlink.rb +14 -25
- data/lib/word_count_analyzer/hyphenated_word.rb +1 -1
- data/lib/word_count_analyzer/number.rb +1 -1
- data/lib/word_count_analyzer/slash.rb +8 -7
- data/lib/word_count_analyzer/version.rb +1 -1
- data/spec/word_count_analyzer/counter_spec.rb +123 -160
- data/spec/word_count_analyzer/date_spec.rb +85 -85
- data/spec/word_count_analyzer/ellipsis_spec.rb +33 -33
- data/spec/word_count_analyzer/hyperlink_spec.rb +23 -23
- data/spec/word_count_analyzer/performance_spec.rb +46 -0
- data/word_count_analyzer.gemspec +1 -0
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5a101dde1b0e3db7728e7c17716ee5e4a3201e7
|
4
|
+
data.tar.gz: e16de9a391248d423b88d24c5e2a835a480f8623
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cda1823794f39e1b086e93ae62e10cb8db43f6aab8659988a5f391eb4b9af64c898f8224b7e7a4a0689838885c35078f9be4b639eaf237d0175dcd6183783f19
|
7
|
+
data.tar.gz: a0900c47c0a29afd32169bb35fddd1dbb96685c420def00d0b26f8440a0a8b71ddc9e5badfb8683096d7e6ddf475c9f66f3c023959a926a4e1696593c0398f73
|
data/README.md
CHANGED
@@ -49,7 +49,7 @@ Other gray areas not covered by this gem:
|
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 4. <html> Some HTML and a hyphenated-word</html>. Don't count stray punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
52
|
-
WordCountAnalyzer::Analyzer.new(text
|
52
|
+
WordCountAnalyzer::Analyzer.new.analyze(text)
|
53
53
|
|
54
54
|
# => {
|
55
55
|
# "ellipsis": 1,
|
@@ -74,22 +74,21 @@ WordCountAnalyzer::Analyzer.new(text: text).analyze
|
|
74
74
|
```ruby
|
75
75
|
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list \n\n1. item a \n\n2. item b \n\n3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
76
76
|
|
77
|
-
WordCountAnalyzer::Counter.new(text
|
77
|
+
WordCountAnalyzer::Counter.new.count(text)
|
78
78
|
# => 64
|
79
79
|
|
80
80
|
# Overrides all settings to match the way Pages handles word count.
|
81
81
|
# N.B. The developers of Pages may change the algorithm at any time so this should just be as an approximation.
|
82
|
-
WordCountAnalyzer::Counter.new(text
|
82
|
+
WordCountAnalyzer::Counter.new.pages_count(text)
|
83
83
|
# => 76 (or 79 if the list items are not formatted as a list)
|
84
84
|
|
85
85
|
# Overrides all settings to match the way Microsoft Word and wc (Unix) handle word count.
|
86
86
|
# N.B. The developers of these tools may change the algorithm at any time so this should just be as an approximation.
|
87
|
-
WordCountAnalyzer::Counter.new(text
|
87
|
+
WordCountAnalyzer::Counter.new.mword_count(text)
|
88
88
|
# => 71
|
89
89
|
|
90
90
|
# Highly configurable (see all options below)
|
91
91
|
WordCountAnalyzer::Counter.new(
|
92
|
-
text: text,
|
93
92
|
ellipsis: 'no_special_treatment',
|
94
93
|
hyperlink: 'no_special_treatment',
|
95
94
|
contraction: 'count_as_multiple',
|
@@ -104,7 +103,7 @@ WordCountAnalyzer::Counter.new(
|
|
104
103
|
dashed_line: 'count',
|
105
104
|
underscore: 'count',
|
106
105
|
stray_punctuation: 'count'
|
107
|
-
).count
|
106
|
+
).count(text)
|
108
107
|
|
109
108
|
# => 77
|
110
109
|
```
|
@@ -1,23 +1,24 @@
|
|
1
1
|
module WordCountAnalyzer
|
2
2
|
class Analyzer
|
3
|
-
attr_reader :text
|
3
|
+
attr_reader :text, :tagger
|
4
4
|
def initialize(text:)
|
5
5
|
@text = text
|
6
|
+
@tagger = EngTagger.new
|
6
7
|
end
|
7
8
|
|
8
9
|
def analyze
|
9
10
|
analysis = {}
|
10
|
-
analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new(
|
11
|
+
analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new.occurences(text)
|
11
12
|
contraction_count = 0
|
12
13
|
hyphenated_word_count = 0
|
13
14
|
WordCountAnalyzer::Xhtml.new(string: text).replace.split(/\s+/).each_with_index do |token, index|
|
14
|
-
contraction_count += 1 if WordCountAnalyzer::Contraction.new(token: token, following_token: text.split(/\s+/)[index + 1], tgr:
|
15
|
+
contraction_count += 1 if WordCountAnalyzer::Contraction.new(token: token, following_token: text.split(/\s+/)[index + 1], tgr: tagger, hyphen: 'single').contraction?
|
15
16
|
hyphenated_word_count += 1 if WordCountAnalyzer::HyphenatedWord.new(token: token).hyphenated_word?
|
16
17
|
end
|
17
|
-
analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new(
|
18
|
+
analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new.occurences(text)
|
18
19
|
analysis['contraction'] = contraction_count
|
19
20
|
analysis['hyphenated_word'] = hyphenated_word_count
|
20
|
-
analysis['date'] = WordCountAnalyzer::Date.new(
|
21
|
+
analysis['date'] = WordCountAnalyzer::Date.new.occurences(text)
|
21
22
|
analysis['number'] = WordCountAnalyzer::Number.new(string: text).occurences
|
22
23
|
analysis['numbered_list'] = WordCountAnalyzer::NumberedList.new(string: text).occurences
|
23
24
|
analysis['xhtml'] = WordCountAnalyzer::Xhtml.new(string: text).occurences
|
@@ -81,7 +81,7 @@ module WordCountAnalyzer
|
|
81
81
|
"jack-o'-lantern" => "jack-of-the-lantern",
|
82
82
|
"will-o'-the-wisp" => "will-of-the-wisp",
|
83
83
|
"'twas" => "it was"
|
84
|
-
}
|
84
|
+
}.freeze
|
85
85
|
|
86
86
|
attr_reader :token, :following_token, :tgr, :hyphen
|
87
87
|
def initialize(token:, following_token:, tgr:, **args)
|
@@ -1,8 +1,7 @@
|
|
1
1
|
module WordCountAnalyzer
|
2
2
|
class Counter
|
3
|
-
attr_reader :
|
4
|
-
def initialize(
|
5
|
-
@text = text
|
3
|
+
attr_reader :ellipsis, :hyperlink, :contraction, :hyphenated_word, :date, :number, :numbered_list, :xhtml, :forward_slash, :backslash, :dotted_line, :dashed_line, :underscore, :stray_punctuation, :equal_sign
|
4
|
+
def initialize(**args)
|
6
5
|
@ellipsis = args[:ellipsis] || 'ignore'
|
7
6
|
@hyperlink = args[:hyperlink] || 'count_as_one'
|
8
7
|
@contraction = args[:contraction] || 'count_as_one'
|
@@ -18,13 +17,14 @@ module WordCountAnalyzer
|
|
18
17
|
@underscore = args[:underscore] || 'ignore'
|
19
18
|
@stray_punctuation = args[:stray_punctuation] || 'ignore'
|
20
19
|
@equal_sign = 'ignore'
|
20
|
+
@tgr = EngTagger.new
|
21
21
|
end
|
22
22
|
|
23
|
-
def count
|
24
|
-
word_count
|
23
|
+
def count(text)
|
24
|
+
word_count(text)
|
25
25
|
end
|
26
26
|
|
27
|
-
def pages_count
|
27
|
+
def pages_count(text)
|
28
28
|
@ellipsis = 'ignore'
|
29
29
|
@hyperlink = 'split_at_period'
|
30
30
|
@contraction = 'count_as_one'
|
@@ -40,10 +40,10 @@ module WordCountAnalyzer
|
|
40
40
|
@underscore = 'ignore'
|
41
41
|
@stray_punctuation = 'ignore'
|
42
42
|
@equal_sign = 'break'
|
43
|
-
word_count
|
43
|
+
word_count(text)
|
44
44
|
end
|
45
45
|
|
46
|
-
def mword_count
|
46
|
+
def mword_count(text)
|
47
47
|
@ellipsis = 'no_special_treatment'
|
48
48
|
@hyperlink = 'count_as_one'
|
49
49
|
@contraction = 'count_as_one'
|
@@ -58,16 +58,15 @@ module WordCountAnalyzer
|
|
58
58
|
@dashed_line = 'count'
|
59
59
|
@underscore = 'count'
|
60
60
|
@stray_punctuation = 'count'
|
61
|
-
word_count
|
61
|
+
word_count(text)
|
62
62
|
end
|
63
63
|
|
64
64
|
private
|
65
65
|
|
66
|
-
def word_count
|
67
|
-
tgr = EngTagger.new
|
66
|
+
def word_count(text)
|
68
67
|
processed_text = process_ellipsis(text)
|
69
68
|
processed_text = process_hyperlink(processed_text)
|
70
|
-
processed_text = process_contraction(processed_text, tgr)
|
69
|
+
processed_text = process_contraction(processed_text, @tgr)
|
71
70
|
processed_text = process_date(processed_text)
|
72
71
|
processed_text = process_number(processed_text)
|
73
72
|
processed_text = process_number_list(processed_text)
|
@@ -85,7 +84,7 @@ module WordCountAnalyzer
|
|
85
84
|
|
86
85
|
def process_ellipsis(txt)
|
87
86
|
if ellipsis.eql?('ignore')
|
88
|
-
WordCountAnalyzer::Ellipsis.new(
|
87
|
+
WordCountAnalyzer::Ellipsis.new.replace(txt).gsub(/wseword/, '')
|
89
88
|
elsif ellipsis.eql?('no_special_treatment')
|
90
89
|
txt
|
91
90
|
else
|
@@ -96,9 +95,9 @@ module WordCountAnalyzer
|
|
96
95
|
def process_hyperlink(txt)
|
97
96
|
case
|
98
97
|
when hyperlink.eql?('count_as_one')
|
99
|
-
WordCountAnalyzer::Hyperlink.new(
|
98
|
+
WordCountAnalyzer::Hyperlink.new.replace(txt)
|
100
99
|
when hyperlink.eql?('split_at_period')
|
101
|
-
WordCountAnalyzer::Hyperlink.new(
|
100
|
+
WordCountAnalyzer::Hyperlink.new.replace_split_at_period(txt)
|
102
101
|
when hyperlink.eql?('no_special_treatment')
|
103
102
|
txt
|
104
103
|
else
|
@@ -131,7 +130,7 @@ module WordCountAnalyzer
|
|
131
130
|
if date.eql?('no_special_treatment')
|
132
131
|
txt
|
133
132
|
elsif date.eql?('count_as_one')
|
134
|
-
WordCountAnalyzer::Date.new(
|
133
|
+
WordCountAnalyzer::Date.new.replace(txt)
|
135
134
|
else
|
136
135
|
raise 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`'
|
137
136
|
end
|
@@ -16,134 +16,107 @@ module WordCountAnalyzer
|
|
16
16
|
# Rubular: http://rubular.com/r/mpVSeaKwdY
|
17
17
|
DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D\.?/
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
@string = string
|
19
|
+
def includes_date?(text)
|
20
|
+
includes_long_date?(text) || includes_number_only_date?(text)
|
22
21
|
end
|
23
22
|
|
24
|
-
def
|
25
|
-
|
23
|
+
def replace(text)
|
24
|
+
counter = 0
|
25
|
+
DOW_ABBR.map { |day| counter +=1 if text.include?('day') }
|
26
|
+
text = redact_dates(counter, text)
|
27
|
+
redact_regex(text)
|
26
28
|
end
|
27
29
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
30
|
+
def occurences(text)
|
31
|
+
replace(text).scan(/wsdateword/).size
|
32
|
+
end
|
33
|
+
|
34
|
+
def replace_number_only_date(text)
|
35
|
+
text.gsub(DMY_MDY_REGEX, ' wsdateword ')
|
36
|
+
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
37
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
38
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def redact_dates(counter, text)
|
34
44
|
if counter > 0
|
35
|
-
|
36
|
-
|
37
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
38
|
-
end
|
39
|
-
MONTH_ABBR.each do |month|
|
40
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
41
|
-
end
|
42
|
-
end
|
43
|
-
DOW.each do |day|
|
44
|
-
MONTHS.each do |month|
|
45
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
46
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
47
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
48
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ')
|
49
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
50
|
-
.gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ')
|
51
|
-
end
|
52
|
-
MONTH_ABBR.each do |month|
|
53
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
54
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
55
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
56
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ')
|
57
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
58
|
-
.gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ')
|
59
|
-
end
|
60
|
-
end
|
45
|
+
text = redact_dow_abbr(text)
|
46
|
+
text = redact_dow(text)
|
61
47
|
else
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
48
|
+
text = redact_dow(text)
|
49
|
+
text = redact_dow_abbr(text)
|
50
|
+
end
|
51
|
+
text
|
52
|
+
end
|
53
|
+
|
54
|
+
def redact_regex(text)
|
55
|
+
text.gsub(DMY_MDY_REGEX, ' wsdateword ')
|
56
|
+
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
57
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
58
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
59
|
+
end
|
60
|
+
|
61
|
+
def redact_dow(text)
|
62
|
+
DOW.each do |day|
|
63
|
+
MONTHS.map { |month| text = redact_date(text, day, month) }
|
64
|
+
MONTH_ABBR.map { |month| text = redact_date(text, day, month) }
|
65
|
+
end
|
66
|
+
text
|
67
|
+
end
|
68
|
+
|
69
|
+
def redact_dow_abbr(text)
|
70
|
+
DOW_ABBR.each do |day|
|
71
|
+
MONTHS.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') }
|
72
|
+
MONTH_ABBR.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') }
|
73
|
+
end
|
74
|
+
text
|
75
|
+
end
|
76
|
+
|
77
|
+
def redact_date(text, day, month)
|
78
|
+
text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
73
79
|
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
74
80
|
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
75
81
|
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ')
|
76
82
|
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
77
83
|
.gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ')
|
78
|
-
end
|
79
|
-
end
|
80
|
-
DOW_ABBR.each do |day|
|
81
|
-
MONTHS.each do |month|
|
82
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
83
|
-
end
|
84
|
-
MONTH_ABBR.each do |month|
|
85
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
new_string = new_string.gsub(DMY_MDY_REGEX, ' wsdateword ')
|
90
|
-
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
91
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
92
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
93
|
-
end
|
94
|
-
|
95
|
-
def occurences
|
96
|
-
replace.scan(/wsdateword/).size
|
97
84
|
end
|
98
85
|
|
99
|
-
def
|
100
|
-
|
101
|
-
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
102
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
103
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
86
|
+
def includes_long_date?(text)
|
87
|
+
includes_long_date_1?(text) || includes_long_date_2?(text)
|
104
88
|
end
|
105
89
|
|
106
|
-
|
107
|
-
|
108
|
-
def long_date
|
109
|
-
match_found = false
|
90
|
+
def includes_long_date_1?(text)
|
110
91
|
DOW.each do |day|
|
111
|
-
MONTHS.
|
112
|
-
|
113
|
-
match_found = check_for_matches(day, month)
|
114
|
-
end
|
115
|
-
MONTH_ABBR.each do |month|
|
116
|
-
break if match_found
|
117
|
-
match_found = check_for_matches(day, month)
|
118
|
-
end
|
92
|
+
MONTHS.map { |month| return true if check_for_matches(day, month, text) }
|
93
|
+
MONTH_ABBR.map { |month| return true if check_for_matches(day, month, text) }
|
119
94
|
end
|
95
|
+
false
|
96
|
+
end
|
97
|
+
|
98
|
+
def includes_long_date_2?(text)
|
120
99
|
DOW_ABBR.each do |day|
|
121
|
-
MONTHS.
|
122
|
-
|
123
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
124
|
-
end
|
125
|
-
MONTH_ABBR.each do |month|
|
126
|
-
break if match_found
|
127
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
128
|
-
end
|
100
|
+
MONTHS.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
101
|
+
MONTH_ABBR.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
129
102
|
end
|
130
|
-
|
103
|
+
false
|
131
104
|
end
|
132
105
|
|
133
|
-
def
|
134
|
-
!(
|
135
|
-
!(
|
136
|
-
!(
|
137
|
-
!(
|
106
|
+
def includes_number_only_date?(text)
|
107
|
+
!(text !~ DMY_MDY_REGEX) ||
|
108
|
+
!(text !~ YMD_YDM_REGEX) ||
|
109
|
+
!(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
|
110
|
+
!(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
|
138
111
|
end
|
139
112
|
|
140
|
-
def check_for_matches(day, month)
|
141
|
-
!(
|
142
|
-
!(
|
143
|
-
!(
|
144
|
-
!(
|
145
|
-
!(
|
146
|
-
!(
|
113
|
+
def check_for_matches(day, month, text)
|
114
|
+
!(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
115
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
116
|
+
!(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
|
117
|
+
!(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
|
118
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
|
119
|
+
!(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
|
147
120
|
end
|
148
121
|
end
|
149
|
-
end
|
122
|
+
end
|
@@ -13,30 +13,25 @@ module WordCountAnalyzer
|
|
13
13
|
|
14
14
|
UNICODE_ELLIPSIS = /(?<=[^…]|\A)…{1}(?=[^…]|$)/
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
def includes_ellipsis?(text)
|
17
|
+
!(text !~ FOUR_CONSECUTIVE_REGEX) ||
|
18
|
+
!(text !~ THREE_SPACE_REGEX) ||
|
19
|
+
!(text !~ FOUR_SPACE_REGEX) ||
|
20
|
+
!(text !~ OTHER_THREE_PERIOD_REGEX) ||
|
21
|
+
!(text !~ UNICODE_ELLIPSIS)
|
19
22
|
end
|
20
23
|
|
21
|
-
def
|
22
|
-
|
23
|
-
!(string !~ THREE_SPACE_REGEX) ||
|
24
|
-
!(string !~ FOUR_SPACE_REGEX) ||
|
25
|
-
!(string !~ OTHER_THREE_PERIOD_REGEX) ||
|
26
|
-
!(string !~ UNICODE_ELLIPSIS)
|
27
|
-
end
|
28
|
-
|
29
|
-
def replace
|
30
|
-
string.gsub(FOUR_CONSECUTIVE_REGEX, ' wseword ')
|
24
|
+
def replace(text)
|
25
|
+
text.gsub(FOUR_CONSECUTIVE_REGEX, ' wseword ')
|
31
26
|
.gsub(THREE_SPACE_REGEX, ' wseword ')
|
32
27
|
.gsub(FOUR_SPACE_REGEX, ' wseword ')
|
33
28
|
.gsub(OTHER_THREE_PERIOD_REGEX, ' wseword ')
|
34
29
|
.gsub(UNICODE_ELLIPSIS, ' wseword ')
|
35
30
|
end
|
36
31
|
|
37
|
-
def occurences
|
32
|
+
def occurences(text)
|
38
33
|
count = 0
|
39
|
-
replace.split(' ').map { |token| count += 1 if token.strip.eql?('wseword') }
|
34
|
+
replace(text).split(' ').map { |token| count += 1 if token.strip.eql?('wseword') }
|
40
35
|
count
|
41
36
|
end
|
42
37
|
end
|
@@ -7,49 +7,38 @@ module WordCountAnalyzer
|
|
7
7
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
8
8
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
@string = string
|
10
|
+
def hyperlink?(text)
|
11
|
+
!(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
|
13
12
|
end
|
14
13
|
|
15
|
-
def
|
16
|
-
|
14
|
+
def occurences(text)
|
15
|
+
text.scan(URI.regexp).map { |link| link.compact.size > 1 ? 1 : 0 }.inject(0) { |sum, x| sum + x }
|
17
16
|
end
|
18
17
|
|
19
|
-
def
|
20
|
-
|
21
|
-
string.scan(URI.regexp).each do |link|
|
22
|
-
counter += 1 if link.compact.size > 1
|
23
|
-
end
|
24
|
-
counter
|
25
|
-
end
|
26
|
-
|
27
|
-
def replace
|
28
|
-
new_string = string.dup
|
29
|
-
string.split(/\s+/).each do |token|
|
18
|
+
def replace(text)
|
19
|
+
text.split(/\s+/).each do |token|
|
30
20
|
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
31
|
-
|
21
|
+
text = text.gsub(/#{Regexp.escape(token.split('">')[0])}/, ' wslinkword ')
|
32
22
|
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
33
|
-
|
23
|
+
text = text.gsub(/#{Regexp.escape(token)}/, ' wslinkword ')
|
34
24
|
end
|
35
25
|
end
|
36
|
-
|
26
|
+
text
|
37
27
|
end
|
38
28
|
|
39
|
-
def replace_split_at_period
|
40
|
-
|
41
|
-
string.split(/\s+/).each do |token|
|
29
|
+
def replace_split_at_period(text)
|
30
|
+
text.split(/\s+/).each do |token|
|
42
31
|
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
43
|
-
|
32
|
+
text.gsub!(/#{Regexp.escape(token.split('">')[0])}/) do |match|
|
44
33
|
match.split('.').join(' ')
|
45
34
|
end
|
46
35
|
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
47
|
-
|
36
|
+
text.gsub!(/#{Regexp.escape(token)}/) do |match|
|
48
37
|
match.split('.').join(' ')
|
49
38
|
end
|
50
39
|
end
|
51
40
|
end
|
52
|
-
|
41
|
+
text
|
53
42
|
end
|
54
43
|
end
|
55
44
|
end
|