word_count_analyzer 0.0.14 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -6
- data/lib/word_count_analyzer/analyzer.rb +6 -5
- data/lib/word_count_analyzer/contraction.rb +1 -1
- data/lib/word_count_analyzer/counter.rb +15 -16
- data/lib/word_count_analyzer/date.rb +79 -106
- data/lib/word_count_analyzer/ellipsis.rb +10 -15
- data/lib/word_count_analyzer/hyperlink.rb +14 -25
- data/lib/word_count_analyzer/hyphenated_word.rb +1 -1
- data/lib/word_count_analyzer/number.rb +1 -1
- data/lib/word_count_analyzer/slash.rb +8 -7
- data/lib/word_count_analyzer/version.rb +1 -1
- data/spec/word_count_analyzer/counter_spec.rb +123 -160
- data/spec/word_count_analyzer/date_spec.rb +85 -85
- data/spec/word_count_analyzer/ellipsis_spec.rb +33 -33
- data/spec/word_count_analyzer/hyperlink_spec.rb +23 -23
- data/spec/word_count_analyzer/performance_spec.rb +46 -0
- data/word_count_analyzer.gemspec +1 -0
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5a101dde1b0e3db7728e7c17716ee5e4a3201e7
|
4
|
+
data.tar.gz: e16de9a391248d423b88d24c5e2a835a480f8623
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cda1823794f39e1b086e93ae62e10cb8db43f6aab8659988a5f391eb4b9af64c898f8224b7e7a4a0689838885c35078f9be4b639eaf237d0175dcd6183783f19
|
7
|
+
data.tar.gz: a0900c47c0a29afd32169bb35fddd1dbb96685c420def00d0b26f8440a0a8b71ddc9e5badfb8683096d7e6ddf475c9f66f3c023959a926a4e1696593c0398f73
|
data/README.md
CHANGED
@@ -49,7 +49,7 @@ Other gray areas not covered by this gem:
|
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 4. <html> Some HTML and a hyphenated-word</html>. Don't count stray punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
52
|
-
WordCountAnalyzer::Analyzer.new(text
|
52
|
+
WordCountAnalyzer::Analyzer.new.analyze(text)
|
53
53
|
|
54
54
|
# => {
|
55
55
|
# "ellipsis": 1,
|
@@ -74,22 +74,21 @@ WordCountAnalyzer::Analyzer.new(text: text).analyze
|
|
74
74
|
```ruby
|
75
75
|
text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list \n\n1. item a \n\n2. item b \n\n3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
|
76
76
|
|
77
|
-
WordCountAnalyzer::Counter.new(text
|
77
|
+
WordCountAnalyzer::Counter.new.count(text)
|
78
78
|
# => 64
|
79
79
|
|
80
80
|
# Overrides all settings to match the way Pages handles word count.
|
81
81
|
# N.B. The developers of Pages may change the algorithm at any time, so this should be treated only as an approximation.
|
82
|
-
WordCountAnalyzer::Counter.new(text
|
82
|
+
WordCountAnalyzer::Counter.new.pages_count(text)
|
83
83
|
# => 76 (or 79 if the list items are not formatted as a list)
|
84
84
|
|
85
85
|
# Overrides all settings to match the way Microsoft Word and wc (Unix) handle word count.
|
86
86
|
# N.B. The developers of these tools may change the algorithm at any time, so this should be treated only as an approximation.
|
87
|
-
WordCountAnalyzer::Counter.new(text
|
87
|
+
WordCountAnalyzer::Counter.new.mword_count(text)
|
88
88
|
# => 71
|
89
89
|
|
90
90
|
# Highly configurable (see all options below)
|
91
91
|
WordCountAnalyzer::Counter.new(
|
92
|
-
text: text,
|
93
92
|
ellipsis: 'no_special_treatment',
|
94
93
|
hyperlink: 'no_special_treatment',
|
95
94
|
contraction: 'count_as_multiple',
|
@@ -104,7 +103,7 @@ WordCountAnalyzer::Counter.new(
|
|
104
103
|
dashed_line: 'count',
|
105
104
|
underscore: 'count',
|
106
105
|
stray_punctuation: 'count'
|
107
|
-
).count
|
106
|
+
).count(text)
|
108
107
|
|
109
108
|
# => 77
|
110
109
|
```
|
@@ -1,23 +1,24 @@
|
|
1
1
|
module WordCountAnalyzer
|
2
2
|
class Analyzer
|
3
|
-
attr_reader :text
|
3
|
+
attr_reader :text, :tagger
|
4
4
|
def initialize(text:)
|
5
5
|
@text = text
|
6
|
+
@tagger = EngTagger.new
|
6
7
|
end
|
7
8
|
|
8
9
|
def analyze
|
9
10
|
analysis = {}
|
10
|
-
analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new(
|
11
|
+
analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new.occurences(text)
|
11
12
|
contraction_count = 0
|
12
13
|
hyphenated_word_count = 0
|
13
14
|
WordCountAnalyzer::Xhtml.new(string: text).replace.split(/\s+/).each_with_index do |token, index|
|
14
|
-
contraction_count += 1 if WordCountAnalyzer::Contraction.new(token: token, following_token: text.split(/\s+/)[index + 1], tgr:
|
15
|
+
contraction_count += 1 if WordCountAnalyzer::Contraction.new(token: token, following_token: text.split(/\s+/)[index + 1], tgr: tagger, hyphen: 'single').contraction?
|
15
16
|
hyphenated_word_count += 1 if WordCountAnalyzer::HyphenatedWord.new(token: token).hyphenated_word?
|
16
17
|
end
|
17
|
-
analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new(
|
18
|
+
analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new.occurences(text)
|
18
19
|
analysis['contraction'] = contraction_count
|
19
20
|
analysis['hyphenated_word'] = hyphenated_word_count
|
20
|
-
analysis['date'] = WordCountAnalyzer::Date.new(
|
21
|
+
analysis['date'] = WordCountAnalyzer::Date.new.occurences(text)
|
21
22
|
analysis['number'] = WordCountAnalyzer::Number.new(string: text).occurences
|
22
23
|
analysis['numbered_list'] = WordCountAnalyzer::NumberedList.new(string: text).occurences
|
23
24
|
analysis['xhtml'] = WordCountAnalyzer::Xhtml.new(string: text).occurences
|
@@ -81,7 +81,7 @@ module WordCountAnalyzer
|
|
81
81
|
"jack-o'-lantern" => "jack-of-the-lantern",
|
82
82
|
"will-o'-the-wisp" => "will-of-the-wisp",
|
83
83
|
"'twas" => "it was"
|
84
|
-
}
|
84
|
+
}.freeze
|
85
85
|
|
86
86
|
attr_reader :token, :following_token, :tgr, :hyphen
|
87
87
|
def initialize(token:, following_token:, tgr:, **args)
|
@@ -1,8 +1,7 @@
|
|
1
1
|
module WordCountAnalyzer
|
2
2
|
class Counter
|
3
|
-
attr_reader :
|
4
|
-
def initialize(
|
5
|
-
@text = text
|
3
|
+
attr_reader :ellipsis, :hyperlink, :contraction, :hyphenated_word, :date, :number, :numbered_list, :xhtml, :forward_slash, :backslash, :dotted_line, :dashed_line, :underscore, :stray_punctuation, :equal_sign
|
4
|
+
def initialize(**args)
|
6
5
|
@ellipsis = args[:ellipsis] || 'ignore'
|
7
6
|
@hyperlink = args[:hyperlink] || 'count_as_one'
|
8
7
|
@contraction = args[:contraction] || 'count_as_one'
|
@@ -18,13 +17,14 @@ module WordCountAnalyzer
|
|
18
17
|
@underscore = args[:underscore] || 'ignore'
|
19
18
|
@stray_punctuation = args[:stray_punctuation] || 'ignore'
|
20
19
|
@equal_sign = 'ignore'
|
20
|
+
@tgr = EngTagger.new
|
21
21
|
end
|
22
22
|
|
23
|
-
def count
|
24
|
-
word_count
|
23
|
+
def count(text)
|
24
|
+
word_count(text)
|
25
25
|
end
|
26
26
|
|
27
|
-
def pages_count
|
27
|
+
def pages_count(text)
|
28
28
|
@ellipsis = 'ignore'
|
29
29
|
@hyperlink = 'split_at_period'
|
30
30
|
@contraction = 'count_as_one'
|
@@ -40,10 +40,10 @@ module WordCountAnalyzer
|
|
40
40
|
@underscore = 'ignore'
|
41
41
|
@stray_punctuation = 'ignore'
|
42
42
|
@equal_sign = 'break'
|
43
|
-
word_count
|
43
|
+
word_count(text)
|
44
44
|
end
|
45
45
|
|
46
|
-
def mword_count
|
46
|
+
def mword_count(text)
|
47
47
|
@ellipsis = 'no_special_treatment'
|
48
48
|
@hyperlink = 'count_as_one'
|
49
49
|
@contraction = 'count_as_one'
|
@@ -58,16 +58,15 @@ module WordCountAnalyzer
|
|
58
58
|
@dashed_line = 'count'
|
59
59
|
@underscore = 'count'
|
60
60
|
@stray_punctuation = 'count'
|
61
|
-
word_count
|
61
|
+
word_count(text)
|
62
62
|
end
|
63
63
|
|
64
64
|
private
|
65
65
|
|
66
|
-
def word_count
|
67
|
-
tgr = EngTagger.new
|
66
|
+
def word_count(text)
|
68
67
|
processed_text = process_ellipsis(text)
|
69
68
|
processed_text = process_hyperlink(processed_text)
|
70
|
-
processed_text = process_contraction(processed_text, tgr)
|
69
|
+
processed_text = process_contraction(processed_text, @tgr)
|
71
70
|
processed_text = process_date(processed_text)
|
72
71
|
processed_text = process_number(processed_text)
|
73
72
|
processed_text = process_number_list(processed_text)
|
@@ -85,7 +84,7 @@ module WordCountAnalyzer
|
|
85
84
|
|
86
85
|
def process_ellipsis(txt)
|
87
86
|
if ellipsis.eql?('ignore')
|
88
|
-
WordCountAnalyzer::Ellipsis.new(
|
87
|
+
WordCountAnalyzer::Ellipsis.new.replace(txt).gsub(/wseword/, '')
|
89
88
|
elsif ellipsis.eql?('no_special_treatment')
|
90
89
|
txt
|
91
90
|
else
|
@@ -96,9 +95,9 @@ module WordCountAnalyzer
|
|
96
95
|
def process_hyperlink(txt)
|
97
96
|
case
|
98
97
|
when hyperlink.eql?('count_as_one')
|
99
|
-
WordCountAnalyzer::Hyperlink.new(
|
98
|
+
WordCountAnalyzer::Hyperlink.new.replace(txt)
|
100
99
|
when hyperlink.eql?('split_at_period')
|
101
|
-
WordCountAnalyzer::Hyperlink.new(
|
100
|
+
WordCountAnalyzer::Hyperlink.new.replace_split_at_period(txt)
|
102
101
|
when hyperlink.eql?('no_special_treatment')
|
103
102
|
txt
|
104
103
|
else
|
@@ -131,7 +130,7 @@ module WordCountAnalyzer
|
|
131
130
|
if date.eql?('no_special_treatment')
|
132
131
|
txt
|
133
132
|
elsif date.eql?('count_as_one')
|
134
|
-
WordCountAnalyzer::Date.new(
|
133
|
+
WordCountAnalyzer::Date.new.replace(txt)
|
135
134
|
else
|
136
135
|
raise 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`'
|
137
136
|
end
|
@@ -16,134 +16,107 @@ module WordCountAnalyzer
|
|
16
16
|
# Rubular: http://rubular.com/r/mpVSeaKwdY
|
17
17
|
DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D\.?/
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
@string = string
|
19
|
+
def includes_date?(text)
|
20
|
+
includes_long_date?(text) || includes_number_only_date?(text)
|
22
21
|
end
|
23
22
|
|
24
|
-
def
|
25
|
-
|
23
|
+
def replace(text)
|
24
|
+
counter = 0
|
25
|
+
DOW_ABBR.map { |day| counter +=1 if text.include?('day') }
|
26
|
+
text = redact_dates(counter, text)
|
27
|
+
redact_regex(text)
|
26
28
|
end
|
27
29
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
30
|
+
def occurences(text)
|
31
|
+
replace(text).scan(/wsdateword/).size
|
32
|
+
end
|
33
|
+
|
34
|
+
def replace_number_only_date(text)
|
35
|
+
text.gsub(DMY_MDY_REGEX, ' wsdateword ')
|
36
|
+
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
37
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
38
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def redact_dates(counter, text)
|
34
44
|
if counter > 0
|
35
|
-
|
36
|
-
|
37
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
38
|
-
end
|
39
|
-
MONTH_ABBR.each do |month|
|
40
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
41
|
-
end
|
42
|
-
end
|
43
|
-
DOW.each do |day|
|
44
|
-
MONTHS.each do |month|
|
45
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
46
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
47
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
48
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ')
|
49
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
50
|
-
.gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ')
|
51
|
-
end
|
52
|
-
MONTH_ABBR.each do |month|
|
53
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
54
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
55
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
56
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ')
|
57
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
58
|
-
.gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ')
|
59
|
-
end
|
60
|
-
end
|
45
|
+
text = redact_dow_abbr(text)
|
46
|
+
text = redact_dow(text)
|
61
47
|
else
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
48
|
+
text = redact_dow(text)
|
49
|
+
text = redact_dow_abbr(text)
|
50
|
+
end
|
51
|
+
text
|
52
|
+
end
|
53
|
+
|
54
|
+
def redact_regex(text)
|
55
|
+
text.gsub(DMY_MDY_REGEX, ' wsdateword ')
|
56
|
+
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
57
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
58
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
59
|
+
end
|
60
|
+
|
61
|
+
def redact_dow(text)
|
62
|
+
DOW.each do |day|
|
63
|
+
MONTHS.map { |month| text = redact_date(text, day, month) }
|
64
|
+
MONTH_ABBR.map { |month| text = redact_date(text, day, month) }
|
65
|
+
end
|
66
|
+
text
|
67
|
+
end
|
68
|
+
|
69
|
+
def redact_dow_abbr(text)
|
70
|
+
DOW_ABBR.each do |day|
|
71
|
+
MONTHS.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') }
|
72
|
+
MONTH_ABBR.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') }
|
73
|
+
end
|
74
|
+
text
|
75
|
+
end
|
76
|
+
|
77
|
+
def redact_date(text, day, month)
|
78
|
+
text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
73
79
|
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
74
80
|
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
75
81
|
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ')
|
76
82
|
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ')
|
77
83
|
.gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ')
|
78
|
-
end
|
79
|
-
end
|
80
|
-
DOW_ABBR.each do |day|
|
81
|
-
MONTHS.each do |month|
|
82
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
83
|
-
end
|
84
|
-
MONTH_ABBR.each do |month|
|
85
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ')
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
new_string = new_string.gsub(DMY_MDY_REGEX, ' wsdateword ')
|
90
|
-
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
91
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
92
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
93
|
-
end
|
94
|
-
|
95
|
-
def occurences
|
96
|
-
replace.scan(/wsdateword/).size
|
97
84
|
end
|
98
85
|
|
99
|
-
def
|
100
|
-
|
101
|
-
.gsub(YMD_YDM_REGEX, ' wsdateword ')
|
102
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ')
|
103
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ')
|
86
|
+
def includes_long_date?(text)
|
87
|
+
includes_long_date_1?(text) || includes_long_date_2?(text)
|
104
88
|
end
|
105
89
|
|
106
|
-
|
107
|
-
|
108
|
-
def long_date
|
109
|
-
match_found = false
|
90
|
+
def includes_long_date_1?(text)
|
110
91
|
DOW.each do |day|
|
111
|
-
MONTHS.
|
112
|
-
|
113
|
-
match_found = check_for_matches(day, month)
|
114
|
-
end
|
115
|
-
MONTH_ABBR.each do |month|
|
116
|
-
break if match_found
|
117
|
-
match_found = check_for_matches(day, month)
|
118
|
-
end
|
92
|
+
MONTHS.map { |month| return true if check_for_matches(day, month, text) }
|
93
|
+
MONTH_ABBR.map { |month| return true if check_for_matches(day, month, text) }
|
119
94
|
end
|
95
|
+
false
|
96
|
+
end
|
97
|
+
|
98
|
+
def includes_long_date_2?(text)
|
120
99
|
DOW_ABBR.each do |day|
|
121
|
-
MONTHS.
|
122
|
-
|
123
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
124
|
-
end
|
125
|
-
MONTH_ABBR.each do |month|
|
126
|
-
break if match_found
|
127
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
128
|
-
end
|
100
|
+
MONTHS.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
101
|
+
MONTH_ABBR.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
129
102
|
end
|
130
|
-
|
103
|
+
false
|
131
104
|
end
|
132
105
|
|
133
|
-
def
|
134
|
-
!(
|
135
|
-
!(
|
136
|
-
!(
|
137
|
-
!(
|
106
|
+
def includes_number_only_date?(text)
|
107
|
+
!(text !~ DMY_MDY_REGEX) ||
|
108
|
+
!(text !~ YMD_YDM_REGEX) ||
|
109
|
+
!(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
|
110
|
+
!(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
|
138
111
|
end
|
139
112
|
|
140
|
-
def check_for_matches(day, month)
|
141
|
-
!(
|
142
|
-
!(
|
143
|
-
!(
|
144
|
-
!(
|
145
|
-
!(
|
146
|
-
!(
|
113
|
+
def check_for_matches(day, month, text)
|
114
|
+
!(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
115
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
116
|
+
!(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
|
117
|
+
!(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
|
118
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
|
119
|
+
!(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
|
147
120
|
end
|
148
121
|
end
|
149
|
-
end
|
122
|
+
end
|
@@ -13,30 +13,25 @@ module WordCountAnalyzer
|
|
13
13
|
|
14
14
|
UNICODE_ELLIPSIS = /(?<=[^…]|\A)…{1}(?=[^…]|$)/
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
def includes_ellipsis?(text)
|
17
|
+
!(text !~ FOUR_CONSECUTIVE_REGEX) ||
|
18
|
+
!(text !~ THREE_SPACE_REGEX) ||
|
19
|
+
!(text !~ FOUR_SPACE_REGEX) ||
|
20
|
+
!(text !~ OTHER_THREE_PERIOD_REGEX) ||
|
21
|
+
!(text !~ UNICODE_ELLIPSIS)
|
19
22
|
end
|
20
23
|
|
21
|
-
def
|
22
|
-
|
23
|
-
!(string !~ THREE_SPACE_REGEX) ||
|
24
|
-
!(string !~ FOUR_SPACE_REGEX) ||
|
25
|
-
!(string !~ OTHER_THREE_PERIOD_REGEX) ||
|
26
|
-
!(string !~ UNICODE_ELLIPSIS)
|
27
|
-
end
|
28
|
-
|
29
|
-
def replace
|
30
|
-
string.gsub(FOUR_CONSECUTIVE_REGEX, ' wseword ')
|
24
|
+
def replace(text)
|
25
|
+
text.gsub(FOUR_CONSECUTIVE_REGEX, ' wseword ')
|
31
26
|
.gsub(THREE_SPACE_REGEX, ' wseword ')
|
32
27
|
.gsub(FOUR_SPACE_REGEX, ' wseword ')
|
33
28
|
.gsub(OTHER_THREE_PERIOD_REGEX, ' wseword ')
|
34
29
|
.gsub(UNICODE_ELLIPSIS, ' wseword ')
|
35
30
|
end
|
36
31
|
|
37
|
-
def occurences
|
32
|
+
def occurences(text)
|
38
33
|
count = 0
|
39
|
-
replace.split(' ').map { |token| count += 1 if token.strip.eql?('wseword') }
|
34
|
+
replace(text).split(' ').map { |token| count += 1 if token.strip.eql?('wseword') }
|
40
35
|
count
|
41
36
|
end
|
42
37
|
end
|
@@ -7,49 +7,38 @@ module WordCountAnalyzer
|
|
7
7
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
8
8
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
@string = string
|
10
|
+
def hyperlink?(text)
|
11
|
+
!(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
|
13
12
|
end
|
14
13
|
|
15
|
-
def
|
16
|
-
|
14
|
+
def occurences(text)
|
15
|
+
text.scan(URI.regexp).map { |link| link.compact.size > 1 ? 1 : 0 }.inject(0) { |sum, x| sum + x }
|
17
16
|
end
|
18
17
|
|
19
|
-
def
|
20
|
-
|
21
|
-
string.scan(URI.regexp).each do |link|
|
22
|
-
counter += 1 if link.compact.size > 1
|
23
|
-
end
|
24
|
-
counter
|
25
|
-
end
|
26
|
-
|
27
|
-
def replace
|
28
|
-
new_string = string.dup
|
29
|
-
string.split(/\s+/).each do |token|
|
18
|
+
def replace(text)
|
19
|
+
text.split(/\s+/).each do |token|
|
30
20
|
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
31
|
-
|
21
|
+
text = text.gsub(/#{Regexp.escape(token.split('">')[0])}/, ' wslinkword ')
|
32
22
|
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
33
|
-
|
23
|
+
text = text.gsub(/#{Regexp.escape(token)}/, ' wslinkword ')
|
34
24
|
end
|
35
25
|
end
|
36
|
-
|
26
|
+
text
|
37
27
|
end
|
38
28
|
|
39
|
-
def replace_split_at_period
|
40
|
-
|
41
|
-
string.split(/\s+/).each do |token|
|
29
|
+
def replace_split_at_period(text)
|
30
|
+
text.split(/\s+/).each do |token|
|
42
31
|
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
43
|
-
|
32
|
+
text.gsub!(/#{Regexp.escape(token.split('">')[0])}/) do |match|
|
44
33
|
match.split('.').join(' ')
|
45
34
|
end
|
46
35
|
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
47
|
-
|
36
|
+
text.gsub!(/#{Regexp.escape(token)}/) do |match|
|
48
37
|
match.split('.').join(' ')
|
49
38
|
end
|
50
39
|
end
|
51
40
|
end
|
52
|
-
|
41
|
+
text
|
53
42
|
end
|
54
43
|
end
|
55
44
|
end
|