confidential_info_redactor_lite 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/hyperlink.rb +2 -2
- data/lib/confidential_info_redactor_lite/redactor.rb +10 -8
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +1 -1
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +15 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b37f8797a8ca98088c77462fd3cc34f7bfbb9ef
|
4
|
+
data.tar.gz: ef75c18fada7655ab22d171c1741a9accce73b92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f63e4997217c4e89dd432427d3a3a1d734cf5c2703fbc03343c90444948e41d618b4b4f014c8ce167bec4e5899a70fc20e5adcea14e3eda63f5328dd0c3877b9
|
7
|
+
data.tar.gz: 110de24454372d0572f6a411ca4fd14a9471fb152509d9aecc6707a071c392307bda310ed058b82535755a422e9e59ea15761ca4662f259206ddd9ddea795156
|
@@ -20,9 +20,9 @@ module ConfidentialInfoRedactorLite
|
|
20
20
|
new_string = string.dup
|
21
21
|
string.split(/\s+/).each do |token|
|
22
22
|
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
23
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
|
23
|
+
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
24
24
|
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
25
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
|
25
|
+
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
26
26
|
end
|
27
27
|
end
|
28
28
|
new_string
|
@@ -9,7 +9,7 @@ module ConfidentialInfoRedactorLite
|
|
9
9
|
# Rubular: http://rubular.com/r/mxcj2G0Jfa
|
10
10
|
EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
|
11
11
|
|
12
|
-
attr_reader :text, :language, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
|
12
|
+
attr_reader :text, :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
|
13
13
|
def initialize(text:, dow:, dow_abbr:, months:, months_abbr:, **args)
|
14
14
|
@text = text
|
15
15
|
@language = args[:language] || 'en'
|
@@ -17,6 +17,8 @@ module ConfidentialInfoRedactorLite
|
|
17
17
|
@number_text = args[:number_text] || '<redacted number>'
|
18
18
|
@date_text = args[:date_text] || '<redacted date>'
|
19
19
|
@token_text = args[:token_text] || '<redacted>'
|
20
|
+
@email_text = args[:email_text] || '<redacted email>'
|
21
|
+
@hyperlink_text = args[:hyperlink_text] || '<redacted hyperlink>'
|
20
22
|
@ignore_emails = args[:ignore_emails]
|
21
23
|
@ignore_dates = args[:ignore_dates]
|
22
24
|
@ignore_numbers = args[:ignore_numbers]
|
@@ -85,16 +87,16 @@ module ConfidentialInfoRedactorLite
|
|
85
87
|
private
|
86
88
|
|
87
89
|
def redact_hyperlinks_html(txt)
|
88
|
-
redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
90
|
+
redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
|
89
91
|
original_sentence_array = txt.split(' ')
|
90
92
|
redacted_sentence_array = redacted_text.split(' ')
|
91
93
|
diff = original_sentence_array - redacted_sentence_array
|
92
94
|
final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
93
|
-
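[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>"), final_hyperlinks_tokens]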
|
95
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(hyperlink_text)}/, "<span class='confidentialHyperlinks'>#{hyperlink_text}</span>"), final_hyperlinks_tokens]
|
94
96
|
end
|
95
97
|
|
96
98
|
def redact_numbers_html(txt)
|
97
|
-
redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
99
|
+
redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
|
98
100
|
original_sentence_array = txt.split(' ')
|
99
101
|
redacted_sentence_array = redacted_text.split(' ')
|
100
102
|
diff = original_sentence_array - redacted_sentence_array
|
@@ -103,12 +105,12 @@ module ConfidentialInfoRedactorLite
|
|
103
105
|
end
|
104
106
|
|
105
107
|
def redact_emails_html(txt)
|
106
|
-
redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
108
|
+
redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
|
107
109
|
original_sentence_array = txt.split(' ')
|
108
110
|
redacted_sentence_array = redacted_text.split(' ')
|
109
111
|
diff = original_sentence_array - redacted_sentence_array
|
110
112
|
final_email_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
111
|
-
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialEmail'>#{token_text}</span>"), final_email_tokens]
|
113
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(email_text)}/, "<span class='confidentialEmail'>#{email_text}</span>"), final_email_tokens]
|
112
114
|
end
|
113
115
|
|
114
116
|
def redact_dates_html(txt)
|
@@ -155,7 +157,7 @@ module ConfidentialInfoRedactorLite
|
|
155
157
|
end
|
156
158
|
|
157
159
|
def redact_hyperlinks(txt)
|
158
|
-
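ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{token_text}").gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ").gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.").gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")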
|
160
|
+
ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
|
159
161
|
end
|
160
162
|
|
161
163
|
def redact_dates(txt)
|
@@ -167,7 +169,7 @@ module ConfidentialInfoRedactorLite
|
|
167
169
|
end
|
168
170
|
|
169
171
|
def redact_emails(txt)
|
170
|
-
txt.gsub(EMAIL_REGEX, "#{token_text}")
|
172
|
+
txt.gsub(EMAIL_REGEX, "#{email_text}")
|
171
173
|
end
|
172
174
|
|
173
175
|
def redact_tokens(txt)
|
@@ -49,7 +49,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
|
|
49
49
|
it 'replaces the hyperlinks in a string with regular tokens #001' do
|
50
50
|
string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
|
51
51
|
ws = described_class.new(string: string)
|
52
|
-
expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted> or <redacted> ")
|
52
|
+
expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
|
53
53
|
end
|
54
54
|
|
55
55
|
it 'replaces the hyperlinks in a string with regular tokens #002' do
|
@@ -88,33 +88,33 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
88
88
|
describe '#emails' do
|
89
89
|
it 'redacts email addresses from a text #001' do
|
90
90
|
text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
|
91
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted> or you can try <redacted>.')
|
91
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted email> or you can try <redacted email>.')
|
92
92
|
end
|
93
93
|
|
94
94
|
it 'redacts email addresses from a text #002' do
|
95
95
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
96
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted>) or you can try (<redacted>).')
|
96
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
|
97
97
|
end
|
98
98
|
end
|
99
99
|
|
100
100
|
describe '#emails_html' do
|
101
101
|
it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
|
102
102
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
103
|
-
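expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])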
|
103
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
|
104
104
|
end
|
105
105
|
end
|
106
106
|
|
107
107
|
describe '#hyperlinks' do
|
108
108
|
it 'redacts hyperlinks from a text #001' do
|
109
109
|
text = 'Visit https://www.tm-town.com for more info.'
|
110
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted> for more info.')
|
110
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
|
111
111
|
end
|
112
112
|
end
|
113
113
|
|
114
114
|
describe '#hyperlinks_html' do
|
115
115
|
it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
|
116
116
|
text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
|
117
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
117
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
118
118
|
end
|
119
119
|
end
|
120
120
|
|
@@ -204,19 +204,19 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
204
204
|
it 'redacts all confidential information from a text #003' do
|
205
205
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
206
206
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
207
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.')
|
207
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
208
208
|
end
|
209
209
|
|
210
210
|
it 'redacts all confidential information from a text #004' do
|
211
211
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
212
212
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
213
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.')
|
213
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
214
214
|
end
|
215
215
|
|
216
216
|
it 'redacts all confidential information from a text #005' do
|
217
217
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
218
218
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
219
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
219
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
220
220
|
end
|
221
221
|
end
|
222
222
|
|
@@ -224,7 +224,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
224
224
|
it 'redacts all confidential information from a text #001' do
|
225
225
|
tokens = ['Coca-Cola', 'Pepsi']
|
226
226
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
227
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
227
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'redacts all confidential information from a text #002' do
|
231
|
+
tokens = ['Coca-Cola', 'Pepsi']
|
232
|
+
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
233
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
|
228
234
|
end
|
229
235
|
end
|
230
236
|
end
|