confidential_info_redactor_lite 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/hyperlink.rb +2 -2
- data/lib/confidential_info_redactor_lite/redactor.rb +10 -8
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +1 -1
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +15 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b37f8797a8ca98088c77462fd3cc34f7bfbb9ef
|
4
|
+
data.tar.gz: ef75c18fada7655ab22d171c1741a9accce73b92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f63e4997217c4e89dd432427d3a3a1d734cf5c2703fbc03343c90444948e41d618b4b4f014c8ce167bec4e5899a70fc20e5adcea14e3eda63f5328dd0c3877b9
|
7
|
+
data.tar.gz: 110de24454372d0572f6a411ca4fd14a9471fb152509d9aecc6707a071c392307bda310ed058b82535755a422e9e59ea15761ca4662f259206ddd9ddea795156
|
data/lib/confidential_info_redactor_lite/hyperlink.rb
CHANGED
@@ -20,9 +20,9 @@ module ConfidentialInfoRedactorLite
|
|
20
20
|
new_string = string.dup
|
21
21
|
string.split(/\s+/).each do |token|
|
22
22
|
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
23
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
|
23
|
+
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
24
24
|
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
25
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
|
25
|
+
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
|
26
26
|
end
|
27
27
|
end
|
28
28
|
new_string
|
data/lib/confidential_info_redactor_lite/redactor.rb
CHANGED
@@ -9,7 +9,7 @@ module ConfidentialInfoRedactorLite
|
|
9
9
|
# Rubular: http://rubular.com/r/mxcj2G0Jfa
|
10
10
|
EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
|
11
11
|
|
12
|
-
attr_reader :text, :language, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
|
12
|
+
attr_reader :text, :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
|
13
13
|
def initialize(text:, dow:, dow_abbr:, months:, months_abbr:, **args)
|
14
14
|
@text = text
|
15
15
|
@language = args[:language] || 'en'
|
@@ -17,6 +17,8 @@ module ConfidentialInfoRedactorLite
|
|
17
17
|
@number_text = args[:number_text] || '<redacted number>'
|
18
18
|
@date_text = args[:date_text] || '<redacted date>'
|
19
19
|
@token_text = args[:token_text] || '<redacted>'
|
20
|
+
@email_text = args[:email_text] || '<redacted email>'
|
21
|
+
@hyperlink_text = args[:hyperlink_text] || '<redacted hyperlink>'
|
20
22
|
@ignore_emails = args[:ignore_emails]
|
21
23
|
@ignore_dates = args[:ignore_dates]
|
22
24
|
@ignore_numbers = args[:ignore_numbers]
|
@@ -85,16 +87,16 @@ module ConfidentialInfoRedactorLite
|
|
85
87
|
private
|
86
88
|
|
87
89
|
def redact_hyperlinks_html(txt)
|
88
|
-
redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
90
|
+
redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
|
89
91
|
original_sentence_array = txt.split(' ')
|
90
92
|
redacted_sentence_array = redacted_text.split(' ')
|
91
93
|
diff = original_sentence_array - redacted_sentence_array
|
92
94
|
final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
93
|
-
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(
|
95
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(hyperlink_text)}/, "<span class='confidentialHyperlinks'>#{hyperlink_text}</span>"), final_hyperlinks_tokens]
|
94
96
|
end
|
95
97
|
|
96
98
|
def redact_numbers_html(txt)
|
97
|
-
redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
99
|
+
redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
|
98
100
|
original_sentence_array = txt.split(' ')
|
99
101
|
redacted_sentence_array = redacted_text.split(' ')
|
100
102
|
diff = original_sentence_array - redacted_sentence_array
|
@@ -103,12 +105,12 @@ module ConfidentialInfoRedactorLite
|
|
103
105
|
end
|
104
106
|
|
105
107
|
def redact_emails_html(txt)
|
106
|
-
redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
108
|
+
redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
|
107
109
|
original_sentence_array = txt.split(' ')
|
108
110
|
redacted_sentence_array = redacted_text.split(' ')
|
109
111
|
diff = original_sentence_array - redacted_sentence_array
|
110
112
|
final_email_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
111
|
-
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(
|
113
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(email_text)}/, "<span class='confidentialEmail'>#{email_text}</span>"), final_email_tokens]
|
112
114
|
end
|
113
115
|
|
114
116
|
def redact_dates_html(txt)
|
@@ -155,7 +157,7 @@ module ConfidentialInfoRedactorLite
|
|
155
157
|
end
|
156
158
|
|
157
159
|
def redact_hyperlinks(txt)
|
158
|
-
ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{
|
160
|
+
ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
|
159
161
|
end
|
160
162
|
|
161
163
|
def redact_dates(txt)
|
@@ -167,7 +169,7 @@ module ConfidentialInfoRedactorLite
|
|
167
169
|
end
|
168
170
|
|
169
171
|
def redact_emails(txt)
|
170
|
-
txt.gsub(EMAIL_REGEX, "#{
|
172
|
+
txt.gsub(EMAIL_REGEX, "#{email_text}")
|
171
173
|
end
|
172
174
|
|
173
175
|
def redact_tokens(txt)
|
data/spec/confidential_info_redactor_lite/hyperlink_spec.rb
CHANGED
@@ -49,7 +49,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
|
|
49
49
|
it 'replaces the hyperlinks in a string with regular tokens #001' do
|
50
50
|
string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
|
51
51
|
ws = described_class.new(string: string)
|
52
|
-
expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted> or <redacted> ")
|
52
|
+
expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
|
53
53
|
end
|
54
54
|
|
55
55
|
it 'replaces the hyperlinks in a string with regular tokens #002' do
|
data/spec/confidential_info_redactor_lite/redactor_spec.rb
CHANGED
@@ -88,33 +88,33 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
88
88
|
describe '#emails' do
|
89
89
|
it 'redacts email addresses from a text #001' do
|
90
90
|
text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
|
91
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted> or you can try <redacted>.')
|
91
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted email> or you can try <redacted email>.')
|
92
92
|
end
|
93
93
|
|
94
94
|
it 'redacts email addresses from a text #002' do
|
95
95
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
96
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted>) or you can try (<redacted>).')
|
96
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
|
97
97
|
end
|
98
98
|
end
|
99
99
|
|
100
100
|
describe '#emails_html' do
|
101
101
|
it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
|
102
102
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
103
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'
|
103
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
|
104
104
|
end
|
105
105
|
end
|
106
106
|
|
107
107
|
describe '#hyperlinks' do
|
108
108
|
it 'redacts hyperlinks from a text #001' do
|
109
109
|
text = 'Visit https://www.tm-town.com for more info.'
|
110
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted> for more info.')
|
110
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
|
111
111
|
end
|
112
112
|
end
|
113
113
|
|
114
114
|
describe '#hyperlinks_html' do
|
115
115
|
it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
|
116
116
|
text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
|
117
|
-
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
117
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
118
118
|
end
|
119
119
|
end
|
120
120
|
|
@@ -204,19 +204,19 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
204
204
|
it 'redacts all confidential information from a text #003' do
|
205
205
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
206
206
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
207
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.')
|
207
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
208
208
|
end
|
209
209
|
|
210
210
|
it 'redacts all confidential information from a text #004' do
|
211
211
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
212
212
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
213
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.')
|
213
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
214
214
|
end
|
215
215
|
|
216
216
|
it 'redacts all confidential information from a text #005' do
|
217
217
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
218
218
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
219
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
219
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
220
220
|
end
|
221
221
|
end
|
222
222
|
|
@@ -224,7 +224,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
224
224
|
it 'redacts all confidential information from a text #001' do
|
225
225
|
tokens = ['Coca-Cola', 'Pepsi']
|
226
226
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
227
|
-
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
227
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'redacts all confidential information from a text #002' do
|
231
|
+
tokens = ['Coca-Cola', 'Pepsi']
|
232
|
+
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
233
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
|
228
234
|
end
|
229
235
|
end
|
230
236
|
end
|