confidential_info_redactor_lite 0.0.12 → 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ad118f207a2c77d6576afe2450015ffb53362817
4
- data.tar.gz: 5c62ec610e4ca6361fbf977614eb6a8565424451
3
+ metadata.gz: 6b37f8797a8ca98088c77462fd3cc34f7bfbb9ef
4
+ data.tar.gz: ef75c18fada7655ab22d171c1741a9accce73b92
5
5
  SHA512:
6
- metadata.gz: 4da649dbf21c536a4ba5404636297447db7adcafb78e5cbd7375a5d8bd21ee4d087c192214cfe52167c5a0090fd853fdccbe0d9f9aaf7212bb7d422ceb0348ab
7
- data.tar.gz: 05c05072e9ebb34139c959f4e1c2351a36a6e48ea7d8bcd4a853cd75eea3ccc57e8fd4d5095e90a86c1fef8d84eb4657117f654457ed88eb1461fcb1962a6c28
6
+ metadata.gz: f63e4997217c4e89dd432427d3a3a1d734cf5c2703fbc03343c90444948e41d618b4b4f014c8ce167bec4e5899a70fc20e5adcea14e3eda63f5328dd0c3877b9
7
+ data.tar.gz: 110de24454372d0572f6a411ca4fd14a9471fb152509d9aecc6707a071c392307bda310ed058b82535755a422e9e59ea15761ca4662f259206ddd9ddea795156
@@ -20,9 +20,9 @@ module ConfidentialInfoRedactorLite
20
20
  new_string = string.dup
21
21
  string.split(/\s+/).each do |token|
22
22
  if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
23
- new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
23
+ new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
24
24
  elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
25
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
25
+ new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
26
26
  end
27
27
  end
28
28
  new_string
@@ -9,7 +9,7 @@ module ConfidentialInfoRedactorLite
9
9
  # Rubular: http://rubular.com/r/mxcj2G0Jfa
10
10
  EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
11
11
 
12
- attr_reader :text, :language, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
12
+ attr_reader :text, :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
13
13
  def initialize(text:, dow:, dow_abbr:, months:, months_abbr:, **args)
14
14
  @text = text
15
15
  @language = args[:language] || 'en'
@@ -17,6 +17,8 @@ module ConfidentialInfoRedactorLite
17
17
  @number_text = args[:number_text] || '<redacted number>'
18
18
  @date_text = args[:date_text] || '<redacted date>'
19
19
  @token_text = args[:token_text] || '<redacted>'
20
+ @email_text = args[:email_text] || '<redacted email>'
21
+ @hyperlink_text = args[:hyperlink_text] || '<redacted hyperlink>'
20
22
  @ignore_emails = args[:ignore_emails]
21
23
  @ignore_dates = args[:ignore_dates]
22
24
  @ignore_numbers = args[:ignore_numbers]
@@ -85,16 +87,16 @@ module ConfidentialInfoRedactorLite
85
87
  private
86
88
 
87
89
  def redact_hyperlinks_html(txt)
88
- redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
90
+ redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
89
91
  original_sentence_array = txt.split(' ')
90
92
  redacted_sentence_array = redacted_text.split(' ')
91
93
  diff = original_sentence_array - redacted_sentence_array
92
94
  final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
93
- [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>"), final_hyperlinks_tokens]
95
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(hyperlink_text)}/, "<span class='confidentialHyperlinks'>#{hyperlink_text}</span>"), final_hyperlinks_tokens]
94
96
  end
95
97
 
96
98
  def redact_numbers_html(txt)
97
- redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
99
+ redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
98
100
  original_sentence_array = txt.split(' ')
99
101
  redacted_sentence_array = redacted_text.split(' ')
100
102
  diff = original_sentence_array - redacted_sentence_array
@@ -103,12 +105,12 @@ module ConfidentialInfoRedactorLite
103
105
  end
104
106
 
105
107
  def redact_emails_html(txt)
106
- redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
108
+ redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
107
109
  original_sentence_array = txt.split(' ')
108
110
  redacted_sentence_array = redacted_text.split(' ')
109
111
  diff = original_sentence_array - redacted_sentence_array
110
112
  final_email_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
111
- [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialEmail'>#{token_text}</span>"), final_email_tokens]
113
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(email_text)}/, "<span class='confidentialEmail'>#{email_text}</span>"), final_email_tokens]
112
114
  end
113
115
 
114
116
  def redact_dates_html(txt)
@@ -155,7 +157,7 @@ module ConfidentialInfoRedactorLite
155
157
  end
156
158
 
157
159
  def redact_hyperlinks(txt)
158
- ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{token_text}").gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ").gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.").gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
160
+ ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
159
161
  end
160
162
 
161
163
  def redact_dates(txt)
@@ -167,7 +169,7 @@ module ConfidentialInfoRedactorLite
167
169
  end
168
170
 
169
171
  def redact_emails(txt)
170
- txt.gsub(EMAIL_REGEX, "#{token_text}")
172
+ txt.gsub(EMAIL_REGEX, "#{email_text}")
171
173
  end
172
174
 
173
175
  def redact_tokens(txt)
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
@@ -49,7 +49,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
49
49
  it 'replaces the hyperlinks in a string with regular tokens #001' do
50
50
  string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
51
51
  ws = described_class.new(string: string)
52
- expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted> or <redacted> ")
52
+ expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
53
53
  end
54
54
 
55
55
  it 'replaces the hyperlinks in a string with regular tokens #002' do
@@ -88,33 +88,33 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
88
88
  describe '#emails' do
89
89
  it 'redacts email addresses from a text #001' do
90
90
  text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
91
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted> or you can try <redacted>.')
91
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted email> or you can try <redacted email>.')
92
92
  end
93
93
 
94
94
  it 'redacts email addresses from a text #002' do
95
95
  text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
96
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted>) or you can try (<redacted>).')
96
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
97
97
  end
98
98
  end
99
99
 
100
100
  describe '#emails_html' do
101
101
  it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
102
102
  text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
103
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
103
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
104
104
  end
105
105
  end
106
106
 
107
107
  describe '#hyperlinks' do
108
108
  it 'redacts hyperlinks from a text #001' do
109
109
  text = 'Visit https://www.tm-town.com for more info.'
110
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted> for more info.')
110
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
111
111
  end
112
112
  end
113
113
 
114
114
  describe '#hyperlinks_html' do
115
115
  it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
116
116
  text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
117
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
117
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
118
118
  end
119
119
  end
120
120
 
@@ -204,19 +204,19 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
204
204
  it 'redacts all confidential information from a text #003' do
205
205
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
206
206
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
207
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.')
207
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
208
208
  end
209
209
 
210
210
  it 'redacts all confidential information from a text #004' do
211
211
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
212
212
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
213
- expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.')
213
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
214
214
  end
215
215
 
216
216
  it 'redacts all confidential information from a text #005' do
217
217
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
218
218
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
219
- expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
219
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
220
220
  end
221
221
  end
222
222
 
@@ -224,7 +224,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
224
224
  it 'redacts all confidential information from a text #001' do
225
225
  tokens = ['Coca-Cola', 'Pepsi']
226
226
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
227
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
227
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
228
+ end
229
+
230
+ it 'redacts all confidential information from a text #002' do
231
+ tokens = ['Coca-Cola', 'Pepsi']
232
+ text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
233
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
228
234
  end
229
235
  end
230
236
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias