confidential_info_redactor_lite 0.0.12 → 0.0.13

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ad118f207a2c77d6576afe2450015ffb53362817
4
- data.tar.gz: 5c62ec610e4ca6361fbf977614eb6a8565424451
3
+ metadata.gz: 6b37f8797a8ca98088c77462fd3cc34f7bfbb9ef
4
+ data.tar.gz: ef75c18fada7655ab22d171c1741a9accce73b92
5
5
  SHA512:
6
- metadata.gz: 4da649dbf21c536a4ba5404636297447db7adcafb78e5cbd7375a5d8bd21ee4d087c192214cfe52167c5a0090fd853fdccbe0d9f9aaf7212bb7d422ceb0348ab
7
- data.tar.gz: 05c05072e9ebb34139c959f4e1c2351a36a6e48ea7d8bcd4a853cd75eea3ccc57e8fd4d5095e90a86c1fef8d84eb4657117f654457ed88eb1461fcb1962a6c28
6
+ metadata.gz: f63e4997217c4e89dd432427d3a3a1d734cf5c2703fbc03343c90444948e41d618b4b4f014c8ce167bec4e5899a70fc20e5adcea14e3eda63f5328dd0c3877b9
7
+ data.tar.gz: 110de24454372d0572f6a411ca4fd14a9471fb152509d9aecc6707a071c392307bda310ed058b82535755a422e9e59ea15761ca4662f259206ddd9ddea795156
@@ -20,9 +20,9 @@ module ConfidentialInfoRedactorLite
20
20
  new_string = string.dup
21
21
  string.split(/\s+/).each do |token|
22
22
  if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
23
- new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
23
+ new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
24
24
  elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
25
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
25
+ new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ')
26
26
  end
27
27
  end
28
28
  new_string
@@ -9,7 +9,7 @@ module ConfidentialInfoRedactorLite
9
9
  # Rubular: http://rubular.com/r/mxcj2G0Jfa
10
10
  EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
11
11
 
12
- attr_reader :text, :language, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
12
+ attr_reader :text, :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
13
13
  def initialize(text:, dow:, dow_abbr:, months:, months_abbr:, **args)
14
14
  @text = text
15
15
  @language = args[:language] || 'en'
@@ -17,6 +17,8 @@ module ConfidentialInfoRedactorLite
17
17
  @number_text = args[:number_text] || '<redacted number>'
18
18
  @date_text = args[:date_text] || '<redacted date>'
19
19
  @token_text = args[:token_text] || '<redacted>'
20
+ @email_text = args[:email_text] || '<redacted email>'
21
+ @hyperlink_text = args[:hyperlink_text] || '<redacted hyperlink>'
20
22
  @ignore_emails = args[:ignore_emails]
21
23
  @ignore_dates = args[:ignore_dates]
22
24
  @ignore_numbers = args[:ignore_numbers]
@@ -85,16 +87,16 @@ module ConfidentialInfoRedactorLite
85
87
  private
86
88
 
87
89
  def redact_hyperlinks_html(txt)
88
- redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
90
+ redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
89
91
  original_sentence_array = txt.split(' ')
90
92
  redacted_sentence_array = redacted_text.split(' ')
91
93
  diff = original_sentence_array - redacted_sentence_array
92
94
  final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
93
- [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>"), final_hyperlinks_tokens]
95
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(hyperlink_text)}/, "<span class='confidentialHyperlinks'>#{hyperlink_text}</span>"), final_hyperlinks_tokens]
94
96
  end
95
97
 
96
98
  def redact_numbers_html(txt)
97
- redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
99
+ redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
98
100
  original_sentence_array = txt.split(' ')
99
101
  redacted_sentence_array = redacted_text.split(' ')
100
102
  diff = original_sentence_array - redacted_sentence_array
@@ -103,12 +105,12 @@ module ConfidentialInfoRedactorLite
103
105
  end
104
106
 
105
107
  def redact_emails_html(txt)
106
- redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
108
+ redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<").gsub(/\>\s#{Regexp.escape(email_text)}\s\</, ">#{email_text}<").gsub(/\>\s#{Regexp.escape(hyperlink_text)}\s\</, ">#{hyperlink_text}<")
107
109
  original_sentence_array = txt.split(' ')
108
110
  redacted_sentence_array = redacted_text.split(' ')
109
111
  diff = original_sentence_array - redacted_sentence_array
110
112
  final_email_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
111
- [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialEmail'>#{token_text}</span>"), final_email_tokens]
113
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(email_text)}/, "<span class='confidentialEmail'>#{email_text}</span>"), final_email_tokens]
112
114
  end
113
115
 
114
116
  def redact_dates_html(txt)
@@ -155,7 +157,7 @@ module ConfidentialInfoRedactorLite
155
157
  end
156
158
 
157
159
  def redact_hyperlinks(txt)
158
- ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{token_text}").gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ").gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.").gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
160
+ ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
159
161
  end
160
162
 
161
163
  def redact_dates(txt)
@@ -167,7 +169,7 @@ module ConfidentialInfoRedactorLite
167
169
  end
168
170
 
169
171
  def redact_emails(txt)
170
- txt.gsub(EMAIL_REGEX, "#{token_text}")
172
+ txt.gsub(EMAIL_REGEX, "#{email_text}")
171
173
  end
172
174
 
173
175
  def redact_tokens(txt)
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
@@ -49,7 +49,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
49
49
  it 'replaces the hyperlinks in a string with regular tokens #001' do
50
50
  string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
51
51
  ws = described_class.new(string: string)
52
- expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted> or <redacted> ")
52
+ expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
53
53
  end
54
54
 
55
55
  it 'replaces the hyperlinks in a string with regular tokens #002' do
@@ -88,33 +88,33 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
88
88
  describe '#emails' do
89
89
  it 'redacts email addresses from a text #001' do
90
90
  text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
91
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted> or you can try <redacted>.')
91
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted email> or you can try <redacted email>.')
92
92
  end
93
93
 
94
94
  it 'redacts email addresses from a text #002' do
95
95
  text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
96
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted>) or you can try (<redacted>).')
96
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
97
97
  end
98
98
  end
99
99
 
100
100
  describe '#emails_html' do
101
101
  it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
102
102
  text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
103
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
103
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
104
104
  end
105
105
  end
106
106
 
107
107
  describe '#hyperlinks' do
108
108
  it 'redacts hyperlinks from a text #001' do
109
109
  text = 'Visit https://www.tm-town.com for more info.'
110
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted> for more info.')
110
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
111
111
  end
112
112
  end
113
113
 
114
114
  describe '#hyperlinks_html' do
115
115
  it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
116
116
  text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
117
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
117
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
118
118
  end
119
119
  end
120
120
 
@@ -204,19 +204,19 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
204
204
  it 'redacts all confidential information from a text #003' do
205
205
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
206
206
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
207
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.')
207
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
208
208
  end
209
209
 
210
210
  it 'redacts all confidential information from a text #004' do
211
211
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
212
212
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
213
- expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.')
213
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
214
214
  end
215
215
 
216
216
  it 'redacts all confidential information from a text #005' do
217
217
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
218
218
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
219
- expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
219
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
220
220
  end
221
221
  end
222
222
 
@@ -224,7 +224,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
224
224
  it 'redacts all confidential information from a text #001' do
225
225
  tokens = ['Coca-Cola', 'Pepsi']
226
226
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
227
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
227
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
228
+ end
229
+
230
+ it 'redacts all confidential information from a text #002' do
231
+ tokens = ['Coca-Cola', 'Pepsi']
232
+ text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
233
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
228
234
  end
229
235
  end
230
236
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias