confidential_info_redactor_lite 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 216e96faca24f56d0c98efd9fe537d443ebf99ee
4
- data.tar.gz: 79fd2f8c443c6d3875298b860a81ba84b4d68113
3
+ metadata.gz: b9a493d85c2cd235866ab329d987ebc634742127
4
+ data.tar.gz: a9fee602079d81f7d9e0970d59cca14197a8b31d
5
5
  SHA512:
6
- metadata.gz: f82751b04aa5c32af2de0fc4048a3fa79a82dbf0a9fdbe63b8118371172c62fc0b0d9cffa9eb036e1529e2b66b1ff37ce77e1cb4ee849fe45cff55cfd0b5ab5d
7
- data.tar.gz: 7b2e643f54eb7bcfd624f16fc64ea2301f99cbad9c19a8b11c10faa1a3bd9282a319fbbd506ece94a3ea42246e8f310337a73ff2a5611632b8d0e9f04aab83e6
6
+ metadata.gz: b77ba5ea584cb31bff9084d108eae3c0f2630a4bd9fe4a99603f12240fe267a14da5ee4be22fbb38bdfa5bba29054eca64cbb4a1b7755a0c926b2a43edec42b1
7
+ data.tar.gz: dc783fce3502f7e62f5ddabbca2e9c0901d9fbb5b88e2ec003ceb1c8dd6647cef766d0b876ad1bb6584570b98f73e8a4e652203adc042e20c4cb96ed83886e32
@@ -31,18 +31,34 @@ module ConfidentialInfoRedactorLite
31
31
  redact_dates(text)
32
32
  end
33
33
 
34
+ def dates_html
35
+ redact_dates_html(text)
36
+ end
37
+
34
38
  def numbers
35
39
  redact_numbers(text)
36
40
  end
37
41
 
42
+ def numbers_html
43
+ redact_numbers_html(text)
44
+ end
45
+
38
46
  def emails
39
47
  redact_emails(text)
40
48
  end
41
49
 
50
+ def emails_html
51
+ redact_emails_html(text)
52
+ end
53
+
42
54
  def hyperlinks
43
55
  redact_hyperlinks(text)
44
56
  end
45
57
 
58
+ def hyperlinks_html
59
+ redact_hyperlinks_html(text)
60
+ end
61
+
46
62
  def proper_nouns
47
63
  redact_tokens(text)
48
64
  end
@@ -59,8 +75,85 @@ module ConfidentialInfoRedactorLite
59
75
  redact_tokens(redacted_text)
60
76
  end
61
77
 
78
+ def redact_html
79
+ redacted_text = redact_dates_html(text)[0]
80
+ redacted_text = redact_emails_html(redacted_text)[0]
81
+ redacted_text = redact_hyperlinks_html(redacted_text)[0]
82
+ redact_numbers_html(redacted_text)[0]
83
+ end
84
+
62
85
  private
63
86
 
87
+ def redact_hyperlinks_html(txt)
88
+ redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
89
+ original_sentence_array = txt.split(' ')
90
+ redacted_sentence_array = redacted_text.split(' ')
91
+ diff = original_sentence_array - redacted_sentence_array
92
+ final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
93
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>"), final_hyperlinks_tokens]
94
+ end
95
+
96
+ def redact_numbers_html(txt)
97
+ redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
98
+ original_sentence_array = txt.split(' ')
99
+ redacted_sentence_array = redacted_text.split(' ')
100
+ diff = original_sentence_array - redacted_sentence_array
101
+ final_number_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
102
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(number_text)}/, "<span class='confidentialNumber'>#{number_text}</span>"), final_number_tokens]
103
+ end
104
+
105
+ def redact_emails_html(txt)
106
+ redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
107
+ original_sentence_array = txt.split(' ')
108
+ redacted_sentence_array = redacted_text.split(' ')
109
+ diff = original_sentence_array - redacted_sentence_array
110
+ final_email_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
111
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialEmail'>#{token_text}</span>"), final_email_tokens]
112
+ end
113
+
114
+ def redact_dates_html(txt)
115
+ redacted_text = redact_dates(txt)
116
+ original_sentence_array = txt.split(' ')
117
+ redacted_sentence_array = redacted_text.split(' ')
118
+ diff = original_sentence_array - redacted_sentence_array
119
+ date_tokens = []
120
+ redacted_text.split(' ').each_with_index do |redacted_token, index|
121
+ if redacted_token.gsub(/\./, '') == date_text
122
+ original_sentence_array.each_with_index do |original_token, i|
123
+ if redacted_sentence_array[index - 1] == original_token &&
124
+ diff.include?(original_sentence_array[i + 1]) &&
125
+ original_sentence_array[i + 2] == redacted_sentence_array[index + 1]
126
+ date_tokens << original_sentence_array[i + 1]
127
+ end
128
+ if redacted_sentence_array[index - 1] == original_token &&
129
+ diff.include?(original_sentence_array[i + 1]) &&
130
+ diff.include?(original_sentence_array[i + 2]) &&
131
+ original_sentence_array[i + 3] == redacted_sentence_array[index + 1]
132
+ date_tokens << original_sentence_array[i + 1] + ' ' + original_sentence_array[i + 2]
133
+ end
134
+ if redacted_sentence_array[index - 1] == original_token &&
135
+ diff.include?(original_sentence_array[i + 1]) &&
136
+ diff.include?(original_sentence_array[i + 2]) &&
137
+ diff.include?(original_sentence_array[i + 3]) &&
138
+ original_sentence_array[i + 4] == redacted_sentence_array[index + 1]
139
+ date_tokens << original_sentence_array[i + 1] + ' ' + original_sentence_array[i + 2] + ' ' + original_sentence_array[i + 3]
140
+ end
141
+ if redacted_sentence_array[index - 1] == original_token &&
142
+ diff.include?(original_sentence_array[i + 1]) &&
143
+ diff.include?(original_sentence_array[i + 2]) &&
144
+ diff.include?(original_sentence_array[i + 3]) &&
145
+ diff.include?(original_sentence_array[i + 4]) &&
146
+ original_sentence_array[i + 5] == redacted_sentence_array[index + 1]
147
+ date_tokens << original_sentence_array[i + 1] + ' ' + original_sentence_array[i + 2] + ' ' + original_sentence_array[i + 3] + ' ' + original_sentence_array[i + 4]
148
+ end
149
+ end
150
+ end
151
+ end
152
+
153
+ final_date_tokens = date_tokens.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
154
+ [redacted_text.gsub(/#{Regexp.escape(date_text)}/, "<span class='confidentialDate'>#{date_text}</span>"), final_date_tokens]
155
+ end
156
+
64
157
  def redact_hyperlinks(txt)
65
158
  ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{token_text}").gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ").gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.").gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
66
159
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -44,6 +44,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
44
44
  end
45
45
  end
46
46
 
47
+ describe '#dates_html' do
48
+ it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
49
+ text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
50
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
51
+ end
52
+ end
53
+
47
54
  describe '#numbers' do
48
55
  it 'redacts numbers from a text #001' do
49
56
  text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
@@ -61,6 +68,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
61
68
  end
62
69
  end
63
70
 
71
+ describe '#numbers_html' do
72
+ it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
73
+ text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
74
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html).to eq(["It was his <span class='confidentialNumber'>*****</span> time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
75
+ end
76
+ end
77
+
64
78
  describe '#emails' do
65
79
  it 'redacts email addresses from a text #001' do
66
80
  text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
@@ -73,6 +87,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
73
87
  end
74
88
  end
75
89
 
90
+ describe '#emails_html' do
91
+ it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
92
+ text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
93
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
94
+ end
95
+ end
96
+
76
97
  describe '#hyperlinks' do
77
98
  it 'redacts hyperlinks from a text #001' do
78
99
  text = 'Visit https://www.tm-town.com for more info.'
@@ -80,6 +101,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
80
101
  end
81
102
  end
82
103
 
104
+ describe '#hyperlinks_html' do
105
+ it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
106
+ text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
107
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
108
+ end
109
+ end
110
+
83
111
  describe '#proper_nouns' do
84
112
  it 'redacts tokens from a text #001' do
85
113
  tokens = ['Coca-Cola', 'Pepsi']
@@ -181,4 +209,12 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
181
209
  expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
182
210
  end
183
211
  end
212
+
213
+ describe '#redact_html' do
214
+ it 'redacts all confidential information from a text #001' do
215
+ tokens = ['Coca-Cola', 'Pepsi']
216
+ text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
217
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
218
+ end
219
+ end
184
220
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias