confidential_info_redactor_lite 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9a493d85c2cd235866ab329d987ebc634742127
|
4
|
+
data.tar.gz: a9fee602079d81f7d9e0970d59cca14197a8b31d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b77ba5ea584cb31bff9084d108eae3c0f2630a4bd9fe4a99603f12240fe267a14da5ee4be22fbb38bdfa5bba29054eca64cbb4a1b7755a0c926b2a43edec42b1
|
7
|
+
data.tar.gz: dc783fce3502f7e62f5ddabbca2e9c0901d9fbb5b88e2ec003ceb1c8dd6647cef766d0b876ad1bb6584570b98f73e8a4e652203adc042e20c4cb96ed83886e32
|
@@ -31,18 +31,34 @@ module ConfidentialInfoRedactorLite
|
|
31
31
|
redact_dates(text)
|
32
32
|
end
|
33
33
|
|
34
|
+
def dates_html
|
35
|
+
redact_dates_html(text)
|
36
|
+
end
|
37
|
+
|
34
38
|
def numbers
|
35
39
|
redact_numbers(text)
|
36
40
|
end
|
37
41
|
|
42
|
+
def numbers_html
|
43
|
+
redact_numbers_html(text)
|
44
|
+
end
|
45
|
+
|
38
46
|
def emails
|
39
47
|
redact_emails(text)
|
40
48
|
end
|
41
49
|
|
50
|
+
def emails_html
|
51
|
+
redact_emails_html(text)
|
52
|
+
end
|
53
|
+
|
42
54
|
def hyperlinks
|
43
55
|
redact_hyperlinks(text)
|
44
56
|
end
|
45
57
|
|
58
|
+
def hyperlinks_html
|
59
|
+
redact_hyperlinks_html(text)
|
60
|
+
end
|
61
|
+
|
46
62
|
def proper_nouns
|
47
63
|
redact_tokens(text)
|
48
64
|
end
|
@@ -59,8 +75,85 @@ module ConfidentialInfoRedactorLite
|
|
59
75
|
redact_tokens(redacted_text)
|
60
76
|
end
|
61
77
|
|
78
|
+
def redact_html
|
79
|
+
redacted_text = redact_dates_html(text)[0]
|
80
|
+
redacted_text = redact_emails_html(redacted_text)[0]
|
81
|
+
redacted_text = redact_hyperlinks_html(redacted_text)[0]
|
82
|
+
redact_numbers_html(redacted_text)[0]
|
83
|
+
end
|
84
|
+
|
62
85
|
private
|
63
86
|
|
87
|
+
def redact_hyperlinks_html(txt)
|
88
|
+
redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
89
|
+
original_sentence_array = txt.split(' ')
|
90
|
+
redacted_sentence_array = redacted_text.split(' ')
|
91
|
+
diff = original_sentence_array - redacted_sentence_array
|
92
|
+
final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
93
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>"), final_hyperlinks_tokens]
|
94
|
+
end
|
95
|
+
|
96
|
+
def redact_numbers_html(txt)
|
97
|
+
redacted_text = redact_numbers(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
98
|
+
original_sentence_array = txt.split(' ')
|
99
|
+
redacted_sentence_array = redacted_text.split(' ')
|
100
|
+
diff = original_sentence_array - redacted_sentence_array
|
101
|
+
final_number_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
|
102
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(number_text)}/, "<span class='confidentialNumber'>#{number_text}</span>"), final_number_tokens]
|
103
|
+
end
|
104
|
+
|
105
|
+
def redact_emails_html(txt)
|
106
|
+
redacted_text = redact_emails(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
|
107
|
+
original_sentence_array = txt.split(' ')
|
108
|
+
redacted_sentence_array = redacted_text.split(' ')
|
109
|
+
diff = original_sentence_array - redacted_sentence_array
|
110
|
+
final_email_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
111
|
+
[redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialEmail'>#{token_text}</span>"), final_email_tokens]
|
112
|
+
end
|
113
|
+
|
114
|
+
def redact_dates_html(txt)
|
115
|
+
redacted_text = redact_dates(txt)
|
116
|
+
original_sentence_array = txt.split(' ')
|
117
|
+
redacted_sentence_array = redacted_text.split(' ')
|
118
|
+
diff = original_sentence_array - redacted_sentence_array
|
119
|
+
date_tokens = []
|
120
|
+
redacted_text.split(' ').each_with_index do |redacted_token, index|
|
121
|
+
if redacted_token.gsub(/\./, '') == date_text
|
122
|
+
original_sentence_array.each_with_index do |original_token, i|
|
123
|
+
if redacted_sentence_array[index - 1] == original_token &&
|
124
|
+
diff.include?(original_sentence_array[i + 1]) &&
|
125
|
+
original_sentence_array[i + 2] == redacted_sentence_array[index + 1]
|
126
|
+
date_tokens << original_sentence_array[i + 1]
|
127
|
+
end
|
128
|
+
if redacted_sentence_array[index - 1] == original_token &&
|
129
|
+
diff.include?(original_sentence_array[i + 1]) &&
|
130
|
+
diff.include?(original_sentence_array[i + 2]) &&
|
131
|
+
original_sentence_array[i + 3] == redacted_sentence_array[index + 1]
|
132
|
+
date_tokens << original_sentence_array[i + 1] + ' ' + original_sentence_array[i + 2]
|
133
|
+
end
|
134
|
+
if redacted_sentence_array[index - 1] == original_token &&
|
135
|
+
diff.include?(original_sentence_array[i + 1]) &&
|
136
|
+
diff.include?(original_sentence_array[i + 2]) &&
|
137
|
+
diff.include?(original_sentence_array[i + 3]) &&
|
138
|
+
original_sentence_array[i + 4] == redacted_sentence_array[index + 1]
|
139
|
+
date_tokens << original_sentence_array[i + 1] + ' ' + original_sentence_array[i + 2] + ' ' + original_sentence_array[i + 3]
|
140
|
+
end
|
141
|
+
if redacted_sentence_array[index - 1] == original_token &&
|
142
|
+
diff.include?(original_sentence_array[i + 1]) &&
|
143
|
+
diff.include?(original_sentence_array[i + 2]) &&
|
144
|
+
diff.include?(original_sentence_array[i + 3]) &&
|
145
|
+
diff.include?(original_sentence_array[i + 4]) &&
|
146
|
+
original_sentence_array[i + 5] == redacted_sentence_array[index + 1]
|
147
|
+
date_tokens << original_sentence_array[i + 1] + ' ' + original_sentence_array[i + 2] + ' ' + original_sentence_array[i + 3] + ' ' + original_sentence_array[i + 4]
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
final_date_tokens = date_tokens.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
|
154
|
+
[redacted_text.gsub(/#{Regexp.escape(date_text)}/, "<span class='confidentialDate'>#{date_text}</span>"), final_date_tokens]
|
155
|
+
end
|
156
|
+
|
64
157
|
def redact_hyperlinks(txt)
|
65
158
|
ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{token_text}").gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ").gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.").gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
|
66
159
|
end
|
@@ -44,6 +44,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
44
44
|
end
|
45
45
|
end
|
46
46
|
|
47
|
+
describe '#dates_html' do
|
48
|
+
it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
|
49
|
+
text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
|
50
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
47
54
|
describe '#numbers' do
|
48
55
|
it 'redacts numbers from a text #001' do
|
49
56
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
|
@@ -61,6 +68,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
61
68
|
end
|
62
69
|
end
|
63
70
|
|
71
|
+
describe '#numbers_html' do
|
72
|
+
it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
|
73
|
+
text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
|
74
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html).to eq(["It was his <span class='confidentialNumber'>*****</span> time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
64
78
|
describe '#emails' do
|
65
79
|
it 'redacts email addresses from a text #001' do
|
66
80
|
text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
|
@@ -73,6 +87,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
73
87
|
end
|
74
88
|
end
|
75
89
|
|
90
|
+
describe '#emails_html' do
|
91
|
+
it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
|
92
|
+
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
93
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
76
97
|
describe '#hyperlinks' do
|
77
98
|
it 'redacts hyperlinks from a text #001' do
|
78
99
|
text = 'Visit https://www.tm-town.com for more info.'
|
@@ -80,6 +101,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
80
101
|
end
|
81
102
|
end
|
82
103
|
|
104
|
+
describe '#hyperlinks_html' do
|
105
|
+
it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
|
106
|
+
text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
|
107
|
+
expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
83
111
|
describe '#proper_nouns' do
|
84
112
|
it 'redacts tokens from a text #001' do
|
85
113
|
tokens = ['Coca-Cola', 'Pepsi']
|
@@ -181,4 +209,12 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
181
209
|
expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
182
210
|
end
|
183
211
|
end
|
212
|
+
|
213
|
+
describe '#redact_html' do
|
214
|
+
it 'redacts all confidential information from a text #001' do
|
215
|
+
tokens = ['Coca-Cola', 'Pepsi']
|
216
|
+
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
217
|
+
expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
218
|
+
end
|
219
|
+
end
|
184
220
|
end
|