confidential_info_redactor_lite 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9a493d85c2cd235866ab329d987ebc634742127
|
4
|
+
data.tar.gz: a9fee602079d81f7d9e0970d59cca14197a8b31d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b77ba5ea584cb31bff9084d108eae3c0f2630a4bd9fe4a99603f12240fe267a14da5ee4be22fbb38bdfa5bba29054eca64cbb4a1b7755a0c926b2a43edec42b1
|
7
|
+
data.tar.gz: dc783fce3502f7e62f5ddabbca2e9c0901d9fbb5b88e2ec003ceb1c8dd6647cef766d0b876ad1bb6584570b98f73e8a4e652203adc042e20c4cb96ed83886e32
|
@@ -31,18 +31,34 @@ module ConfidentialInfoRedactorLite
|
|
31
31
|
redact_dates(text)
|
32
32
|
end
|
33
33
|
|
34
|
+
# Redacts the dates in the instance's text and wraps every redaction marker
# in a <span class='confidentialDate'> element.
#
# @return [Array] a two-element array as built by redact_dates_html:
#   the HTML string followed by the list of date strings that were redacted
def dates_html
  html_and_dates = redact_dates_html(text)
  html_and_dates
end
|
37
|
+
|
34
38
|
# Runs the number redactor over the instance's text.
#
# @return the result of redact_numbers(text); redact_numbers is defined
#   outside this excerpt -- presumably the text with numbers replaced by
#   number_text (confirm against its definition)
def numbers
  redacted_numbers = redact_numbers(text)
  redacted_numbers
end
|
37
41
|
|
42
|
+
# Redacts the numbers in the instance's text and wraps every number marker
# in a <span class='confidentialNumber'> element.
#
# @return [Array] a two-element array as built by redact_numbers_html:
#   the HTML string followed by the list of number strings that were redacted
def numbers_html
  html_and_numbers = redact_numbers_html(text)
  html_and_numbers
end
|
45
|
+
|
38
46
|
# Runs the email redactor over the instance's text.
#
# @return the result of redact_emails(text); redact_emails is defined
#   outside this excerpt -- presumably the text with email addresses
#   replaced by token_text (confirm against its definition)
def emails
  redacted_emails = redact_emails(text)
  redacted_emails
end
|
41
49
|
|
50
|
+
# Redacts the email addresses in the instance's text and wraps every
# redaction marker in a <span class='confidentialEmail'> element.
#
# @return [Array] a two-element array as built by redact_emails_html:
#   the HTML string followed by the list of email strings that were redacted
def emails_html
  html_and_emails = redact_emails_html(text)
  html_and_emails
end
|
53
|
+
|
42
54
|
# Runs the hyperlink redactor over the instance's text.
#
# @return [String] the text with each hyperlink placeholder swapped for
#   token_text, as produced by redact_hyperlinks
def hyperlinks
  redacted_links = redact_hyperlinks(text)
  redacted_links
end
|
45
57
|
|
58
|
+
# Redacts the hyperlinks in the instance's text and wraps every redaction
# marker in a <span class='confidentialHyperlinks'> element.
#
# @return [Array] a two-element array as built by redact_hyperlinks_html:
#   the HTML string followed by the list of hyperlinks that were redacted
def hyperlinks_html
  html_and_links = redact_hyperlinks_html(text)
  html_and_links
end
|
61
|
+
|
46
62
|
# Runs the token redactor over the instance's text.
#
# @return the result of redact_tokens(text); redact_tokens is defined
#   outside this excerpt -- presumably the text with the configured tokens
#   replaced by token_text (confirm against its definition)
def proper_nouns
  redacted_tokens = redact_tokens(text)
  redacted_tokens
end
|
@@ -59,8 +75,85 @@ module ConfidentialInfoRedactorLite
|
|
59
75
|
redact_tokens(redacted_text)
|
60
76
|
end
|
61
77
|
|
78
|
+
# Produces the HTML-annotated redaction of the instance's text.
#
# The date, email, hyperlink and number HTML redactors run in that order;
# each one receives the HTML produced by the previous step. Every redactor
# returns an [html, tokens] pair and only the HTML (element 0) is carried
# forward, so the lists of redacted tokens are discarded here.
#
# @return [String] the HTML produced by the final (number) step
def redact_html
  html_steps = %i[redact_dates_html redact_emails_html redact_hyperlinks_html redact_numbers_html]
  html_steps.inject(text) { |current_html, step| send(step, current_html)[0] }
end
|
84
|
+
|
62
85
|
private
|
63
86
|
|
87
|
+
# Redacts hyperlinks in +txt+ and wraps each hyperlink marker in a
# <span class='confidentialHyperlinks'> element.
#
# @param txt [String] text (possibly already containing redaction spans)
# @return [Array] [html, hyperlinks] -- the annotated HTML and the
#   whitespace-delimited words of +txt+ that no longer appear after redaction
def redact_hyperlinks_html(txt)
  redacted_text = redact_hyperlinks(txt)
  # redact_hyperlinks pads token_text with spaces; undo that padding where a
  # marker sits directly between '>' and '<' (i.e. inside an existing tag pair).
  redacted_text = redacted_text.gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<")
  redacted_text = redacted_text.gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<")
  redacted_text = redacted_text.gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")

  # Words of the original that are absent from the redacted text.
  removed_words = txt.split(' ') - redacted_text.split(' ')

  # Trim, in this order: one trailing '.', one trailing ')', one leading '('.
  final_hyperlinks_tokens = removed_words.map do |word|
    word = word[0...-1] if word[-1].eql?('.')
    word = word[0...-1] if word[-1].eql?(')')
    word = word[1..word.length] if word[0].eql?('(')
    word
  end

  # The lookbehind skips markers that directly follow '>', which is how
  # markers already wrapped in a span appear.
  annotated_html = redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>")
  [annotated_html, final_hyperlinks_tokens]
end
|
95
|
+
|
96
|
+
# Redacts numbers in +txt+ and wraps each number marker in a
# <span class='confidentialNumber'> element.
#
# @param txt [String] text (possibly already containing redaction spans)
# @return [Array] [html, numbers] -- the annotated HTML and the
#   whitespace-delimited words of +txt+ that no longer appear after
#   redaction, each with one trailing '.' removed
def redact_numbers_html(txt)
  # Collapse "> MARKER <" back to ">MARKER<" for every marker type, in the
  # order token, number, date. Block-form gsub inserts the marker literally,
  # so a marker containing a backslash is never read as a back-reference.
  redacted_text = [token_text, number_text, date_text].reduce(redact_numbers(txt)) do |memo, marker|
    memo.gsub(/\>\s#{Regexp.escape(marker)}\s\</) { ">#{marker}<" }
  end

  # Words of the original that are absent from the redacted text.
  final_number_tokens = (txt.split(' ') - redacted_text.split(' ')).map { |token| token.chomp('.') }

  # Wrap every marker that does not directly follow '>' (markers that do are
  # treated as already wrapped). A negative lookbehind is used so that a
  # marker at the very start of the text, which has no preceding character,
  # is wrapped as well.
  html = redacted_text.gsub(/(?<!\>)#{Regexp.escape(number_text)}/) do
    "<span class='confidentialNumber'>#{number_text}</span>"
  end
  [html, final_number_tokens]
end
|
104
|
+
|
105
|
+
# Redacts email addresses in +txt+ and wraps each marker in a
# <span class='confidentialEmail'> element.
#
# @param txt [String] text (possibly already containing redaction spans)
# @return [Array] [html, emails] -- the annotated HTML and the
#   whitespace-delimited words of +txt+ that no longer appear after
#   redaction, trimmed of one trailing '.', one trailing ')' and one
#   leading '('
def redact_emails_html(txt)
  # Collapse "> MARKER <" back to ">MARKER<" for every marker type, in the
  # order token, number, date. Block-form gsub inserts the marker literally,
  # so a marker containing a backslash is never read as a back-reference.
  redacted_text = [token_text, number_text, date_text].reduce(redact_emails(txt)) do |memo, marker|
    memo.gsub(/\>\s#{Regexp.escape(marker)}\s\</) { ">#{marker}<" }
  end

  # Words of the original that are absent from the redacted text, with
  # surrounding sentence punctuation / parentheses stripped.
  final_email_tokens = (txt.split(' ') - redacted_text.split(' ')).map do |token|
    trimmed = token.chomp('.').chomp(')')
    trimmed.start_with?('(') ? trimmed[1..-1] : trimmed
  end

  # Wrap every marker that does not directly follow '>' (markers that do are
  # treated as already wrapped). A negative lookbehind is used so that a
  # marker at the very start of the text, which has no preceding character,
  # is wrapped as well.
  html = redacted_text.gsub(/(?<!\>)#{Regexp.escape(token_text)}/) do
    "<span class='confidentialEmail'>#{token_text}</span>"
  end
  [html, final_email_tokens]
end
|
113
|
+
|
114
|
+
# Redacts dates in +txt+, wraps each date marker in a
# <span class='confidentialDate'> element and recovers the original date
# strings that were replaced.
#
# Recovery is heuristic: for every date marker in the redacted text, the
# original text is searched for a run of 1 to 4 removed words that sits
# between the same neighbouring words as the marker does.
#
# @param txt [String] text to redact
# @return [Array] [html, dates] -- the annotated HTML and the recovered date
#   strings, each with one trailing '.' removed
def redact_dates_html(txt)
  redacted_text = redact_dates(txt)
  original_tokens = txt.split(' ')
  redacted_tokens = redacted_text.split(' ')
  # Words of the original that are absent from the redacted text; dates are
  # assembled exclusively from these.
  removed_tokens = original_tokens - redacted_tokens

  date_tokens = []
  redacted_tokens.each_with_index do |redacted_token, index|
    # A marker may carry sentence punctuation ("*****."), so ignore periods.
    next unless redacted_token.delete('.') == date_text

    following_token = redacted_tokens[index + 1]
    # Positions in the original where the date could begin. A marker that is
    # the first word has no preceding word (index - 1 would wrap around to
    # the LAST word), so the date can only begin at position 0.
    candidate_starts =
      if index.zero?
        [0]
      else
        preceding_token = redacted_tokens[index - 1]
        original_tokens.each_index.select { |i| original_tokens[i] == preceding_token }.map { |i| i + 1 }
      end

    candidate_starts.each do |start|
      (1..4).each do |length|
        candidate = original_tokens[start, length]
        next unless candidate && candidate.length == length
        next unless candidate.all? { |token| removed_tokens.include?(token) }
        # The word after the run must match the word after the marker
        # (both are nil when the date ends the text).
        next unless original_tokens[start + length] == following_token

        date_tokens << candidate.join(' ')
      end
    end
  end

  final_date_tokens = date_tokens.map { |token| token.chomp('.') }
  # Block-form gsub inserts the marker literally, so a marker containing a
  # backslash is never read as a back-reference.
  html = redacted_text.gsub(/#{Regexp.escape(date_text)}/) do
    "<span class='confidentialDate'>#{date_text}</span>"
  end
  [html, final_date_tokens]
end
|
156
|
+
|
64
157
|
# Replaces hyperlinks in +txt+ with token_text.
#
# @param txt [String] text to redact
# @return [String] the text after Hyperlink#replace, with every "<redacted>"
#   placeholder swapped for token_text and the spacing around each marker
#   normalised
def redact_hyperlinks(txt)
  with_placeholders = ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace
  with_markers = with_placeholders.gsub(/<redacted>/, "#{token_text}")
  # Force exactly one space on each side of every marker...
  padded = with_markers.gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ")
  # ...then pull a following period or comma back against the marker.
  period_joined = padded.gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.")
  period_joined.gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
end
|
@@ -44,6 +44,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
44
44
|
end
|
45
45
|
end
|
46
46
|
|
47
|
+
describe '#dates_html' do
  it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
    text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
    redactor = described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****")
    # Both dates become date spans; the original date strings are returned alongside.
    expected_html = "On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>."
    expected_dates = ['May 1st, 2000', 'December 15th, 2020']
    expect(redactor.dates_html).to eq([expected_html, expected_dates])
  end
end
|
53
|
+
|
47
54
|
describe '#numbers' do
|
48
55
|
it 'redacts numbers from a text #001' do
|
49
56
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
|
@@ -61,6 +68,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
61
68
|
end
|
62
69
|
end
|
63
70
|
|
71
|
+
describe '#numbers_html' do
  it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
    text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
    redactor = described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****")
    expected_html = "It was his <span class='confidentialNumber'>*****</span> time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>."
    # Only a trailing period is trimmed from returned tokens, hence "10th," keeps its comma.
    expected_numbers = ["1st", "10th,", "2nd", "3/4\"", "$200,000"]
    expect(redactor.numbers_html).to eq([expected_html, expected_numbers])
  end
end
|
77
|
+
|
64
78
|
describe '#emails' do
|
65
79
|
it 'redacts email addresses from a text #001' do
|
66
80
|
text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
|
@@ -73,6 +87,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
73
87
|
end
|
74
88
|
end
|
75
89
|
|
90
|
+
describe '#emails_html' do
  it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
    text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
    redactor = described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****")
    # Parentheses stay in the HTML but are stripped from the returned addresses.
    expected_html = "His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>)."
    expected_emails = ["john@gmail.com", "k.light@tuv.eu.us"]
    expect(redactor.emails_html).to eq([expected_html, expected_emails])
  end
end
|
96
|
+
|
76
97
|
describe '#hyperlinks' do
|
77
98
|
it 'redacts hyperlinks from a text #001' do
|
78
99
|
text = 'Visit https://www.tm-town.com for more info.'
|
@@ -80,6 +101,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
80
101
|
end
|
81
102
|
end
|
82
103
|
|
104
|
+
describe '#hyperlinks_html' do
  it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
    text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
    redactor = described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****")
    expected_html = "Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>."
    # The sentence-final period is trimmed from the second returned link.
    expected_links = ["https://www.tm-town.com", "https://www.google.com"]
    expect(redactor.hyperlinks_html).to eq([expected_html, expected_links])
  end
end
|
110
|
+
|
83
111
|
describe '#proper_nouns' do
|
84
112
|
it 'redacts tokens from a text #001' do
|
85
113
|
tokens = ['Coca-Cola', 'Pepsi']
|
@@ -181,4 +209,12 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
181
209
|
expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
182
210
|
end
|
183
211
|
end
|
212
|
+
|
213
|
+
describe '#redact_html' do
  it 'redacts all confidential information from a text #001' do
    tokens = ['Coca-Cola', 'Pepsi']
    # NOTE: the doubled "on on" is present in both the input and the expectation.
    text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
    redactor = described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****')
    # The supplied tokens (Coca-Cola, Pepsi) are expected to remain unredacted here.
    expected_html = "Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>."
    expect(redactor.redact_html).to eq(expected_html)
  end
end
|
184
220
|
end
|