confidential_info_redactor_lite 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 216e96faca24f56d0c98efd9fe537d443ebf99ee
4
- data.tar.gz: 79fd2f8c443c6d3875298b860a81ba84b4d68113
3
+ metadata.gz: b9a493d85c2cd235866ab329d987ebc634742127
4
+ data.tar.gz: a9fee602079d81f7d9e0970d59cca14197a8b31d
5
5
  SHA512:
6
- metadata.gz: f82751b04aa5c32af2de0fc4048a3fa79a82dbf0a9fdbe63b8118371172c62fc0b0d9cffa9eb036e1529e2b66b1ff37ce77e1cb4ee849fe45cff55cfd0b5ab5d
7
- data.tar.gz: 7b2e643f54eb7bcfd624f16fc64ea2301f99cbad9c19a8b11c10faa1a3bd9282a319fbbd506ece94a3ea42246e8f310337a73ff2a5611632b8d0e9f04aab83e6
6
+ metadata.gz: b77ba5ea584cb31bff9084d108eae3c0f2630a4bd9fe4a99603f12240fe267a14da5ee4be22fbb38bdfa5bba29054eca64cbb4a1b7755a0c926b2a43edec42b1
7
+ data.tar.gz: dc783fce3502f7e62f5ddabbca2e9c0901d9fbb5b88e2ec003ceb1c8dd6647cef766d0b876ad1bb6584570b98f73e8a4e652203adc042e20c4cb96ed83886e32
@@ -31,18 +31,34 @@ module ConfidentialInfoRedactorLite
31
31
  redact_dates(text)
32
32
  end
33
33
 
34
# Redacts the dates in the instance text and marks each placeholder up
# for HTML display.
#
# @return [Array] two elements: the text with every date placeholder wrapped
#   in a <span class='confidentialDate'> element, and the list of date
#   strings that were removed
def dates_html
  redact_dates_html(text)
end
37
+
34
38
  def numbers
35
39
  redact_numbers(text)
36
40
  end
37
41
 
42
# Redacts the numbers in the instance text and marks each placeholder up
# for HTML display.
#
# @return [Array] two elements: the text with every number placeholder
#   wrapped in a <span class='confidentialNumber'> element, and the list of
#   tokens that were removed
def numbers_html
  redact_numbers_html(text)
end
45
+
38
46
  def emails
39
47
  redact_emails(text)
40
48
  end
41
49
 
50
# Redacts the email addresses in the instance text and marks each
# placeholder up for HTML display.
#
# @return [Array] two elements: the text with every email placeholder
#   wrapped in a <span class='confidentialEmail'> element, and the list of
#   addresses that were removed
def emails_html
  redact_emails_html(text)
end
53
+
42
54
  def hyperlinks
43
55
  redact_hyperlinks(text)
44
56
  end
45
57
 
58
# Redacts the hyperlinks in the instance text and marks each placeholder up
# for HTML display.
#
# @return [Array] two elements: the text with every hyperlink placeholder
#   wrapped in a <span class='confidentialHyperlinks'> element, and the list
#   of links that were removed
def hyperlinks_html
  redact_hyperlinks_html(text)
end
61
+
46
62
  def proper_nouns
47
63
  redact_tokens(text)
48
64
  end
@@ -59,8 +75,85 @@ module ConfidentialInfoRedactorLite
59
75
  redact_tokens(redacted_text)
60
76
  end
61
77
 
78
# Produces the HTML-marked-up redaction of the instance text.
#
# The passes run in a fixed order -- dates, emails, hyperlinks, numbers --
# and each pass receives the markup produced by the previous one. Only the
# marked-up text (element 0 of each pass result) is kept; the lists of
# removed tokens are discarded.
#
# @return [String] the text with every placeholder wrapped in its span
def redact_html
  passes = %i[redact_dates_html redact_emails_html redact_hyperlinks_html redact_numbers_html]
  passes.inject(text) { |markup, pass| send(pass, markup)[0] }
end
84
+
62
85
  private
63
86
 
87
+ def redact_hyperlinks_html(txt)
88
+ redacted_text = redact_hyperlinks(txt).gsub(/\>\s#{Regexp.escape(token_text)}\s\</, ">#{token_text}<").gsub(/\>\s#{Regexp.escape(number_text)}\s\</, ">#{number_text}<").gsub(/\>\s#{Regexp.escape(date_text)}\s\</, ">#{date_text}<")
89
+ original_sentence_array = txt.split(' ')
90
+ redacted_sentence_array = redacted_text.split(' ')
91
+ diff = original_sentence_array - redacted_sentence_array
92
+ final_hyperlinks_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }.map { |token| token[-1].eql?(')') ? token[0...-1] : token }.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
93
+ [redacted_text.gsub(/(?<=[^\>])#{Regexp.escape(token_text)}/, "<span class='confidentialHyperlinks'>#{token_text}</span>"), final_hyperlinks_tokens]
94
+ end
95
+
96
# Redacts numbers in +txt+ and wraps each placeholder in a
# <span class='confidentialNumber'> element.
#
# @param txt [String] text to scan; it may already contain spans emitted by
#   the other *_html passes
# @return [Array] two elements: the marked-up text, and the
#   whitespace-delimited tokens of +txt+ that disappeared during redaction
#   (with one trailing '.' stripped; other punctuation is kept)
def redact_numbers_html(txt)
  # Collapse "> placeholder <" back to ">placeholder<" for all three
  # placeholder kinds, mirroring the other *_html passes.
  # NOTE(review): redact_numbers is not visible here; presumably it pads
  # placeholders with spaces the way redact_hyperlinks does -- confirm.
  # Block-form gsub keeps backslashes in a placeholder from being read as
  # back-references.
  redacted_text = [token_text, number_text, date_text].inject(redact_numbers(txt)) do |markup, placeholder|
    markup.gsub(/\>\s#{Regexp.escape(placeholder)}\s\</) { ">#{placeholder}<" }
  end

  removed_tokens = txt.split(' ') - redacted_text.split(' ')
  final_number_tokens = removed_tokens.map { |token| token.chomp('.') }

  # Skip placeholders directly preceded by '>' (already wrapped by an earlier
  # pass). A negative lookbehind is used instead of (?<=[^>]) so that a
  # placeholder at the very start of the text is wrapped as well.
  marked_up = redacted_text.gsub(/(?<!\>)#{Regexp.escape(number_text)}/) do
    "<span class='confidentialNumber'>#{number_text}</span>"
  end

  [marked_up, final_number_tokens]
end
104
+
105
# Redacts email addresses in +txt+ and wraps each placeholder in a
# <span class='confidentialEmail'> element.
#
# @param txt [String] text to scan; it may already contain spans emitted by
#   the other *_html passes
# @return [Array] two elements: the marked-up text, and the
#   whitespace-delimited tokens of +txt+ that disappeared during redaction
#   (with one trailing '.', one trailing ')' and one leading '(' stripped)
def redact_emails_html(txt)
  # Collapse "> placeholder <" back to ">placeholder<" for all three
  # placeholder kinds, mirroring the other *_html passes.
  # NOTE(review): redact_emails is not visible here; presumably it pads
  # placeholders with spaces the way redact_hyperlinks does -- confirm.
  # Block-form gsub keeps backslashes in a placeholder from being read as
  # back-references.
  redacted_text = [token_text, number_text, date_text].inject(redact_emails(txt)) do |markup, placeholder|
    markup.gsub(/\>\s#{Regexp.escape(placeholder)}\s\</) { ">#{placeholder}<" }
  end

  removed_tokens = txt.split(' ') - redacted_text.split(' ')
  final_email_tokens = removed_tokens.map do |token|
    token.chomp('.').chomp(')').sub(/\A\(/, '')
  end

  # Skip placeholders directly preceded by '>' (already wrapped by an earlier
  # pass). A negative lookbehind is used instead of (?<=[^>]) so that a
  # placeholder at the very start of the text is wrapped as well.
  marked_up = redacted_text.gsub(/(?<!\>)#{Regexp.escape(token_text)}/) do
    "<span class='confidentialEmail'>#{token_text}</span>"
  end

  [marked_up, final_email_tokens]
end
113
+
114
# Redacts dates in +txt+, wraps each placeholder in a
# <span class='confidentialDate'> element and recovers the original date
# strings.
#
# A date is recovered by aligning the redacted token stream with the
# original one: for every placeholder, look for a run of one to four
# original tokens that (a) directly follows the token preceding the
# placeholder, (b) consists only of tokens that vanished during redaction and
# (c) is directly followed by the token that follows the placeholder.
#
# @param txt [String] text to scan
# @return [Array] two elements: the marked-up text, and the recovered date
#   strings (each with one trailing '.' stripped)
def redact_dates_html(txt)
  redacted_text = redact_dates(txt)
  original_tokens = txt.split(' ')
  redacted_tokens = redacted_text.split(' ')
  removed_tokens = original_tokens - redacted_tokens
  date_tokens = []

  redacted_tokens.each_with_index do |redacted_token, index|
    # A placeholder may carry sentence punctuation, e.g. "*****.".
    next unless redacted_token.gsub(/\./, '') == date_text

    following = redacted_tokens[index + 1]
    # Positions in the original text where the date could start. A leading
    # placeholder has no preceding token, so the date starts at position 0;
    # indexing with index - 1 there would wrap around to the last token.
    candidate_starts =
      if index.zero?
        [0]
      else
        preceding = redacted_tokens[index - 1]
        original_tokens.each_index.select { |i| original_tokens[i] == preceding }.map { |i| i + 1 }
      end

    candidate_starts.each do |start|
      (1..4).each do |run_length|
        run = original_tokens[start, run_length]
        next unless run.length == run_length
        next unless run.all? { |token| removed_tokens.include?(token) }
        # At the end of the text both sides are nil, which counts as a match.
        next unless original_tokens[start + run_length] == following

        date_tokens << run.join(' ')
      end
    end
  end

  final_date_tokens = date_tokens.map { |token| token.chomp('.') }
  # Block-form gsub keeps backslashes in date_text from being read as
  # back-references.
  marked_up = redacted_text.gsub(/#{Regexp.escape(date_text)}/) do
    "<span class='confidentialDate'>#{date_text}</span>"
  end

  [marked_up, final_date_tokens]
end
156
+
64
157
  def redact_hyperlinks(txt)
65
158
  ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted>/, "#{token_text}").gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ").gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.").gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
66
159
  end
@@ -1,3 +1,3 @@
1
1
  module ConfidentialInfoRedactorLite
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -44,6 +44,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
44
44
  end
45
45
  end
46
46
 
47
+ describe '#dates_html' do
48
+ it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
49
+ text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
50
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
51
+ end
52
+ end
53
+
47
54
  describe '#numbers' do
48
55
  it 'redacts numbers from a text #001' do
49
56
  text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
@@ -61,6 +68,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
61
68
  end
62
69
  end
63
70
 
71
+ describe '#numbers_html' do
72
+ it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
73
+ text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
74
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html).to eq(["It was his <span class='confidentialNumber'>*****</span> time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
75
+ end
76
+ end
77
+
64
78
  describe '#emails' do
65
79
  it 'redacts email addresses from a text #001' do
66
80
  text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
@@ -73,6 +87,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
73
87
  end
74
88
  end
75
89
 
90
+ describe '#emails_html' do
91
+ it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
92
+ text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
93
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'>*****</span>) or you can try (<span class='confidentialEmail'>*****</span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
94
+ end
95
+ end
96
+
76
97
  describe '#hyperlinks' do
77
98
  it 'redacts hyperlinks from a text #001' do
78
99
  text = 'Visit https://www.tm-town.com for more info.'
@@ -80,6 +101,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
80
101
  end
81
102
  end
82
103
 
104
+ describe '#hyperlinks_html' do
105
+ it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
106
+ text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
107
+ expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
108
+ end
109
+ end
110
+
83
111
  describe '#proper_nouns' do
84
112
  it 'redacts tokens from a text #001' do
85
113
  tokens = ['Coca-Cola', 'Pepsi']
@@ -181,4 +209,12 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
181
209
  expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
182
210
  end
183
211
  end
212
+
213
+ describe '#redact_html' do
214
+ it 'redacts all confidential information from a text #001' do
215
+ tokens = ['Coca-Cola', 'Pepsi']
216
+ text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
217
+ expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
218
+ end
219
+ end
184
220
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: confidential_info_redactor_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias