confidential_info_redactor 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +7 -0
- data/lib/confidential_info_redactor/extractor.rb +2 -2
- data/lib/confidential_info_redactor/redactor.rb +1 -0
- data/lib/confidential_info_redactor/version.rb +1 -1
- data/spec/confidential_info_redactor/extractor_spec.rb +5 -0
- data/spec/confidential_info_redactor/redactor_spec.rb +14 -0
- metadata +6 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 64cb868a941bb737cc49f0ea98334c89cb719a9fc01e3ada9997e29df07347d1
|
|
4
|
+
data.tar.gz: 158de52cd0bfb45766e66e45a200590bb0b95282bb49688161393691f75bb92b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6c455ad2194369013dc3a451196d9df40b9df359eeea0e62de3a5650398a3d139beed9b3db2638728f498fbfedb281b35ed9e8e3d64d311de9709370366358fa
|
|
7
|
+
data.tar.gz: 5174e8f4ef98a9194b0554bee6e326072bea9e3b4b03187db55b50f4cf4df39c38940bdc6283badb1c62ea63d4e3e82463b135ebff352d38e7c4b38446b44f14
|
data/README.md
CHANGED
|
@@ -62,6 +62,13 @@ ConfidentialInfoRedactor::Redactor.new.hyperlinks(text)
|
|
|
62
62
|
ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text)
|
|
63
63
|
# => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
|
64
64
|
|
|
65
|
+
# Possessive proper nouns are handled automatically
|
|
66
|
+
text = "The teacher took John's book from the shelf."
|
|
67
|
+
tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
|
|
68
|
+
# => ["John"]
|
|
69
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text)
|
|
70
|
+
# => "The teacher took <redacted>'s book from the shelf."
|
|
71
|
+
|
|
65
72
|
# It is possible to 'turn off' any of the specific redactors
|
|
66
73
|
ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text)
|
|
67
74
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
|
|
data/lib/confidential_info_redactor/extractor.rb
CHANGED
|
@@ -33,11 +33,11 @@ module ConfidentialInfoRedactor
|
|
|
33
33
|
private
|
|
34
34
|
|
|
35
35
|
def extract_preliminary_terms(segment)
|
|
36
|
-
segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
|
36
|
+
segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'s\z/, '').gsub(/\'$/, '')) }.compact
|
|
37
37
|
end
|
|
38
38
|
|
|
39
39
|
def clean_token(token)
|
|
40
|
-
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
|
40
|
+
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'s\z/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
def non_confidential_token?(token, includes_confidential)
|
|
data/lib/confidential_info_redactor/redactor.rb
CHANGED
|
@@ -75,6 +75,7 @@ module ConfidentialInfoRedactor
|
|
|
75
75
|
|
|
76
76
|
def redact_tokens(txt)
|
|
77
77
|
tokens.sort_by{ |x| x.split.count }.reverse.each do |token|
|
|
78
|
+
txt.gsub!(/(?<=\s|^|\")#{Regexp.escape(token)}(?=['\u2019]s\b)/, "#{token_text}")
|
|
78
79
|
txt.gsub!(/(?<=\s|^|\")#{Regexp.escape(token)}(?=\W|$)/, "#{token_text}")
|
|
79
80
|
end
|
|
80
81
|
txt.strip
|
|
data/spec/confidential_info_redactor/extractor_spec.rb
CHANGED
|
@@ -143,6 +143,11 @@ RSpec.describe ConfidentialInfoRedactor::Extractor do
|
|
|
143
143
|
text = 'John'
|
|
144
144
|
expect(described_class.new(language: 'en').extract(text)).to eq(['John'])
|
|
145
145
|
end
|
|
146
|
+
|
|
147
|
+
it 'extracts the proper nouns from a text #018' do
|
|
148
|
+
text = "The teacher took John's book from the shelf."
|
|
149
|
+
expect(described_class.new(language: 'en').extract(text)).to eq(['John'])
|
|
150
|
+
end
|
|
146
151
|
end
|
|
147
152
|
|
|
148
153
|
context 'German (de)' do
|
|
data/spec/confidential_info_redactor/redactor_spec.rb
CHANGED
|
@@ -181,5 +181,19 @@ RSpec.describe ConfidentialInfoRedactor::Redactor do
|
|
|
181
181
|
text = 'My Transformation - avoid Trans.'
|
|
182
182
|
expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****').redact(text)).to eq('My Transformation - avoid *****.')
|
|
183
183
|
end
|
|
184
|
+
|
|
185
|
+
it 'redacts all confidential information from a text #008' do
|
|
186
|
+
text = "The teacher took John's book from the shelf."
|
|
187
|
+
tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
|
|
188
|
+
expect(tokens).to eq(['John'])
|
|
189
|
+
expect(described_class.new(language: 'en', tokens: tokens).redact(text)).to eq("The teacher took <redacted>'s book from the shelf.")
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it 'redacts all confidential information from a text #009' do
|
|
193
|
+
text = "The cat's toy rolled under the couch and disappeared."
|
|
194
|
+
tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
|
|
195
|
+
expect(tokens).to eq([])
|
|
196
|
+
expect(described_class.new(language: 'en', tokens: tokens).redact(text)).to eq("The cat's toy rolled under the couch and disappeared.")
|
|
197
|
+
end
|
|
184
198
|
end
|
|
185
199
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: confidential_info_redactor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.1
|
|
4
|
+
version: 1.0.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kevin S. Dias
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-03-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -98,7 +98,7 @@ homepage: https://github.com/diasks2/confidential_info_redactor
|
|
|
98
98
|
licenses:
|
|
99
99
|
- MIT
|
|
100
100
|
metadata: {}
|
|
101
|
-
post_install_message:
|
|
101
|
+
post_install_message:
|
|
102
102
|
rdoc_options: []
|
|
103
103
|
require_paths:
|
|
104
104
|
- lib
|
|
@@ -113,9 +113,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
113
113
|
- !ruby/object:Gem::Version
|
|
114
114
|
version: '0'
|
|
115
115
|
requirements: []
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
signing_key:
|
|
116
|
+
rubygems_version: 3.3.26
|
|
117
|
+
signing_key:
|
|
119
118
|
specification_version: 4
|
|
120
119
|
summary: Semi-automatically redact confidential information from a text
|
|
121
120
|
test_files:
|