confidential_info_redactor_lite 1.0.0 → 1.0.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -9
- data/lib/confidential_info_redactor_lite/redactor.rb +8 -2
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/performance_spec.rb +41 -41
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +10 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 22ce6189b4e4889ade8350442036d46a8e3c4be9
|
4
|
+
data.tar.gz: d636d8803c0f10de0afbd35c8a560a9cb81cc454
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 851e356e7cd32ac7c43d1070532ee596e70df2bd19433e794297ce7f94f4ee32895791b2719cbff3063b437c778c24784215ce6b7ef7577a93b9b47f56035019
|
7
|
+
data.tar.gz: 4500d3135dd6bd2fcdf9eece3eba5b89035fb644879c268515469865b843c9ba0756be2e797a5070387b70657ed414c173b22456543ccea0c02125c4e53db501
|
data/README.md
CHANGED
@@ -39,40 +39,40 @@ gem 'confidential_info_redactor_lite'
|
|
39
39
|
```ruby
|
40
40
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
41
41
|
corpus = ['array', 'of', 'common', 'english', 'words']
|
42
|
-
tokens = ConfidentialInfoRedactorLite::Extractor.new(
|
42
|
+
tokens = ConfidentialInfoRedactorLite::Extractor.new(corpus: corpus).extract(text)
|
43
43
|
# => ["Coca-Cola", "Pepsi", "John Smith"]
|
44
44
|
|
45
45
|
en_dow = %w(monday tuesday wednesday thursday friday saturday sunday)
|
46
46
|
en_dow_abbr = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
|
47
47
|
en_months = %w(january february march april may june july august september october november december)
|
48
48
|
en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
49
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
49
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
50
50
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
|
51
51
|
|
52
52
|
# You can also just use a specific redactor
|
53
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
53
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
|
54
54
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
55
55
|
|
56
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
56
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)
|
57
57
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
58
58
|
|
59
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
59
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)
|
60
60
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
|
61
61
|
|
62
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
62
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)
|
63
63
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
|
64
64
|
|
65
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
65
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)
|
66
66
|
# => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
67
67
|
|
68
68
|
# It is possible to 'turn off' any of the specific redactors
|
69
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
69
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
70
70
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
|
71
71
|
|
72
72
|
# It is also possible to change the redaction text
|
73
73
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
74
74
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
75
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
75
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
76
76
|
# => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
|
77
77
|
```
|
78
78
|
|
data/lib/confidential_info_redactor_lite/redactor.rb
CHANGED
@@ -5,7 +5,7 @@ module ConfidentialInfoRedactorLite
|
|
5
5
|
# This class redacts various tokens from a text
|
6
6
|
class Redactor
|
7
7
|
# Rubular: http://rubular.com/r/OI2wQZ0KSl
|
8
|
-
NUMBER_REGEX = /(?<=\A|\A\()[^(]?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s|\s\()[^(]?\d+((,|\.)*\d)*(?=(\D?\s|\s|\.?\s|\.$))|(?<=\s)\d+(nd|th|st)|(?<=\s)\d+\/\d+\"*(?=\s)|(?<=\()\S{1}\d+(?=\))|(?<=\s{1})\S{1}\d+\z|^\d+$/
|
8
|
+
NUMBER_REGEX = /(?<=\A|\A\()[^(]?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s|\s\(|\s'|\s‘)[^('‘]?\d+((,|\.)*\d)*(?=(\D?\s|\s|\.?\s|\.$))|(?<=\s)\d+(nd|th|st)|(?<=\s)\d+\/\d+\"*(?=\s)|(?<=\()\S{1}\d+(?=\))|(?<=\s{1})\S{1}\d+\z|^\d+$/
|
9
9
|
# Rubular: http://rubular.com/r/mxcj2G0Jfa
|
10
10
|
EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
|
11
11
|
|
@@ -117,7 +117,13 @@ module ConfidentialInfoRedactorLite
|
|
117
117
|
original_sentence_array = txt.split(' ')
|
118
118
|
redacted_sentence_array = redacted_text.split(' ')
|
119
119
|
diff = original_sentence_array - redacted_sentence_array
|
120
|
-
final_number_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
|
120
|
+
final_number_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
|
121
|
+
.map { |token| token[-1].eql?(')') ? token[0...-1] : token }
|
122
|
+
.map { |token| token[-1].eql?("'") ? token[0...-1] : token }
|
123
|
+
.map { |token| token[-1].eql?('’') ? token[0...-1] : token }
|
124
|
+
.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
125
|
+
.map { |token| token[0].eql?("'") ? token[1..token.length] : token }
|
126
|
+
.map { |token| token[0].eql?("‘") ? token[1..token.length] : token }
|
121
127
|
end
|
122
128
|
[redacted_text.gsub(/(?<=[^\>]|\A)#{Regexp.escape(number_text)}/, "<span class='confidentialNumber'>#{number_text}</span>"), final_number_tokens]
|
123
129
|
end
|