confidential_info_redactor_lite 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -9
- data/lib/confidential_info_redactor_lite/redactor.rb +8 -2
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/performance_spec.rb +41 -41
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +10 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 22ce6189b4e4889ade8350442036d46a8e3c4be9
|
4
|
+
data.tar.gz: d636d8803c0f10de0afbd35c8a560a9cb81cc454
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 851e356e7cd32ac7c43d1070532ee596e70df2bd19433e794297ce7f94f4ee32895791b2719cbff3063b437c778c24784215ce6b7ef7577a93b9b47f56035019
|
7
|
+
data.tar.gz: 4500d3135dd6bd2fcdf9eece3eba5b89035fb644879c268515469865b843c9ba0756be2e797a5070387b70657ed414c173b22456543ccea0c02125c4e53db501
|
data/README.md
CHANGED
@@ -39,40 +39,40 @@ gem 'confidential_info_redactor_lite'
|
|
39
39
|
```ruby
|
40
40
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
41
41
|
corpus = ['array', 'of', 'common', 'english', 'words']
|
42
|
-
tokens = ConfidentialInfoRedactorLite::Extractor.new(
|
42
|
+
tokens = ConfidentialInfoRedactorLite::Extractor.new(corpus: corpus).extract(text)
|
43
43
|
# => ["Coca-Cola", "Pepsi", "John Smith"]
|
44
44
|
|
45
45
|
en_dow = %w(monday tuesday wednesday thursday friday saturday sunday)
|
46
46
|
en_dow_abbr = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
|
47
47
|
en_months = %w(january february march april may june july august september october november december)
|
48
48
|
en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
49
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
49
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
50
50
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
|
51
51
|
|
52
52
|
# You can also just use a specific redactor
|
53
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
53
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
|
54
54
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
55
55
|
|
56
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
56
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)
|
57
57
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
58
58
|
|
59
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
59
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)
|
60
60
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
|
61
61
|
|
62
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
62
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)
|
63
63
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
|
64
64
|
|
65
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
65
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)
|
66
66
|
# => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
67
67
|
|
68
68
|
# It is possible to 'turn off' any of the specific redactors
|
69
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
69
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
70
70
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
|
71
71
|
|
72
72
|
# It is also possible to change the redaction text
|
73
73
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
74
74
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
75
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
75
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
76
76
|
# => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
|
77
77
|
```
|
78
78
|
|
@@ -5,7 +5,7 @@ module ConfidentialInfoRedactorLite
|
|
5
5
|
# This class redacts various tokens from a text
|
6
6
|
class Redactor
|
7
7
|
# Rubular: http://rubular.com/r/OI2wQZ0KSl
|
8
|
-
NUMBER_REGEX = /(?<=\A|\A\()[^(]?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s|\s\()[^(]?\d+((,|\.)*\d)*(?=(\D?\s|\s|\.?\s|\.$))|(?<=\s)\d+(nd|th|st)|(?<=\s)\d+\/\d+\"*(?=\s)|(?<=\()\S{1}\d+(?=\))|(?<=\s{1})\S{1}\d+\z|^\d+$/
|
8
|
+
NUMBER_REGEX = /(?<=\A|\A\()[^(]?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s|\s\(|\s'|\s‘)[^('‘]?\d+((,|\.)*\d)*(?=(\D?\s|\s|\.?\s|\.$))|(?<=\s)\d+(nd|th|st)|(?<=\s)\d+\/\d+\"*(?=\s)|(?<=\()\S{1}\d+(?=\))|(?<=\s{1})\S{1}\d+\z|^\d+$/
|
9
9
|
# Rubular: http://rubular.com/r/mxcj2G0Jfa
|
10
10
|
EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
|
11
11
|
|
@@ -117,7 +117,13 @@ module ConfidentialInfoRedactorLite
|
|
117
117
|
original_sentence_array = txt.split(' ')
|
118
118
|
redacted_sentence_array = redacted_text.split(' ')
|
119
119
|
diff = original_sentence_array - redacted_sentence_array
|
120
|
-
final_number_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
|
120
|
+
final_number_tokens = diff.map { |token| token[-1].eql?('.') ? token[0...-1] : token }
|
121
|
+
.map { |token| token[-1].eql?(')') ? token[0...-1] : token }
|
122
|
+
.map { |token| token[-1].eql?("'") ? token[0...-1] : token }
|
123
|
+
.map { |token| token[-1].eql?('’') ? token[0...-1] : token }
|
124
|
+
.map { |token| token[0].eql?('(') ? token[1..token.length] : token }
|
125
|
+
.map { |token| token[0].eql?("'") ? token[1..token.length] : token }
|
126
|
+
.map { |token| token[0].eql?("‘") ? token[1..token.length] : token }
|
121
127
|
end
|
122
128
|
[redacted_text.gsub(/(?<=[^\>]|\A)#{Regexp.escape(number_text)}/, "<span class='confidentialNumber'>#{number_text}</span>"), final_number_tokens]
|
123
129
|
end
|