confidential_info_redactor 0.0.18 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -11
- data/lib/confidential_info_redactor/date.rb +98 -118
- data/lib/confidential_info_redactor/extractor.rb +66 -33
- data/lib/confidential_info_redactor/hyperlink.rb +5 -17
- data/lib/confidential_info_redactor/redactor.rb +13 -13
- data/lib/confidential_info_redactor/version.rb +1 -1
- data/lib/confidential_info_redactor/word_lists.rb +2 -2
- data/lib/confidential_info_redactor.rb +2 -1
- data/spec/confidential_info_redactor/date_spec.rb +88 -88
- data/spec/confidential_info_redactor/extractor_spec.rb +20 -20
- data/spec/confidential_info_redactor/hyperlink_spec.rb +18 -18
- data/spec/confidential_info_redactor/redactor_spec.rb +22 -22
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5b5d282da6e72d809ac29188c1423775d513931
|
4
|
+
data.tar.gz: 09252aaf2b2e49f1360d03470d38c1d82a6e968d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50c47839331dba86546a92929140d5156599a8699a0d0ecb79f2f0cb767837c206ac14784c661818a1519d2f06b7c0323561477e868ee63aae94580acc191929
|
7
|
+
data.tar.gz: acfc20dd05bf76f7ed8b719643afb3017a302fd6d1cd1eb1f68060513a1059d1efb19c68efdb935c40860afea8d42c63bdc9a28ae99df52eaef0a4a5cdcc59dd
|
data/README.md
CHANGED
@@ -40,45 +40,45 @@ gem 'confidential_info_redactor'
|
|
40
40
|
```ruby
|
41
41
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
42
42
|
|
43
|
-
tokens = ConfidentialInfoRedactor::Extractor.new(text
|
43
|
+
tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
|
44
44
|
# => ["Coca-Cola", "Pepsi", "John Smith"]
|
45
45
|
|
46
|
-
ConfidentialInfoRedactor::Redactor.new(
|
46
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text)
|
47
47
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
|
48
48
|
|
49
49
|
# You can also just use a specific redactor
|
50
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
50
|
+
ConfidentialInfoRedactor::Redactor.new.dates(text)
|
51
51
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
52
52
|
|
53
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
53
|
+
ConfidentialInfoRedactor::Redactor.new.numbers(text)
|
54
54
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
55
55
|
|
56
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
56
|
+
ConfidentialInfoRedactor::Redactor.new.emails(text)
|
57
57
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
|
58
58
|
|
59
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
59
|
+
ConfidentialInfoRedactor::Redactor.new.hyperlinks(text)
|
60
60
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
|
61
61
|
|
62
|
-
ConfidentialInfoRedactor::Redactor.new(
|
62
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text)
|
63
63
|
# => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
64
64
|
|
65
65
|
# It is possible to 'turn off' any of the specific redactors
|
66
|
-
ConfidentialInfoRedactor::Redactor.new(
|
66
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text)
|
67
67
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
|
68
68
|
|
69
69
|
# German Example
|
70
70
|
text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
|
71
71
|
|
72
|
-
tokens = ConfidentialInfoRedactor::Extractor.new(
|
72
|
+
tokens = ConfidentialInfoRedactor::Extractor.new(language: 'de').extract(text)
|
73
73
|
# => ['Deutschen Bank']
|
74
74
|
|
75
|
-
ConfidentialInfoRedactor::Redactor.new(
|
75
|
+
ConfidentialInfoRedactor::Redactor.new(language: 'de', tokens: tokens).redact(text)
|
76
76
|
# => 'Viele Mitarbeiter der <redacted> suchen eine andere Arbeitsstelle.'
|
77
77
|
|
78
78
|
# It is also possible to change the redaction text
|
79
79
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
80
80
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
81
|
-
ConfidentialInfoRedactor::Redactor.new(
|
81
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)
|
82
82
|
# => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
|
83
83
|
```
|
84
84
|
|
@@ -1,14 +1,14 @@
|
|
1
1
|
module ConfidentialInfoRedactor
|
2
2
|
class Date
|
3
|
-
EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday)
|
4
|
-
EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
|
5
|
-
EN_MONTHS = %w(january february march april may june july august september october november december)
|
6
|
-
EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
7
|
-
|
8
|
-
DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend)
|
9
|
-
DE_DOW_ABBR = %w(mo di mi do fr sa so)
|
10
|
-
DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember)
|
11
|
-
DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez)
|
3
|
+
EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
|
4
|
+
EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
|
5
|
+
EN_MONTHS = %w(january february march april may june july august september october november december).freeze
|
6
|
+
EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze
|
7
|
+
|
8
|
+
DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend).freeze
|
9
|
+
DE_DOW_ABBR = %w(mo di mi do fr sa so).freeze
|
10
|
+
DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember).freeze
|
11
|
+
DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez).freeze
|
12
12
|
# Rubular: http://rubular.com/r/73CZ2HU0q6
|
13
13
|
DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}/
|
14
14
|
|
@@ -21,9 +21,8 @@ module ConfidentialInfoRedactor
|
|
21
21
|
# Rubular: http://rubular.com/r/mpVSeaKwdY
|
22
22
|
DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D/
|
23
23
|
|
24
|
-
attr_reader :
|
25
|
-
def initialize(
|
26
|
-
@string = string
|
24
|
+
attr_reader :language, :dow, :dow_abbr, :months, :months_abbr
|
25
|
+
def initialize(language:)
|
27
26
|
@language = language
|
28
27
|
case language
|
29
28
|
when 'en'
|
@@ -44,133 +43,114 @@ module ConfidentialInfoRedactor
|
|
44
43
|
end
|
45
44
|
end
|
46
45
|
|
47
|
-
def includes_date?
|
48
|
-
|
46
|
+
def includes_date?(text)
|
47
|
+
includes_long_date?(text) || includes_number_only_date?(text)
|
49
48
|
end
|
50
49
|
|
51
|
-
def replace
|
52
|
-
|
50
|
+
def replace(text)
|
51
|
+
return text unless is_an_array?
|
53
52
|
counter = 0
|
54
|
-
dow_abbr.
|
55
|
-
|
56
|
-
|
53
|
+
dow_abbr.map { |day| counter +=1 if text.include?('day') }
|
54
|
+
text = redact_dates(counter, text)
|
55
|
+
redact_regex(text)
|
56
|
+
end
|
57
|
+
|
58
|
+
def occurences(text)
|
59
|
+
replace(text).scan(/<redacted date>/).size
|
60
|
+
end
|
61
|
+
|
62
|
+
def replace_number_only_date(text)
|
63
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
64
|
+
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
65
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
66
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def is_an_array?
|
72
|
+
dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
|
73
|
+
end
|
74
|
+
|
75
|
+
def redact_dates(counter, text)
|
57
76
|
if counter > 0
|
58
|
-
|
59
|
-
|
60
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
61
|
-
end
|
62
|
-
months_abbr.each do |month|
|
63
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
64
|
-
end
|
65
|
-
end
|
66
|
-
dow.each do |day|
|
67
|
-
months.each do |month|
|
68
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
69
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
70
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
71
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
72
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
73
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
74
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
75
|
-
end
|
76
|
-
months_abbr.each do |month|
|
77
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
78
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
79
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
80
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
81
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
82
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
83
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
84
|
-
end
|
85
|
-
end
|
77
|
+
text = redact_dow_abbr(text)
|
78
|
+
text = redact_dow(text)
|
86
79
|
else
|
87
|
-
|
88
|
-
|
89
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
90
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
91
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
92
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
93
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
94
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
95
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
96
|
-
end
|
97
|
-
months_abbr.each do |month|
|
98
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
99
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
100
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
101
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
102
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
103
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
104
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
105
|
-
end
|
106
|
-
end
|
107
|
-
dow_abbr.each do |day|
|
108
|
-
months.each do |month|
|
109
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
110
|
-
end
|
111
|
-
months_abbr.each do |month|
|
112
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
113
|
-
end
|
114
|
-
end
|
80
|
+
text = redact_dow(text)
|
81
|
+
text = redact_dow_abbr(text)
|
115
82
|
end
|
116
|
-
|
117
|
-
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
118
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
119
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
83
|
+
text
|
120
84
|
end
|
121
85
|
|
122
|
-
def
|
123
|
-
|
86
|
+
def redact_regex(text)
|
87
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
88
|
+
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
89
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
90
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
124
91
|
end
|
125
92
|
|
126
|
-
def
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
93
|
+
def redact_dow(text)
|
94
|
+
dow.each do |day|
|
95
|
+
months.map { |month| text = redact_date(text, day, month) }
|
96
|
+
months_abbr.map { |month| text = redact_date(text, day, month) }
|
97
|
+
end
|
98
|
+
text
|
131
99
|
end
|
132
100
|
|
133
|
-
|
101
|
+
def redact_dow_abbr(text)
|
102
|
+
dow_abbr.each do |day|
|
103
|
+
months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
104
|
+
months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
105
|
+
end
|
106
|
+
text
|
107
|
+
end
|
134
108
|
|
135
|
-
def
|
136
|
-
|
109
|
+
def redact_date(text, day, month)
|
110
|
+
text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
111
|
+
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
112
|
+
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
113
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
114
|
+
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
115
|
+
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
116
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
117
|
+
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
118
|
+
end
|
119
|
+
|
120
|
+
def includes_long_date?(text)
|
121
|
+
includes_long_date_1?(text) || includes_long_date_2?(text)
|
122
|
+
end
|
123
|
+
|
124
|
+
def includes_long_date_1?(text)
|
137
125
|
dow.each do |day|
|
138
|
-
months.
|
139
|
-
|
140
|
-
match_found = check_for_matches(day, month)
|
141
|
-
end
|
142
|
-
months_abbr.each do |month|
|
143
|
-
break if match_found
|
144
|
-
match_found = check_for_matches(day, month)
|
145
|
-
end
|
126
|
+
months.map { |month| return true if check_for_matches(day, month, text) }
|
127
|
+
months_abbr.map { |month| return true if check_for_matches(day, month, text) }
|
146
128
|
end
|
129
|
+
false
|
130
|
+
end
|
131
|
+
|
132
|
+
def includes_long_date_2?(text)
|
147
133
|
dow_abbr.each do |day|
|
148
|
-
months.
|
149
|
-
|
150
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
151
|
-
end
|
152
|
-
months_abbr.each do |month|
|
153
|
-
break if match_found
|
154
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
155
|
-
end
|
134
|
+
months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
135
|
+
months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
156
136
|
end
|
157
|
-
|
137
|
+
false
|
158
138
|
end
|
159
139
|
|
160
|
-
def
|
161
|
-
!(
|
162
|
-
!(
|
163
|
-
!(
|
164
|
-
!(
|
140
|
+
def includes_number_only_date?(text)
|
141
|
+
!(text !~ DMY_MDY_REGEX) ||
|
142
|
+
!(text !~ YMD_YDM_REGEX) ||
|
143
|
+
!(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
|
144
|
+
!(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
|
165
145
|
end
|
166
146
|
|
167
|
-
def check_for_matches(day, month)
|
168
|
-
!(
|
169
|
-
!(
|
170
|
-
!(
|
171
|
-
!(
|
172
|
-
!(
|
173
|
-
!(
|
147
|
+
def check_for_matches(day, month, text)
|
148
|
+
!(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
149
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
150
|
+
!(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
|
151
|
+
!(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
|
152
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
|
153
|
+
!(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
|
174
154
|
end
|
175
155
|
end
|
176
156
|
end
|
@@ -4,10 +4,12 @@ module ConfidentialInfoRedactor
|
|
4
4
|
# This class extracts proper nouns from a text
|
5
5
|
class Extractor
|
6
6
|
# Rubular: http://rubular.com/r/qE0g4r9zR7
|
7
|
-
EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
|
8
|
+
|
9
|
+
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
10
|
+
|
11
|
+
attr_reader :language, :corpus
|
12
|
+
def initialize(**args)
|
11
13
|
@language = args[:language] || 'en'
|
12
14
|
case @language
|
13
15
|
when 'en'
|
@@ -19,38 +21,69 @@ module ConfidentialInfoRedactor
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
22
|
-
def extract
|
24
|
+
def extract(text)
|
23
25
|
extracted_terms = []
|
24
|
-
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
25
|
-
initial_extracted_terms = segment
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
26
|
+
PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
|
27
|
+
initial_extracted_terms = extract_preliminary_terms(segment)
|
28
|
+
search_ngrams(initial_extracted_terms, extracted_terms)
|
29
|
+
end
|
30
|
+
extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def extract_preliminary_terms(segment)
|
36
|
+
segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
37
|
+
end
|
38
|
+
|
39
|
+
def clean_token(token)
|
40
|
+
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
41
|
+
end
|
42
|
+
|
43
|
+
def non_confidential_token?(token, includes_confidential)
|
44
|
+
corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
|
45
|
+
end
|
46
|
+
|
47
|
+
def singular_in_corpus?(token)
|
48
|
+
corpus.include?(token[0...-1]) &&
|
49
|
+
token[-1].eql?('s') ||
|
50
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
|
51
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
|
52
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
|
53
|
+
corpus.include?(token[0...-1]) && token[-1].eql?('n')
|
54
|
+
end
|
55
|
+
|
56
|
+
def includes_confidential?(token)
|
57
|
+
token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
|
58
|
+
true
|
59
|
+
end
|
60
|
+
|
61
|
+
def matching_first_token?(tokens)
|
62
|
+
corpus.include?(tokens[0]) &&
|
63
|
+
tokens[0] != 'the' &&
|
64
|
+
tokens[0] != 'deutsche' &&
|
65
|
+
tokens.length.eql?(2)
|
66
|
+
end
|
67
|
+
|
68
|
+
def find_extracted_terms(string, extracted_terms)
|
69
|
+
cleaned_token_downcased = clean_token(string.downcase)
|
70
|
+
cleaned_token = clean_token(string)
|
71
|
+
tokens = cleaned_token_downcased.split(' ')
|
72
|
+
if matching_first_token?(tokens)
|
73
|
+
extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
|
74
|
+
else
|
75
|
+
extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
|
51
76
|
end
|
77
|
+
extracted_terms
|
78
|
+
end
|
52
79
|
|
53
|
-
|
80
|
+
def search_ngrams(tokens, extracted_terms)
|
81
|
+
tokens.each do |ngram|
|
82
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
83
|
+
next if !(t !~ /.*\d+.*/)
|
84
|
+
extracted_terms = find_extracted_terms(t, extracted_terms)
|
85
|
+
end
|
86
|
+
end
|
54
87
|
end
|
55
88
|
end
|
56
89
|
end
|
@@ -7,25 +7,13 @@ module ConfidentialInfoRedactor
|
|
7
7
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
8
8
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
@string = string
|
10
|
+
def hyperlink?(text)
|
11
|
+
!(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
|
13
12
|
end
|
14
13
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
def replace
|
20
|
-
new_string = string.dup
|
21
|
-
string.split(/\s+/).each do |token|
|
22
|
-
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
23
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
|
24
|
-
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
25
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
|
26
|
-
end
|
27
|
-
end
|
28
|
-
new_string
|
14
|
+
def replace(text)
|
15
|
+
text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ') if !(token !~ HYPERLINK_REGEX) }
|
16
|
+
text
|
29
17
|
end
|
30
18
|
end
|
31
19
|
end
|