confidential_info_redactor 0.0.18 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -11
- data/lib/confidential_info_redactor/date.rb +98 -118
- data/lib/confidential_info_redactor/extractor.rb +66 -33
- data/lib/confidential_info_redactor/hyperlink.rb +5 -17
- data/lib/confidential_info_redactor/redactor.rb +13 -13
- data/lib/confidential_info_redactor/version.rb +1 -1
- data/lib/confidential_info_redactor/word_lists.rb +2 -2
- data/lib/confidential_info_redactor.rb +2 -1
- data/spec/confidential_info_redactor/date_spec.rb +88 -88
- data/spec/confidential_info_redactor/extractor_spec.rb +20 -20
- data/spec/confidential_info_redactor/hyperlink_spec.rb +18 -18
- data/spec/confidential_info_redactor/redactor_spec.rb +22 -22
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5b5d282da6e72d809ac29188c1423775d513931
|
4
|
+
data.tar.gz: 09252aaf2b2e49f1360d03470d38c1d82a6e968d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50c47839331dba86546a92929140d5156599a8699a0d0ecb79f2f0cb767837c206ac14784c661818a1519d2f06b7c0323561477e868ee63aae94580acc191929
|
7
|
+
data.tar.gz: acfc20dd05bf76f7ed8b719643afb3017a302fd6d1cd1eb1f68060513a1059d1efb19c68efdb935c40860afea8d42c63bdc9a28ae99df52eaef0a4a5cdcc59dd
|
data/README.md
CHANGED
@@ -40,45 +40,45 @@ gem 'confidential_info_redactor'
|
|
40
40
|
```ruby
|
41
41
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
42
42
|
|
43
|
-
tokens = ConfidentialInfoRedactor::Extractor.new(text
|
43
|
+
tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
|
44
44
|
# => ["Coca-Cola", "Pepsi", "John Smith"]
|
45
45
|
|
46
|
-
ConfidentialInfoRedactor::Redactor.new(
|
46
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text)
|
47
47
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
|
48
48
|
|
49
49
|
# You can also just use a specific redactor
|
50
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
50
|
+
ConfidentialInfoRedactor::Redactor.new.dates(text)
|
51
51
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
52
52
|
|
53
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
53
|
+
ConfidentialInfoRedactor::Redactor.new.numbers(text)
|
54
54
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
55
55
|
|
56
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
56
|
+
ConfidentialInfoRedactor::Redactor.new.emails(text)
|
57
57
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
|
58
58
|
|
59
|
-
ConfidentialInfoRedactor::Redactor.new(text
|
59
|
+
ConfidentialInfoRedactor::Redactor.new.hyperlinks(text)
|
60
60
|
# => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
|
61
61
|
|
62
|
-
ConfidentialInfoRedactor::Redactor.new(
|
62
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text)
|
63
63
|
# => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
64
64
|
|
65
65
|
# It is possible to 'turn off' any of the specific redactors
|
66
|
-
ConfidentialInfoRedactor::Redactor.new(
|
66
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text)
|
67
67
|
# => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
|
68
68
|
|
69
69
|
# German Example
|
70
70
|
text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
|
71
71
|
|
72
|
-
tokens = ConfidentialInfoRedactor::Extractor.new(
|
72
|
+
tokens = ConfidentialInfoRedactor::Extractor.new(language: 'de').extract(text)
|
73
73
|
# => ['Deutschen Bank']
|
74
74
|
|
75
|
-
ConfidentialInfoRedactor::Redactor.new(
|
75
|
+
ConfidentialInfoRedactor::Redactor.new(language: 'de', tokens: tokens).redact(text)
|
76
76
|
# => 'Viele Mitarbeiter der <redacted> suchen eine andere Arbeitsstelle.'
|
77
77
|
|
78
78
|
# It is also possible to change the redaction text
|
79
79
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
80
80
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
81
|
-
ConfidentialInfoRedactor::Redactor.new(
|
81
|
+
ConfidentialInfoRedactor::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)
|
82
82
|
# => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
|
83
83
|
```
|
84
84
|
|
data/lib/confidential_info_redactor/date.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
module ConfidentialInfoRedactor
|
2
2
|
class Date
|
3
|
-
EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday)
|
4
|
-
EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
|
5
|
-
EN_MONTHS = %w(january february march april may june july august september october november december)
|
6
|
-
EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
7
|
-
|
8
|
-
DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend)
|
9
|
-
DE_DOW_ABBR = %w(mo di mi do fr sa so)
|
10
|
-
DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember)
|
11
|
-
DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez)
|
3
|
+
EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
|
4
|
+
EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
|
5
|
+
EN_MONTHS = %w(january february march april may june july august september october november december).freeze
|
6
|
+
EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze
|
7
|
+
|
8
|
+
DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend).freeze
|
9
|
+
DE_DOW_ABBR = %w(mo di mi do fr sa so).freeze
|
10
|
+
DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember).freeze
|
11
|
+
DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez).freeze
|
12
12
|
# Rubular: http://rubular.com/r/73CZ2HU0q6
|
13
13
|
DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}/
|
14
14
|
|
@@ -21,9 +21,8 @@ module ConfidentialInfoRedactor
|
|
21
21
|
# Rubular: http://rubular.com/r/mpVSeaKwdY
|
22
22
|
DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D/
|
23
23
|
|
24
|
-
attr_reader :
|
25
|
-
def initialize(
|
26
|
-
@string = string
|
24
|
+
attr_reader :language, :dow, :dow_abbr, :months, :months_abbr
|
25
|
+
def initialize(language:)
|
27
26
|
@language = language
|
28
27
|
case language
|
29
28
|
when 'en'
|
@@ -44,133 +43,114 @@ module ConfidentialInfoRedactor
|
|
44
43
|
end
|
45
44
|
end
|
46
45
|
|
47
|
-
def includes_date?
|
48
|
-
|
46
|
+
def includes_date?(text)
|
47
|
+
includes_long_date?(text) || includes_number_only_date?(text)
|
49
48
|
end
|
50
49
|
|
51
|
-
def replace
|
52
|
-
|
50
|
+
def replace(text)
|
51
|
+
return text unless is_an_array?
|
53
52
|
counter = 0
|
54
|
-
dow_abbr.
|
55
|
-
|
56
|
-
|
53
|
+
dow_abbr.map { |day| counter +=1 if text.include?('day') }
|
54
|
+
text = redact_dates(counter, text)
|
55
|
+
redact_regex(text)
|
56
|
+
end
|
57
|
+
|
58
|
+
def occurences(text)
|
59
|
+
replace(text).scan(/<redacted date>/).size
|
60
|
+
end
|
61
|
+
|
62
|
+
def replace_number_only_date(text)
|
63
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
64
|
+
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
65
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
66
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def is_an_array?
|
72
|
+
dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
|
73
|
+
end
|
74
|
+
|
75
|
+
def redact_dates(counter, text)
|
57
76
|
if counter > 0
|
58
|
-
|
59
|
-
|
60
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
61
|
-
end
|
62
|
-
months_abbr.each do |month|
|
63
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
64
|
-
end
|
65
|
-
end
|
66
|
-
dow.each do |day|
|
67
|
-
months.each do |month|
|
68
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
69
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
70
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
71
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
72
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
73
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
74
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
75
|
-
end
|
76
|
-
months_abbr.each do |month|
|
77
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
78
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
79
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
80
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
81
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
82
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
83
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
84
|
-
end
|
85
|
-
end
|
77
|
+
text = redact_dow_abbr(text)
|
78
|
+
text = redact_dow(text)
|
86
79
|
else
|
87
|
-
|
88
|
-
|
89
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
90
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
91
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
92
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
93
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
94
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
95
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
96
|
-
end
|
97
|
-
months_abbr.each do |month|
|
98
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
99
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
100
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
101
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
102
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
103
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
104
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
105
|
-
end
|
106
|
-
end
|
107
|
-
dow_abbr.each do |day|
|
108
|
-
months.each do |month|
|
109
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
110
|
-
end
|
111
|
-
months_abbr.each do |month|
|
112
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
113
|
-
end
|
114
|
-
end
|
80
|
+
text = redact_dow(text)
|
81
|
+
text = redact_dow_abbr(text)
|
115
82
|
end
|
116
|
-
|
117
|
-
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
118
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
119
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
83
|
+
text
|
120
84
|
end
|
121
85
|
|
122
|
-
def
|
123
|
-
|
86
|
+
def redact_regex(text)
|
87
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
88
|
+
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
89
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
90
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
124
91
|
end
|
125
92
|
|
126
|
-
def
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
93
|
+
def redact_dow(text)
|
94
|
+
dow.each do |day|
|
95
|
+
months.map { |month| text = redact_date(text, day, month) }
|
96
|
+
months_abbr.map { |month| text = redact_date(text, day, month) }
|
97
|
+
end
|
98
|
+
text
|
131
99
|
end
|
132
100
|
|
133
|
-
|
101
|
+
def redact_dow_abbr(text)
|
102
|
+
dow_abbr.each do |day|
|
103
|
+
months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
104
|
+
months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
105
|
+
end
|
106
|
+
text
|
107
|
+
end
|
134
108
|
|
135
|
-
def
|
136
|
-
|
109
|
+
def redact_date(text, day, month)
|
110
|
+
text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
111
|
+
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
112
|
+
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
113
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
114
|
+
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
115
|
+
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
116
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
117
|
+
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
118
|
+
end
|
119
|
+
|
120
|
+
def includes_long_date?(text)
|
121
|
+
includes_long_date_1?(text) || includes_long_date_2?(text)
|
122
|
+
end
|
123
|
+
|
124
|
+
def includes_long_date_1?(text)
|
137
125
|
dow.each do |day|
|
138
|
-
months.
|
139
|
-
|
140
|
-
match_found = check_for_matches(day, month)
|
141
|
-
end
|
142
|
-
months_abbr.each do |month|
|
143
|
-
break if match_found
|
144
|
-
match_found = check_for_matches(day, month)
|
145
|
-
end
|
126
|
+
months.map { |month| return true if check_for_matches(day, month, text) }
|
127
|
+
months_abbr.map { |month| return true if check_for_matches(day, month, text) }
|
146
128
|
end
|
129
|
+
false
|
130
|
+
end
|
131
|
+
|
132
|
+
def includes_long_date_2?(text)
|
147
133
|
dow_abbr.each do |day|
|
148
|
-
months.
|
149
|
-
|
150
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
151
|
-
end
|
152
|
-
months_abbr.each do |month|
|
153
|
-
break if match_found
|
154
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
155
|
-
end
|
134
|
+
months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
135
|
+
months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
156
136
|
end
|
157
|
-
|
137
|
+
false
|
158
138
|
end
|
159
139
|
|
160
|
-
def
|
161
|
-
!(
|
162
|
-
!(
|
163
|
-
!(
|
164
|
-
!(
|
140
|
+
def includes_number_only_date?(text)
|
141
|
+
!(text !~ DMY_MDY_REGEX) ||
|
142
|
+
!(text !~ YMD_YDM_REGEX) ||
|
143
|
+
!(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
|
144
|
+
!(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
|
165
145
|
end
|
166
146
|
|
167
|
-
def check_for_matches(day, month)
|
168
|
-
!(
|
169
|
-
!(
|
170
|
-
!(
|
171
|
-
!(
|
172
|
-
!(
|
173
|
-
!(
|
147
|
+
def check_for_matches(day, month, text)
|
148
|
+
!(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
149
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
150
|
+
!(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
|
151
|
+
!(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
|
152
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
|
153
|
+
!(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
|
174
154
|
end
|
175
155
|
end
|
176
156
|
end
|
data/lib/confidential_info_redactor/extractor.rb
CHANGED
@@ -4,10 +4,12 @@ module ConfidentialInfoRedactor
|
|
4
4
|
# This class extracts proper nouns from a text
|
5
5
|
class Extractor
|
6
6
|
# Rubular: http://rubular.com/r/qE0g4r9zR7
|
7
|
-
EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
|
8
|
+
|
9
|
+
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
10
|
+
|
11
|
+
attr_reader :language, :corpus
|
12
|
+
def initialize(**args)
|
11
13
|
@language = args[:language] || 'en'
|
12
14
|
case @language
|
13
15
|
when 'en'
|
@@ -19,38 +21,69 @@ module ConfidentialInfoRedactor
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
22
|
-
def extract
|
24
|
+
def extract(text)
|
23
25
|
extracted_terms = []
|
24
|
-
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
25
|
-
initial_extracted_terms = segment
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
26
|
+
PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
|
27
|
+
initial_extracted_terms = extract_preliminary_terms(segment)
|
28
|
+
search_ngrams(initial_extracted_terms, extracted_terms)
|
29
|
+
end
|
30
|
+
extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def extract_preliminary_terms(segment)
|
36
|
+
segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
37
|
+
end
|
38
|
+
|
39
|
+
def clean_token(token)
|
40
|
+
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
41
|
+
end
|
42
|
+
|
43
|
+
def non_confidential_token?(token, includes_confidential)
|
44
|
+
corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
|
45
|
+
end
|
46
|
+
|
47
|
+
def singular_in_corpus?(token)
|
48
|
+
corpus.include?(token[0...-1]) &&
|
49
|
+
token[-1].eql?('s') ||
|
50
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
|
51
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
|
52
|
+
corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
|
53
|
+
corpus.include?(token[0...-1]) && token[-1].eql?('n')
|
54
|
+
end
|
55
|
+
|
56
|
+
def includes_confidential?(token)
|
57
|
+
token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
|
58
|
+
true
|
59
|
+
end
|
60
|
+
|
61
|
+
def matching_first_token?(tokens)
|
62
|
+
corpus.include?(tokens[0]) &&
|
63
|
+
tokens[0] != 'the' &&
|
64
|
+
tokens[0] != 'deutsche' &&
|
65
|
+
tokens.length.eql?(2)
|
66
|
+
end
|
67
|
+
|
68
|
+
def find_extracted_terms(string, extracted_terms)
|
69
|
+
cleaned_token_downcased = clean_token(string.downcase)
|
70
|
+
cleaned_token = clean_token(string)
|
71
|
+
tokens = cleaned_token_downcased.split(' ')
|
72
|
+
if matching_first_token?(tokens)
|
73
|
+
extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
|
74
|
+
else
|
75
|
+
extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
|
51
76
|
end
|
77
|
+
extracted_terms
|
78
|
+
end
|
52
79
|
|
53
|
-
|
80
|
+
def search_ngrams(tokens, extracted_terms)
|
81
|
+
tokens.each do |ngram|
|
82
|
+
ngram.split(PUNCTUATION_REGEX).each do |t|
|
83
|
+
next if !(t !~ /.*\d+.*/)
|
84
|
+
extracted_terms = find_extracted_terms(t, extracted_terms)
|
85
|
+
end
|
86
|
+
end
|
54
87
|
end
|
55
88
|
end
|
56
89
|
end
|
data/lib/confidential_info_redactor/hyperlink.rb
CHANGED
@@ -7,25 +7,13 @@ module ConfidentialInfoRedactor
|
|
7
7
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
8
8
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
@string = string
|
10
|
+
def hyperlink?(text)
|
11
|
+
!(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
|
13
12
|
end
|
14
13
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
def replace
|
20
|
-
new_string = string.dup
|
21
|
-
string.split(/\s+/).each do |token|
|
22
|
-
if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
|
23
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
|
24
|
-
elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
|
25
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
|
26
|
-
end
|
27
|
-
end
|
28
|
-
new_string
|
14
|
+
def replace(text)
|
15
|
+
text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ') if !(token !~ HYPERLINK_REGEX) }
|
16
|
+
text
|
29
17
|
end
|
30
18
|
end
|
31
19
|
end
|