confidential_info_redactor_lite 0.0.34 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/date.rb +89 -115
- data/lib/confidential_info_redactor_lite/extractor.rb +44 -27
- data/lib/confidential_info_redactor_lite/hyperlink.rb +3 -11
- data/lib/confidential_info_redactor_lite/redactor.rb +15 -16
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/date_spec.rb +184 -184
- data/spec/confidential_info_redactor_lite/extractor_spec.rb +29 -24
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +4 -4
- data/spec/confidential_info_redactor_lite/performance_spec.rb +16 -10
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +41 -41
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cc0f357a427f4cc05da05abd1d0c89544f18e34
|
4
|
+
data.tar.gz: f76b54f78599ac06388f649a95a48f3bafe9e248
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e79cdf4659e79523dccba90d68b64e6b963511eded9621c7db79f15864292f1508b2034354511199d5e140da71e3421ea3d1bbc5e617d67a0873aa4bb3ae6504
|
7
|
+
data.tar.gz: e2f843ce61d278521a4ebd75e3d71979947f9cd53d0fe4e825ca49d305fb86d53088d96c6bd43238e594ae77369ce4143fe61a2b19f5d704817d47d529c29f74
|
@@ -16,103 +16,33 @@ module ConfidentialInfoRedactorLite
|
|
16
16
|
|
17
17
|
JA_DATE_REGEX_SHORT = /[0123456789]+月[0123456789]+日/
|
18
18
|
|
19
|
-
attr_reader :
|
20
|
-
def initialize(
|
21
|
-
@string = string
|
19
|
+
attr_reader :dow, :dow_abbr, :months, :months_abbr
|
20
|
+
def initialize(dow:, dow_abbr:, months:, months_abbr:)
|
22
21
|
@dow = dow
|
23
22
|
@dow_abbr = dow_abbr
|
24
23
|
@months = months
|
25
24
|
@months_abbr = months_abbr
|
26
25
|
end
|
27
26
|
|
28
|
-
def includes_date?
|
29
|
-
|
27
|
+
def includes_date?(text)
|
28
|
+
includes_long_date?(text) || includes_number_only_date?(text)
|
30
29
|
end
|
31
30
|
|
32
|
-
def replace
|
33
|
-
return
|
34
|
-
new_string = string.dup
|
31
|
+
def replace(text)
|
32
|
+
return text unless is_an_array?
|
35
33
|
counter = 0
|
36
|
-
dow_abbr.
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
new_string = new_string.gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
|
41
|
-
if counter > 0
|
42
|
-
dow_abbr.each do |day|
|
43
|
-
months.each do |month|
|
44
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
45
|
-
end
|
46
|
-
months_abbr.each do |month|
|
47
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
48
|
-
end
|
49
|
-
end
|
50
|
-
dow.each do |day|
|
51
|
-
months.each do |month|
|
52
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
53
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
54
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
55
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
56
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
57
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
58
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
59
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
60
|
-
end
|
61
|
-
months_abbr.each do |month|
|
62
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
63
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
64
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
65
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
66
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
67
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
68
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
69
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
70
|
-
end
|
71
|
-
end
|
72
|
-
else
|
73
|
-
dow.each do |day|
|
74
|
-
months.each do |month|
|
75
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
76
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
77
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
78
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
79
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
80
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
81
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
82
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
83
|
-
end
|
84
|
-
months_abbr.each do |month|
|
85
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
86
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
87
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
88
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
89
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
90
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
91
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
92
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
93
|
-
end
|
94
|
-
end
|
95
|
-
dow_abbr.each do |day|
|
96
|
-
months.each do |month|
|
97
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
98
|
-
end
|
99
|
-
months_abbr.each do |month|
|
100
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
new_string = new_string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
105
|
-
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
106
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
107
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
34
|
+
dow_abbr.map { |day| counter +=1 if text.include?('day') }
|
35
|
+
text = text.gsub(JA_DATE_REGEX_LONG, '<redacted date>').gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
|
36
|
+
text = redact_dates(counter, text)
|
37
|
+
redact_regex(text)
|
108
38
|
end
|
109
39
|
|
110
|
-
def occurences
|
111
|
-
replace.scan(/<redacted date>/).size
|
40
|
+
def occurences(text)
|
41
|
+
replace(text).scan(/<redacted date>/).size
|
112
42
|
end
|
113
43
|
|
114
|
-
def replace_number_only_date
|
115
|
-
|
44
|
+
def replace_number_only_date(text)
|
45
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
116
46
|
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
117
47
|
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
118
48
|
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
@@ -120,45 +50,89 @@ module ConfidentialInfoRedactorLite
|
|
120
50
|
|
121
51
|
private
|
122
52
|
|
123
|
-
def
|
124
|
-
|
53
|
+
def is_an_array?
|
54
|
+
dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
|
55
|
+
end
|
56
|
+
|
57
|
+
def redact_dates(counter, text)
|
58
|
+
if counter > 0
|
59
|
+
text = redact_dow_abbr(text)
|
60
|
+
text = redact_dow(text)
|
61
|
+
else
|
62
|
+
text = redact_dow(text)
|
63
|
+
text = redact_dow_abbr(text)
|
64
|
+
end
|
65
|
+
text
|
66
|
+
end
|
67
|
+
|
68
|
+
def redact_regex(text)
|
69
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
70
|
+
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
71
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
72
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
73
|
+
end
|
74
|
+
|
75
|
+
def redact_dow(text)
|
125
76
|
dow.each do |day|
|
126
|
-
months.
|
127
|
-
|
128
|
-
match_found = check_for_matches(day, month)
|
129
|
-
end
|
130
|
-
months_abbr.each do |month|
|
131
|
-
break if match_found
|
132
|
-
match_found = check_for_matches(day, month)
|
133
|
-
end
|
77
|
+
months.map { |month| text = redact_date(text, day, month) }
|
78
|
+
months_abbr.map { |month| text = redact_date(text, day, month) }
|
134
79
|
end
|
80
|
+
text
|
81
|
+
end
|
82
|
+
|
83
|
+
def redact_dow_abbr(text)
|
84
|
+
dow_abbr.each do |day|
|
85
|
+
months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
86
|
+
months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
87
|
+
end
|
88
|
+
text
|
89
|
+
end
|
90
|
+
|
91
|
+
def redact_date(text, day, month)
|
92
|
+
text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
93
|
+
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
94
|
+
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
95
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
96
|
+
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
97
|
+
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
98
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
99
|
+
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
100
|
+
end
|
101
|
+
|
102
|
+
def includes_long_date?(text)
|
103
|
+
includes_long_date_1?(text) || includes_long_date_2?(text)
|
104
|
+
end
|
105
|
+
|
106
|
+
def includes_long_date_1?(text)
|
107
|
+
dow.each do |day|
|
108
|
+
months.map { |month| return true if check_for_matches(day, month, text) }
|
109
|
+
months_abbr.map { |month| return true if check_for_matches(day, month, text) }
|
110
|
+
end
|
111
|
+
false
|
112
|
+
end
|
113
|
+
|
114
|
+
def includes_long_date_2?(text)
|
135
115
|
dow_abbr.each do |day|
|
136
|
-
months.
|
137
|
-
|
138
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
139
|
-
end
|
140
|
-
months_abbr.each do |month|
|
141
|
-
break if match_found
|
142
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
143
|
-
end
|
116
|
+
months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
117
|
+
months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
144
118
|
end
|
145
|
-
|
119
|
+
false
|
146
120
|
end
|
147
121
|
|
148
|
-
def
|
149
|
-
!(
|
150
|
-
!(
|
151
|
-
!(
|
152
|
-
!(
|
122
|
+
def includes_number_only_date?(text)
|
123
|
+
!(text !~ DMY_MDY_REGEX) ||
|
124
|
+
!(text !~ YMD_YDM_REGEX) ||
|
125
|
+
!(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
|
126
|
+
!(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
|
153
127
|
end
|
154
128
|
|
155
|
-
def check_for_matches(day, month)
|
156
|
-
!(
|
157
|
-
!(
|
158
|
-
!(
|
159
|
-
!(
|
160
|
-
!(
|
161
|
-
!(
|
129
|
+
def check_for_matches(day, month, text)
|
130
|
+
!(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
131
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
132
|
+
!(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
|
133
|
+
!(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
|
134
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
|
135
|
+
!(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
|
162
136
|
end
|
163
137
|
end
|
164
138
|
end
|
@@ -5,18 +5,17 @@ module ConfidentialInfoRedactorLite
|
|
5
5
|
EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
|
6
6
|
|
7
7
|
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
8
|
-
attr_reader :
|
9
|
-
def initialize(
|
10
|
-
@text = text.gsub(/[’‘]/, "'").freeze
|
8
|
+
attr_reader :language, :corpus
|
9
|
+
def initialize(corpus:, **args)
|
11
10
|
@corpus = Set.new(corpus).freeze
|
12
11
|
@language = args[:language] || 'en'
|
13
12
|
end
|
14
13
|
|
15
|
-
def extract
|
14
|
+
def extract(text)
|
16
15
|
extracted_terms = []
|
17
|
-
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
16
|
+
PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
|
18
17
|
initial_extracted_terms = extract_preliminary_terms(segment)
|
19
|
-
next if initial_extracted_terms.length.eql?(segment.split(' ').length) &&
|
18
|
+
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
|
20
19
|
search_ngrams(initial_extracted_terms, extracted_terms)
|
21
20
|
end
|
22
21
|
extracted_terms.uniq.reject(&:empty?)
|
@@ -28,36 +27,54 @@ module ConfidentialInfoRedactorLite
|
|
28
27
|
segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
29
28
|
end
|
30
29
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
30
|
+
def in_corpus?(tokens)
|
31
|
+
tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
|
32
|
+
end
|
33
|
+
|
34
|
+
def clean_token(token)
|
35
|
+
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
36
|
+
end
|
37
|
+
|
38
|
+
def non_confidential_token?(token, includes_confidential)
|
39
|
+
corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
|
40
|
+
end
|
41
|
+
|
42
|
+
def singular_in_corpus?(token)
|
43
|
+
corpus.include?(token[0...-1]) &&
|
44
|
+
token[-1].eql?('s')
|
45
|
+
end
|
46
|
+
|
47
|
+
def includes_confidential?(token)
|
48
|
+
token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
|
49
|
+
true
|
50
|
+
end
|
51
|
+
|
52
|
+
def matching_first_token?(tokens)
|
53
|
+
corpus.include?(tokens[0]) &&
|
54
|
+
tokens[0] != 'the' &&
|
55
|
+
tokens[0] != 'deutsche' &&
|
56
|
+
tokens.length.eql?(2)
|
57
|
+
end
|
58
|
+
|
59
|
+
def find_extracted_terms(string, extracted_terms)
|
60
|
+
cleaned_token_downcased = clean_token(string.downcase)
|
61
|
+
cleaned_token = clean_token(string)
|
62
|
+
tokens = cleaned_token_downcased.split(' ')
|
63
|
+
if matching_first_token?(tokens)
|
64
|
+
extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
|
65
|
+
else
|
66
|
+
extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
|
39
67
|
end
|
40
|
-
|
68
|
+
extracted_terms
|
41
69
|
end
|
42
70
|
|
43
71
|
def search_ngrams(tokens, extracted_terms)
|
44
72
|
tokens.each do |ngram|
|
45
73
|
ngram.split(PUNCTUATION_REGEX).each do |t|
|
46
74
|
next if !(t !~ /.*\d+.*/)
|
47
|
-
|
48
|
-
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
|
49
|
-
else
|
50
|
-
tracker = true
|
51
|
-
unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
52
|
-
t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
53
|
-
tracker = false if corpus.include?(token.downcase)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
|
57
|
-
end
|
75
|
+
extracted_terms = find_extracted_terms(t, extracted_terms)
|
58
76
|
end
|
59
77
|
end
|
60
|
-
extracted_terms
|
61
78
|
end
|
62
79
|
end
|
63
80
|
end
|
@@ -4,17 +4,9 @@ module ConfidentialInfoRedactorLite
|
|
4
4
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
5
5
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
-
|
12
|
-
def replace
|
13
|
-
new_string = string.dup
|
14
|
-
string.split(/\s+/).each do |token|
|
15
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
|
16
|
-
end
|
17
|
-
new_string
|
7
|
+
def replace(text)
|
8
|
+
text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX) }
|
9
|
+
text
|
18
10
|
end
|
19
11
|
end
|
20
12
|
end
|
@@ -9,9 +9,8 @@ module ConfidentialInfoRedactorLite
|
|
9
9
|
# Rubular: http://rubular.com/r/mxcj2G0Jfa
|
10
10
|
EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
|
11
11
|
|
12
|
-
attr_reader :
|
13
|
-
def initialize(
|
14
|
-
@text = text
|
12
|
+
attr_reader :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
|
13
|
+
def initialize(dow:, dow_abbr:, months:, months_abbr:, **args)
|
15
14
|
@language = args[:language] || 'en'
|
16
15
|
@tokens = args[:tokens]
|
17
16
|
@number_text = args[:number_text] || '<redacted number>'
|
@@ -29,52 +28,52 @@ module ConfidentialInfoRedactorLite
|
|
29
28
|
@months_abbr = months_abbr
|
30
29
|
end
|
31
30
|
|
32
|
-
def dates
|
31
|
+
def dates(text)
|
33
32
|
return '' if text.nil?
|
34
33
|
redact_dates(text)
|
35
34
|
end
|
36
35
|
|
37
|
-
def dates_html
|
36
|
+
def dates_html(text)
|
38
37
|
return [] if text.nil?
|
39
38
|
redact_dates_html(text)
|
40
39
|
end
|
41
40
|
|
42
|
-
def numbers
|
41
|
+
def numbers(text)
|
43
42
|
return '' if text.nil?
|
44
43
|
redact_numbers(text)
|
45
44
|
end
|
46
45
|
|
47
|
-
def numbers_html
|
46
|
+
def numbers_html(text)
|
48
47
|
return [] if text.nil?
|
49
48
|
redact_numbers_html(text)
|
50
49
|
end
|
51
50
|
|
52
|
-
def emails
|
51
|
+
def emails(text)
|
53
52
|
return '' if text.nil?
|
54
53
|
redact_emails(text)
|
55
54
|
end
|
56
55
|
|
57
|
-
def emails_html
|
56
|
+
def emails_html(text)
|
58
57
|
return [] if text.nil?
|
59
58
|
redact_emails_html(text)
|
60
59
|
end
|
61
60
|
|
62
|
-
def hyperlinks
|
61
|
+
def hyperlinks(text)
|
63
62
|
return '' if text.nil?
|
64
63
|
redact_hyperlinks(text)
|
65
64
|
end
|
66
65
|
|
67
|
-
def hyperlinks_html
|
66
|
+
def hyperlinks_html(text)
|
68
67
|
return [] if text.nil?
|
69
68
|
redact_hyperlinks_html(text)
|
70
69
|
end
|
71
70
|
|
72
|
-
def proper_nouns
|
71
|
+
def proper_nouns(text)
|
73
72
|
return '' if text.nil?
|
74
73
|
redact_tokens(text)
|
75
74
|
end
|
76
75
|
|
77
|
-
def redact
|
76
|
+
def redact(text)
|
78
77
|
return '' if text.nil?
|
79
78
|
if ignore_emails
|
80
79
|
redacted_text = text
|
@@ -87,7 +86,7 @@ module ConfidentialInfoRedactorLite
|
|
87
86
|
redact_tokens(redacted_text)
|
88
87
|
end
|
89
88
|
|
90
|
-
def redact_html
|
89
|
+
def redact_html(text)
|
91
90
|
return [] if text.nil?
|
92
91
|
redacted_text = redact_dates_html(text)[0]
|
93
92
|
redacted_text = redact_emails_html(redacted_text)[0]
|
@@ -183,11 +182,11 @@ module ConfidentialInfoRedactorLite
|
|
183
182
|
end
|
184
183
|
|
185
184
|
def redact_hyperlinks(txt)
|
186
|
-
ConfidentialInfoRedactorLite::Hyperlink.new(
|
185
|
+
ConfidentialInfoRedactorLite::Hyperlink.new.replace(txt).gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
|
187
186
|
end
|
188
187
|
|
189
188
|
def redact_dates(txt)
|
190
|
-
ConfidentialInfoRedactorLite::Date.new(
|
189
|
+
ConfidentialInfoRedactorLite::Date.new(dow: dow, dow_abbr: dow_abbr, months: months, months_abbr: months_abbr).replace(txt).gsub(/<redacted date>/, "#{date_text}").gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ").gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ").gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
|
191
190
|
end
|
192
191
|
|
193
192
|
def redact_numbers(txt)
|