confidential_info_redactor_lite 0.0.34 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/date.rb +89 -115
- data/lib/confidential_info_redactor_lite/extractor.rb +44 -27
- data/lib/confidential_info_redactor_lite/hyperlink.rb +3 -11
- data/lib/confidential_info_redactor_lite/redactor.rb +15 -16
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/date_spec.rb +184 -184
- data/spec/confidential_info_redactor_lite/extractor_spec.rb +29 -24
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +4 -4
- data/spec/confidential_info_redactor_lite/performance_spec.rb +16 -10
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +41 -41
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cc0f357a427f4cc05da05abd1d0c89544f18e34
|
4
|
+
data.tar.gz: f76b54f78599ac06388f649a95a48f3bafe9e248
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e79cdf4659e79523dccba90d68b64e6b963511eded9621c7db79f15864292f1508b2034354511199d5e140da71e3421ea3d1bbc5e617d67a0873aa4bb3ae6504
|
7
|
+
data.tar.gz: e2f843ce61d278521a4ebd75e3d71979947f9cd53d0fe4e825ca49d305fb86d53088d96c6bd43238e594ae77369ce4143fe61a2b19f5d704817d47d529c29f74
|
@@ -16,103 +16,33 @@ module ConfidentialInfoRedactorLite
|
|
16
16
|
|
17
17
|
JA_DATE_REGEX_SHORT = /[0123456789]+月[0123456789]+日/
|
18
18
|
|
19
|
-
attr_reader :
|
20
|
-
def initialize(
|
21
|
-
@string = string
|
19
|
+
attr_reader :dow, :dow_abbr, :months, :months_abbr
|
20
|
+
def initialize(dow:, dow_abbr:, months:, months_abbr:)
|
22
21
|
@dow = dow
|
23
22
|
@dow_abbr = dow_abbr
|
24
23
|
@months = months
|
25
24
|
@months_abbr = months_abbr
|
26
25
|
end
|
27
26
|
|
28
|
-
def includes_date?
|
29
|
-
|
27
|
+
def includes_date?(text)
|
28
|
+
includes_long_date?(text) || includes_number_only_date?(text)
|
30
29
|
end
|
31
30
|
|
32
|
-
def replace
|
33
|
-
return
|
34
|
-
new_string = string.dup
|
31
|
+
def replace(text)
|
32
|
+
return text unless is_an_array?
|
35
33
|
counter = 0
|
36
|
-
dow_abbr.
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
new_string = new_string.gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
|
41
|
-
if counter > 0
|
42
|
-
dow_abbr.each do |day|
|
43
|
-
months.each do |month|
|
44
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
45
|
-
end
|
46
|
-
months_abbr.each do |month|
|
47
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
48
|
-
end
|
49
|
-
end
|
50
|
-
dow.each do |day|
|
51
|
-
months.each do |month|
|
52
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
53
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
54
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
55
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
56
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
57
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
58
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
59
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
60
|
-
end
|
61
|
-
months_abbr.each do |month|
|
62
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
63
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
64
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
65
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
66
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
67
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
68
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
69
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
70
|
-
end
|
71
|
-
end
|
72
|
-
else
|
73
|
-
dow.each do |day|
|
74
|
-
months.each do |month|
|
75
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
76
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
77
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
78
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
79
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
80
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
81
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
82
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
83
|
-
end
|
84
|
-
months_abbr.each do |month|
|
85
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
86
|
-
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
87
|
-
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
88
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
89
|
-
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
90
|
-
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
91
|
-
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
92
|
-
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
93
|
-
end
|
94
|
-
end
|
95
|
-
dow_abbr.each do |day|
|
96
|
-
months.each do |month|
|
97
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
98
|
-
end
|
99
|
-
months_abbr.each do |month|
|
100
|
-
new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
new_string = new_string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
105
|
-
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
106
|
-
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
107
|
-
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
34
|
+
dow_abbr.map { |day| counter +=1 if text.include?('day') }
|
35
|
+
text = text.gsub(JA_DATE_REGEX_LONG, '<redacted date>').gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
|
36
|
+
text = redact_dates(counter, text)
|
37
|
+
redact_regex(text)
|
108
38
|
end
|
109
39
|
|
110
|
-
def occurences
|
111
|
-
replace.scan(/<redacted date>/).size
|
40
|
+
def occurences(text)
|
41
|
+
replace(text).scan(/<redacted date>/).size
|
112
42
|
end
|
113
43
|
|
114
|
-
def replace_number_only_date
|
115
|
-
|
44
|
+
def replace_number_only_date(text)
|
45
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
116
46
|
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
117
47
|
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
118
48
|
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
@@ -120,45 +50,89 @@ module ConfidentialInfoRedactorLite
|
|
120
50
|
|
121
51
|
private
|
122
52
|
|
123
|
-
def
|
124
|
-
|
53
|
+
def is_an_array?
|
54
|
+
dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
|
55
|
+
end
|
56
|
+
|
57
|
+
def redact_dates(counter, text)
|
58
|
+
if counter > 0
|
59
|
+
text = redact_dow_abbr(text)
|
60
|
+
text = redact_dow(text)
|
61
|
+
else
|
62
|
+
text = redact_dow(text)
|
63
|
+
text = redact_dow_abbr(text)
|
64
|
+
end
|
65
|
+
text
|
66
|
+
end
|
67
|
+
|
68
|
+
def redact_regex(text)
|
69
|
+
text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
|
70
|
+
.gsub(YMD_YDM_REGEX, ' <redacted date> ')
|
71
|
+
.gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
|
72
|
+
.gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
|
73
|
+
end
|
74
|
+
|
75
|
+
def redact_dow(text)
|
125
76
|
dow.each do |day|
|
126
|
-
months.
|
127
|
-
|
128
|
-
match_found = check_for_matches(day, month)
|
129
|
-
end
|
130
|
-
months_abbr.each do |month|
|
131
|
-
break if match_found
|
132
|
-
match_found = check_for_matches(day, month)
|
133
|
-
end
|
77
|
+
months.map { |month| text = redact_date(text, day, month) }
|
78
|
+
months_abbr.map { |month| text = redact_date(text, day, month) }
|
134
79
|
end
|
80
|
+
text
|
81
|
+
end
|
82
|
+
|
83
|
+
def redact_dow_abbr(text)
|
84
|
+
dow_abbr.each do |day|
|
85
|
+
months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
86
|
+
months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
|
87
|
+
end
|
88
|
+
text
|
89
|
+
end
|
90
|
+
|
91
|
+
def redact_date(text, day, month)
|
92
|
+
text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
93
|
+
.gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
|
94
|
+
.gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
|
95
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
|
96
|
+
.gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
97
|
+
.gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
|
98
|
+
.gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
99
|
+
.gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
|
100
|
+
end
|
101
|
+
|
102
|
+
def includes_long_date?(text)
|
103
|
+
includes_long_date_1?(text) || includes_long_date_2?(text)
|
104
|
+
end
|
105
|
+
|
106
|
+
def includes_long_date_1?(text)
|
107
|
+
dow.each do |day|
|
108
|
+
months.map { |month| return true if check_for_matches(day, month, text) }
|
109
|
+
months_abbr.map { |month| return true if check_for_matches(day, month, text) }
|
110
|
+
end
|
111
|
+
false
|
112
|
+
end
|
113
|
+
|
114
|
+
def includes_long_date_2?(text)
|
135
115
|
dow_abbr.each do |day|
|
136
|
-
months.
|
137
|
-
|
138
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
139
|
-
end
|
140
|
-
months_abbr.each do |month|
|
141
|
-
break if match_found
|
142
|
-
match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
|
143
|
-
end
|
116
|
+
months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
117
|
+
months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
|
144
118
|
end
|
145
|
-
|
119
|
+
false
|
146
120
|
end
|
147
121
|
|
148
|
-
def
|
149
|
-
!(
|
150
|
-
!(
|
151
|
-
!(
|
152
|
-
!(
|
122
|
+
def includes_number_only_date?(text)
|
123
|
+
!(text !~ DMY_MDY_REGEX) ||
|
124
|
+
!(text !~ YMD_YDM_REGEX) ||
|
125
|
+
!(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
|
126
|
+
!(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
|
153
127
|
end
|
154
128
|
|
155
|
-
def check_for_matches(day, month)
|
156
|
-
!(
|
157
|
-
!(
|
158
|
-
!(
|
159
|
-
!(
|
160
|
-
!(
|
161
|
-
!(
|
129
|
+
def check_for_matches(day, month, text)
|
130
|
+
!(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
131
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
|
132
|
+
!(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
|
133
|
+
!(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
|
134
|
+
!(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
|
135
|
+
!(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
|
162
136
|
end
|
163
137
|
end
|
164
138
|
end
|
@@ -5,18 +5,17 @@ module ConfidentialInfoRedactorLite
|
|
5
5
|
EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
|
6
6
|
|
7
7
|
PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
|
8
|
-
attr_reader :
|
9
|
-
def initialize(
|
10
|
-
@text = text.gsub(/[’‘]/, "'").freeze
|
8
|
+
attr_reader :language, :corpus
|
9
|
+
def initialize(corpus:, **args)
|
11
10
|
@corpus = Set.new(corpus).freeze
|
12
11
|
@language = args[:language] || 'en'
|
13
12
|
end
|
14
13
|
|
15
|
-
def extract
|
14
|
+
def extract(text)
|
16
15
|
extracted_terms = []
|
17
|
-
PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
|
16
|
+
PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
|
18
17
|
initial_extracted_terms = extract_preliminary_terms(segment)
|
19
|
-
next if initial_extracted_terms.length.eql?(segment.split(' ').length) &&
|
18
|
+
next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
|
20
19
|
search_ngrams(initial_extracted_terms, extracted_terms)
|
21
20
|
end
|
22
21
|
extracted_terms.uniq.reject(&:empty?)
|
@@ -28,36 +27,54 @@ module ConfidentialInfoRedactorLite
|
|
28
27
|
segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
|
29
28
|
end
|
30
29
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
30
|
+
def in_corpus?(tokens)
|
31
|
+
tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
|
32
|
+
end
|
33
|
+
|
34
|
+
def clean_token(token)
|
35
|
+
token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
|
36
|
+
end
|
37
|
+
|
38
|
+
def non_confidential_token?(token, includes_confidential)
|
39
|
+
corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
|
40
|
+
end
|
41
|
+
|
42
|
+
def singular_in_corpus?(token)
|
43
|
+
corpus.include?(token[0...-1]) &&
|
44
|
+
token[-1].eql?('s')
|
45
|
+
end
|
46
|
+
|
47
|
+
def includes_confidential?(token)
|
48
|
+
token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
|
49
|
+
true
|
50
|
+
end
|
51
|
+
|
52
|
+
def matching_first_token?(tokens)
|
53
|
+
corpus.include?(tokens[0]) &&
|
54
|
+
tokens[0] != 'the' &&
|
55
|
+
tokens[0] != 'deutsche' &&
|
56
|
+
tokens.length.eql?(2)
|
57
|
+
end
|
58
|
+
|
59
|
+
def find_extracted_terms(string, extracted_terms)
|
60
|
+
cleaned_token_downcased = clean_token(string.downcase)
|
61
|
+
cleaned_token = clean_token(string)
|
62
|
+
tokens = cleaned_token_downcased.split(' ')
|
63
|
+
if matching_first_token?(tokens)
|
64
|
+
extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
|
65
|
+
else
|
66
|
+
extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
|
39
67
|
end
|
40
|
-
|
68
|
+
extracted_terms
|
41
69
|
end
|
42
70
|
|
43
71
|
def search_ngrams(tokens, extracted_terms)
|
44
72
|
tokens.each do |ngram|
|
45
73
|
ngram.split(PUNCTUATION_REGEX).each do |t|
|
46
74
|
next if !(t !~ /.*\d+.*/)
|
47
|
-
|
48
|
-
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
|
49
|
-
else
|
50
|
-
tracker = true
|
51
|
-
unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
|
52
|
-
t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
|
53
|
-
tracker = false if corpus.include?(token.downcase)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
|
57
|
-
end
|
75
|
+
extracted_terms = find_extracted_terms(t, extracted_terms)
|
58
76
|
end
|
59
77
|
end
|
60
|
-
extracted_terms
|
61
78
|
end
|
62
79
|
end
|
63
80
|
end
|
@@ -4,17 +4,9 @@ module ConfidentialInfoRedactorLite
|
|
4
4
|
# Rubular: http://rubular.com/r/fXa4lp0gfS
|
5
5
|
HYPERLINK_REGEX = /(http|https|www)(\.|:)/
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
-
|
12
|
-
def replace
|
13
|
-
new_string = string.dup
|
14
|
-
string.split(/\s+/).each do |token|
|
15
|
-
new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
|
16
|
-
end
|
17
|
-
new_string
|
7
|
+
def replace(text)
|
8
|
+
text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX) }
|
9
|
+
text
|
18
10
|
end
|
19
11
|
end
|
20
12
|
end
|
@@ -9,9 +9,8 @@ module ConfidentialInfoRedactorLite
|
|
9
9
|
# Rubular: http://rubular.com/r/mxcj2G0Jfa
|
10
10
|
EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
|
11
11
|
|
12
|
-
attr_reader :
|
13
|
-
def initialize(
|
14
|
-
@text = text
|
12
|
+
attr_reader :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
|
13
|
+
def initialize(dow:, dow_abbr:, months:, months_abbr:, **args)
|
15
14
|
@language = args[:language] || 'en'
|
16
15
|
@tokens = args[:tokens]
|
17
16
|
@number_text = args[:number_text] || '<redacted number>'
|
@@ -29,52 +28,52 @@ module ConfidentialInfoRedactorLite
|
|
29
28
|
@months_abbr = months_abbr
|
30
29
|
end
|
31
30
|
|
32
|
-
def dates
|
31
|
+
def dates(text)
|
33
32
|
return '' if text.nil?
|
34
33
|
redact_dates(text)
|
35
34
|
end
|
36
35
|
|
37
|
-
def dates_html
|
36
|
+
def dates_html(text)
|
38
37
|
return [] if text.nil?
|
39
38
|
redact_dates_html(text)
|
40
39
|
end
|
41
40
|
|
42
|
-
def numbers
|
41
|
+
def numbers(text)
|
43
42
|
return '' if text.nil?
|
44
43
|
redact_numbers(text)
|
45
44
|
end
|
46
45
|
|
47
|
-
def numbers_html
|
46
|
+
def numbers_html(text)
|
48
47
|
return [] if text.nil?
|
49
48
|
redact_numbers_html(text)
|
50
49
|
end
|
51
50
|
|
52
|
-
def emails
|
51
|
+
def emails(text)
|
53
52
|
return '' if text.nil?
|
54
53
|
redact_emails(text)
|
55
54
|
end
|
56
55
|
|
57
|
-
def emails_html
|
56
|
+
def emails_html(text)
|
58
57
|
return [] if text.nil?
|
59
58
|
redact_emails_html(text)
|
60
59
|
end
|
61
60
|
|
62
|
-
def hyperlinks
|
61
|
+
def hyperlinks(text)
|
63
62
|
return '' if text.nil?
|
64
63
|
redact_hyperlinks(text)
|
65
64
|
end
|
66
65
|
|
67
|
-
def hyperlinks_html
|
66
|
+
def hyperlinks_html(text)
|
68
67
|
return [] if text.nil?
|
69
68
|
redact_hyperlinks_html(text)
|
70
69
|
end
|
71
70
|
|
72
|
-
def proper_nouns
|
71
|
+
def proper_nouns(text)
|
73
72
|
return '' if text.nil?
|
74
73
|
redact_tokens(text)
|
75
74
|
end
|
76
75
|
|
77
|
-
def redact
|
76
|
+
def redact(text)
|
78
77
|
return '' if text.nil?
|
79
78
|
if ignore_emails
|
80
79
|
redacted_text = text
|
@@ -87,7 +86,7 @@ module ConfidentialInfoRedactorLite
|
|
87
86
|
redact_tokens(redacted_text)
|
88
87
|
end
|
89
88
|
|
90
|
-
def redact_html
|
89
|
+
def redact_html(text)
|
91
90
|
return [] if text.nil?
|
92
91
|
redacted_text = redact_dates_html(text)[0]
|
93
92
|
redacted_text = redact_emails_html(redacted_text)[0]
|
@@ -183,11 +182,11 @@ module ConfidentialInfoRedactorLite
|
|
183
182
|
end
|
184
183
|
|
185
184
|
def redact_hyperlinks(txt)
|
186
|
-
ConfidentialInfoRedactorLite::Hyperlink.new(
|
185
|
+
ConfidentialInfoRedactorLite::Hyperlink.new.replace(txt).gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
|
187
186
|
end
|
188
187
|
|
189
188
|
def redact_dates(txt)
|
190
|
-
ConfidentialInfoRedactorLite::Date.new(
|
189
|
+
ConfidentialInfoRedactorLite::Date.new(dow: dow, dow_abbr: dow_abbr, months: months, months_abbr: months_abbr).replace(txt).gsub(/<redacted date>/, "#{date_text}").gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ").gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ").gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
|
191
190
|
end
|
192
191
|
|
193
192
|
def redact_numbers(txt)
|