confidential_info_redactor 0.0.18 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e355c3ef2477be1d933d8a44e7a46908f38b2fee
4
- data.tar.gz: 87ff07efbbe1d48c575527f5e7ee77f1651b8e65
3
+ metadata.gz: e5b5d282da6e72d809ac29188c1423775d513931
4
+ data.tar.gz: 09252aaf2b2e49f1360d03470d38c1d82a6e968d
5
5
  SHA512:
6
- metadata.gz: 14e81b76b14eb050869778bf3d099ececb2120ba864454a53a08af33452c7465497dd340d2d24cd425371c843529fef1f28bab9c7d6e55854551b4f7686b0c98
7
- data.tar.gz: 110b3fbd098ef5522930f2be7329a9e921876ea015711c06f0c4e57da5e53a2594c9fc75a5962614f7c8be85b03e4163981ccba081690fdfb7ea078535130c82
6
+ metadata.gz: 50c47839331dba86546a92929140d5156599a8699a0d0ecb79f2f0cb767837c206ac14784c661818a1519d2f06b7c0323561477e868ee63aae94580acc191929
7
+ data.tar.gz: acfc20dd05bf76f7ed8b719643afb3017a302fd6d1cd1eb1f68060513a1059d1efb19c68efdb935c40860afea8d42c63bdc9a28ae99df52eaef0a4a5cdcc59dd
data/README.md CHANGED
@@ -40,45 +40,45 @@ gem 'confidential_info_redactor'
40
40
  ```ruby
41
41
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
42
42
 
43
- tokens = ConfidentialInfoRedactor::Extractor.new(text: text).extract
43
+ tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
44
44
  # => ["Coca-Cola", "Pepsi", "John Smith"]
45
45
 
46
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens).redact
46
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text)
47
47
  # => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
48
48
 
49
49
  # You can also just use a specific redactor
50
- ConfidentialInfoRedactor::Redactor.new(text: text).dates
50
+ ConfidentialInfoRedactor::Redactor.new.dates(text)
51
51
  # => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
52
52
 
53
- ConfidentialInfoRedactor::Redactor.new(text: text).numbers
53
+ ConfidentialInfoRedactor::Redactor.new.numbers(text)
54
54
  # => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
55
55
 
56
- ConfidentialInfoRedactor::Redactor.new(text: text).emails
56
+ ConfidentialInfoRedactor::Redactor.new.emails(text)
57
57
  # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
58
58
 
59
- ConfidentialInfoRedactor::Redactor.new(text: text).hyperlinks
59
+ ConfidentialInfoRedactor::Redactor.new.hyperlinks(text)
60
60
  # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
61
61
 
62
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens).proper_nouns
62
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text)
63
63
  # => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
64
64
 
65
65
  # It is possible to 'turn off' any of the specific redactors
66
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens, ignore_numbers: true).redact
66
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text)
67
67
  # => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
68
68
 
69
69
  # German Example
70
70
  text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
71
71
 
72
- tokens = ConfidentialInfoRedactor::Extractor.new(text: text, language: 'de').extract
72
+ tokens = ConfidentialInfoRedactor::Extractor.new(language: 'de').extract(text)
73
73
  # => ['Deutschen Bank']
74
74
 
75
- ConfidentialInfoRedactor::Redactor.new(text: text, language: 'de', tokens: tokens).redact
75
+ ConfidentialInfoRedactor::Redactor.new(language: 'de', tokens: tokens).redact(text)
76
76
  # => 'Viele Mitarbeiter der <redacted> suchen eine andere Arbeitsstelle.'
77
77
 
78
78
  # It is also possible to change the redaction text
79
79
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
80
80
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
81
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact
81
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)
82
82
  # => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
83
83
  ```
84
84
 
@@ -1,14 +1,14 @@
1
1
  module ConfidentialInfoRedactor
2
2
  class Date
3
- EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday)
4
- EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
5
- EN_MONTHS = %w(january february march april may june july august september october november december)
6
- EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
7
-
8
- DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend)
9
- DE_DOW_ABBR = %w(mo di mi do fr sa so)
10
- DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember)
11
- DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez)
3
+ EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
4
+ EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
5
+ EN_MONTHS = %w(january february march april may june july august september october november december).freeze
6
+ EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze
7
+
8
+ DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend).freeze
9
+ DE_DOW_ABBR = %w(mo di mi do fr sa so).freeze
10
+ DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember).freeze
11
+ DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez).freeze
12
12
  # Rubular: http://rubular.com/r/73CZ2HU0q6
13
13
  DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}/
14
14
 
@@ -21,9 +21,8 @@ module ConfidentialInfoRedactor
21
21
  # Rubular: http://rubular.com/r/mpVSeaKwdY
22
22
  DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D/
23
23
 
24
- attr_reader :string, :language, :dow, :dow_abbr, :months, :months_abbr
25
- def initialize(string:, language:)
26
- @string = string
24
+ attr_reader :language, :dow, :dow_abbr, :months, :months_abbr
25
+ def initialize(language:)
27
26
  @language = language
28
27
  case language
29
28
  when 'en'
@@ -44,133 +43,114 @@ module ConfidentialInfoRedactor
44
43
  end
45
44
  end
46
45
 
47
- def includes_date?
48
- long_date || number_only_date
46
+ def includes_date?(text)
47
+ includes_long_date?(text) || includes_number_only_date?(text)
49
48
  end
50
49
 
51
- def replace
52
- new_string = string.dup
50
+ def replace(text)
51
+ return text unless is_an_array?
53
52
  counter = 0
54
- dow_abbr.each do |day|
55
- counter +=1 if string.include?('day')
56
- end
53
+ dow_abbr.map { |day| counter +=1 if text.include?('day') }
54
+ text = redact_dates(counter, text)
55
+ redact_regex(text)
56
+ end
57
+
58
+ def occurences(text)
59
+ replace(text).scan(/<redacted date>/).size
60
+ end
61
+
62
+ def replace_number_only_date(text)
63
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
64
+ .gsub(YMD_YDM_REGEX, ' <redacted date> ')
65
+ .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
66
+ .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
67
+ end
68
+
69
+ private
70
+
71
+ def is_an_array?
72
+ dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
73
+ end
74
+
75
+ def redact_dates(counter, text)
57
76
  if counter > 0
58
- dow_abbr.each do |day|
59
- months.each do |month|
60
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
61
- end
62
- months_abbr.each do |month|
63
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
64
- end
65
- end
66
- dow.each do |day|
67
- months.each do |month|
68
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
69
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
70
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
71
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
72
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
73
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
74
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
75
- end
76
- months_abbr.each do |month|
77
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
78
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
79
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
80
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
81
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
82
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
83
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
84
- end
85
- end
77
+ text = redact_dow_abbr(text)
78
+ text = redact_dow(text)
86
79
  else
87
- dow.each do |day|
88
- months.each do |month|
89
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
90
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
91
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
92
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
93
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
94
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
95
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
96
- end
97
- months_abbr.each do |month|
98
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
99
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
100
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
101
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
102
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
103
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
104
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
105
- end
106
- end
107
- dow_abbr.each do |day|
108
- months.each do |month|
109
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
110
- end
111
- months_abbr.each do |month|
112
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
113
- end
114
- end
80
+ text = redact_dow(text)
81
+ text = redact_dow_abbr(text)
115
82
  end
116
- new_string = new_string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
117
- .gsub(YMD_YDM_REGEX, ' <redacted date> ')
118
- .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
119
- .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
83
+ text
120
84
  end
121
85
 
122
- def occurences
123
- replace.scan(/<redacted date>/).size
86
+ def redact_regex(text)
87
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
88
+ .gsub(YMD_YDM_REGEX, ' <redacted date> ')
89
+ .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
90
+ .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
124
91
  end
125
92
 
126
- def replace_number_only_date
127
- string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
128
- .gsub(YMD_YDM_REGEX, ' <redacted date> ')
129
- .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
130
- .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
93
+ def redact_dow(text)
94
+ dow.each do |day|
95
+ months.map { |month| text = redact_date(text, day, month) }
96
+ months_abbr.map { |month| text = redact_date(text, day, month) }
97
+ end
98
+ text
131
99
  end
132
100
 
133
- private
101
+ def redact_dow_abbr(text)
102
+ dow_abbr.each do |day|
103
+ months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
104
+ months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
105
+ end
106
+ text
107
+ end
134
108
 
135
- def long_date
136
- match_found = false
109
+ def redact_date(text, day, month)
110
+ text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
111
+ .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
112
+ .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
113
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
114
+ .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
115
+ .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
116
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
117
+ .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
118
+ end
119
+
120
+ def includes_long_date?(text)
121
+ includes_long_date_1?(text) || includes_long_date_2?(text)
122
+ end
123
+
124
+ def includes_long_date_1?(text)
137
125
  dow.each do |day|
138
- months.each do |month|
139
- break if match_found
140
- match_found = check_for_matches(day, month)
141
- end
142
- months_abbr.each do |month|
143
- break if match_found
144
- match_found = check_for_matches(day, month)
145
- end
126
+ months.map { |month| return true if check_for_matches(day, month, text) }
127
+ months_abbr.map { |month| return true if check_for_matches(day, month, text) }
146
128
  end
129
+ false
130
+ end
131
+
132
+ def includes_long_date_2?(text)
147
133
  dow_abbr.each do |day|
148
- months.each do |month|
149
- break if match_found
150
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
151
- end
152
- months_abbr.each do |month|
153
- break if match_found
154
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
155
- end
134
+ months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
135
+ months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
156
136
  end
157
- match_found
137
+ false
158
138
  end
159
139
 
160
- def number_only_date
161
- !(string !~ DMY_MDY_REGEX) ||
162
- !(string !~ YMD_YDM_REGEX) ||
163
- !(string !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
164
- !(string !~ DIGIT_ONLY_YEAR_LAST_REGEX)
140
+ def includes_number_only_date?(text)
141
+ !(text !~ DMY_MDY_REGEX) ||
142
+ !(text !~ YMD_YDM_REGEX) ||
143
+ !(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
144
+ !(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
165
145
  end
166
146
 
167
- def check_for_matches(day, month)
168
- !(string !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
169
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
170
- !(string !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
171
- !(string !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
172
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
173
- !(string !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
147
+ def check_for_matches(day, month, text)
148
+ !(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
149
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
150
+ !(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
151
+ !(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
152
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
153
+ !(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
174
154
  end
175
155
  end
176
156
  end
@@ -4,10 +4,12 @@ module ConfidentialInfoRedactor
4
4
  # This class extracts proper nouns from a text
5
5
  class Extractor
6
6
  # Rubular: http://rubular.com/r/qE0g4r9zR7
7
- EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
8
- attr_reader :text, :language, :corpus
9
- def initialize(text:, **args)
10
- @text = text.gsub(/[’‘]/, "'")
7
+ EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
8
+
9
+ PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
10
+
11
+ attr_reader :language, :corpus
12
+ def initialize(**args)
11
13
  @language = args[:language] || 'en'
12
14
  case @language
13
15
  when 'en'
@@ -19,38 +21,69 @@ module ConfidentialInfoRedactor
19
21
  end
20
22
  end
21
23
 
22
- def extract
24
+ def extract(text)
23
25
  extracted_terms = []
24
- PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
25
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
26
- in_corpus = true
27
- initial_extracted_terms.each do |ngram|
28
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
29
- unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
30
- in_corpus = false
31
- end
32
- end
33
- end
34
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
35
- initial_extracted_terms.each do |ngram|
36
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
37
- next if !(t !~ /.*\d+.*/)
38
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
39
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
40
- else
41
- tracker = true
42
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
43
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
44
- tracker = false if corpus.include?(token.downcase)
45
- end
46
- end
47
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('n'))
48
- end
49
- end
50
- end
26
+ PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
27
+ initial_extracted_terms = extract_preliminary_terms(segment)
28
+ search_ngrams(initial_extracted_terms, extracted_terms)
29
+ end
30
+ extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
31
+ end
32
+
33
+ private
34
+
35
+ def extract_preliminary_terms(segment)
36
+ segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
37
+ end
38
+
39
+ def clean_token(token)
40
+ token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
41
+ end
42
+
43
+ def non_confidential_token?(token, includes_confidential)
44
+ corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
45
+ end
46
+
47
+ def singular_in_corpus?(token)
48
+ corpus.include?(token[0...-1]) &&
49
+ token[-1].eql?('s') ||
50
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
51
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
52
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
53
+ corpus.include?(token[0...-1]) && token[-1].eql?('n')
54
+ end
55
+
56
+ def includes_confidential?(token)
57
+ token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
58
+ true
59
+ end
60
+
61
+ def matching_first_token?(tokens)
62
+ corpus.include?(tokens[0]) &&
63
+ tokens[0] != 'the' &&
64
+ tokens[0] != 'deutsche' &&
65
+ tokens.length.eql?(2)
66
+ end
67
+
68
+ def find_extracted_terms(string, extracted_terms)
69
+ cleaned_token_downcased = clean_token(string.downcase)
70
+ cleaned_token = clean_token(string)
71
+ tokens = cleaned_token_downcased.split(' ')
72
+ if matching_first_token?(tokens)
73
+ extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
74
+ else
75
+ extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
51
76
  end
77
+ extracted_terms
78
+ end
52
79
 
53
- extracted_terms.uniq.reject(&:empty?)
80
+ def search_ngrams(tokens, extracted_terms)
81
+ tokens.each do |ngram|
82
+ ngram.split(PUNCTUATION_REGEX).each do |t|
83
+ next if !(t !~ /.*\d+.*/)
84
+ extracted_terms = find_extracted_terms(t, extracted_terms)
85
+ end
86
+ end
54
87
  end
55
88
  end
56
89
  end
@@ -7,25 +7,13 @@ module ConfidentialInfoRedactor
7
7
  # Rubular: http://rubular.com/r/fXa4lp0gfS
8
8
  HYPERLINK_REGEX = /(http|https|www)(\.|:)/
9
9
 
10
- attr_reader :string
11
- def initialize(string:)
12
- @string = string
10
+ def hyperlink?(text)
11
+ !(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
13
12
  end
14
13
 
15
- def hyperlink?
16
- !(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
17
- end
18
-
19
- def replace
20
- new_string = string.dup
21
- string.split(/\s+/).each do |token|
22
- if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
23
- new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
24
- elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
25
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
26
- end
27
- end
28
- new_string
14
+ def replace(text)
15
+ text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ') if !(token !~ HYPERLINK_REGEX) }
16
+ text
29
17
  end
30
18
  end
31
19
  end