confidential_info_redactor 0.0.18 → 1.0.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e355c3ef2477be1d933d8a44e7a46908f38b2fee
4
- data.tar.gz: 87ff07efbbe1d48c575527f5e7ee77f1651b8e65
3
+ metadata.gz: e5b5d282da6e72d809ac29188c1423775d513931
4
+ data.tar.gz: 09252aaf2b2e49f1360d03470d38c1d82a6e968d
5
5
  SHA512:
6
- metadata.gz: 14e81b76b14eb050869778bf3d099ececb2120ba864454a53a08af33452c7465497dd340d2d24cd425371c843529fef1f28bab9c7d6e55854551b4f7686b0c98
7
- data.tar.gz: 110b3fbd098ef5522930f2be7329a9e921876ea015711c06f0c4e57da5e53a2594c9fc75a5962614f7c8be85b03e4163981ccba081690fdfb7ea078535130c82
6
+ metadata.gz: 50c47839331dba86546a92929140d5156599a8699a0d0ecb79f2f0cb767837c206ac14784c661818a1519d2f06b7c0323561477e868ee63aae94580acc191929
7
+ data.tar.gz: acfc20dd05bf76f7ed8b719643afb3017a302fd6d1cd1eb1f68060513a1059d1efb19c68efdb935c40860afea8d42c63bdc9a28ae99df52eaef0a4a5cdcc59dd
data/README.md CHANGED
@@ -40,45 +40,45 @@ gem 'confidential_info_redactor'
40
40
  ```ruby
41
41
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
42
42
 
43
- tokens = ConfidentialInfoRedactor::Extractor.new(text: text).extract
43
+ tokens = ConfidentialInfoRedactor::Extractor.new.extract(text)
44
44
  # => ["Coca-Cola", "Pepsi", "John Smith"]
45
45
 
46
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens).redact
46
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text)
47
47
  # => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted> or visit <redacted>.'
48
48
 
49
49
  # You can also just use a specific redactor
50
- ConfidentialInfoRedactor::Redactor.new(text: text).dates
50
+ ConfidentialInfoRedactor::Redactor.new.dates(text)
51
51
  # => 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
52
52
 
53
- ConfidentialInfoRedactor::Redactor.new(text: text).numbers
53
+ ConfidentialInfoRedactor::Redactor.new.numbers(text)
54
54
  # => 'Coca-Cola announced a merger with Pepsi that will happen on December <redacted number>, <redacted number> for <redacted number>. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
55
55
 
56
- ConfidentialInfoRedactor::Redactor.new(text: text).emails
56
+ ConfidentialInfoRedactor::Redactor.new.emails(text)
57
57
  # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at <redacted> or visit http://www.super-fake-merger.com.'
58
58
 
59
- ConfidentialInfoRedactor::Redactor.new(text: text).hyperlinks
59
+ ConfidentialInfoRedactor::Redactor.new.hyperlinks(text)
60
60
  # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit <redacted>.'
61
61
 
62
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens).proper_nouns
62
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text)
63
63
  # => '<redacted> announced a merger with <redacted> that will happen on December 15th, 2020 for $200,000,000,000. Please contact <redacted> at j.smith@example.com or visit http://www.super-fake-merger.com.'
64
64
 
65
65
  # It is possible to 'turn off' any of the specific redactors
66
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens, ignore_numbers: true).redact
66
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text)
67
67
  # => '<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted> or visit <redacted>.'
68
68
 
69
69
  # German Example
70
70
  text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
71
71
 
72
- tokens = ConfidentialInfoRedactor::Extractor.new(text: text, language: 'de').extract
72
+ tokens = ConfidentialInfoRedactor::Extractor.new(language: 'de').extract(text)
73
73
  # => ['Deutschen Bank']
74
74
 
75
- ConfidentialInfoRedactor::Redactor.new(text: text, language: 'de', tokens: tokens).redact
75
+ ConfidentialInfoRedactor::Redactor.new(language: 'de', tokens: tokens).redact(text)
76
76
  # => 'Viele Mitarbeiter der <redacted> suchen eine andere Arbeitsstelle.'
77
77
 
78
78
  # It is also possible to change the redaction text
79
79
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
80
80
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
81
- ConfidentialInfoRedactor::Redactor.new(text: text, tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact
81
+ ConfidentialInfoRedactor::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)
82
82
  # => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.'
83
83
  ```
84
84
 
@@ -1,14 +1,14 @@
1
1
  module ConfidentialInfoRedactor
2
2
  class Date
3
- EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday)
4
- EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun)
5
- EN_MONTHS = %w(january february march april may june july august september october november december)
6
- EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
7
-
8
- DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend)
9
- DE_DOW_ABBR = %w(mo di mi do fr sa so)
10
- DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember)
11
- DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez)
3
+ EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
4
+ EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
5
+ EN_MONTHS = %w(january february march april may june july august september october november december).freeze
6
+ EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze
7
+
8
+ DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend).freeze
9
+ DE_DOW_ABBR = %w(mo di mi do fr sa so).freeze
10
+ DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember).freeze
11
+ DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez).freeze
12
12
  # Rubular: http://rubular.com/r/73CZ2HU0q6
13
13
  DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}/
14
14
 
@@ -21,9 +21,8 @@ module ConfidentialInfoRedactor
21
21
  # Rubular: http://rubular.com/r/mpVSeaKwdY
22
22
  DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D/
23
23
 
24
- attr_reader :string, :language, :dow, :dow_abbr, :months, :months_abbr
25
- def initialize(string:, language:)
26
- @string = string
24
+ attr_reader :language, :dow, :dow_abbr, :months, :months_abbr
25
+ def initialize(language:)
27
26
  @language = language
28
27
  case language
29
28
  when 'en'
@@ -44,133 +43,114 @@ module ConfidentialInfoRedactor
44
43
  end
45
44
  end
46
45
 
47
- def includes_date?
48
- long_date || number_only_date
46
+ def includes_date?(text)
47
+ includes_long_date?(text) || includes_number_only_date?(text)
49
48
  end
50
49
 
51
- def replace
52
- new_string = string.dup
50
+ def replace(text)
51
+ return text unless is_an_array?
53
52
  counter = 0
54
- dow_abbr.each do |day|
55
- counter +=1 if string.include?('day')
56
- end
53
+ dow_abbr.map { |day| counter +=1 if text.include?('day') }
54
+ text = redact_dates(counter, text)
55
+ redact_regex(text)
56
+ end
57
+
58
+ def occurences(text)
59
+ replace(text).scan(/<redacted date>/).size
60
+ end
61
+
62
+ def replace_number_only_date(text)
63
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
64
+ .gsub(YMD_YDM_REGEX, ' <redacted date> ')
65
+ .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
66
+ .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
67
+ end
68
+
69
+ private
70
+
71
+ def is_an_array?
72
+ dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
73
+ end
74
+
75
+ def redact_dates(counter, text)
57
76
  if counter > 0
58
- dow_abbr.each do |day|
59
- months.each do |month|
60
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
61
- end
62
- months_abbr.each do |month|
63
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
64
- end
65
- end
66
- dow.each do |day|
67
- months.each do |month|
68
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
69
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
70
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
71
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
72
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
73
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
74
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
75
- end
76
- months_abbr.each do |month|
77
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
78
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
79
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
80
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
81
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
82
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
83
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
84
- end
85
- end
77
+ text = redact_dow_abbr(text)
78
+ text = redact_dow(text)
86
79
  else
87
- dow.each do |day|
88
- months.each do |month|
89
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
90
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
91
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
92
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
93
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
94
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
95
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
96
- end
97
- months_abbr.each do |month|
98
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
99
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
100
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
101
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
102
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
103
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
104
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
105
- end
106
- end
107
- dow_abbr.each do |day|
108
- months.each do |month|
109
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
110
- end
111
- months_abbr.each do |month|
112
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
113
- end
114
- end
80
+ text = redact_dow(text)
81
+ text = redact_dow_abbr(text)
115
82
  end
116
- new_string = new_string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
117
- .gsub(YMD_YDM_REGEX, ' <redacted date> ')
118
- .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
119
- .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
83
+ text
120
84
  end
121
85
 
122
- def occurences
123
- replace.scan(/<redacted date>/).size
86
+ def redact_regex(text)
87
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
88
+ .gsub(YMD_YDM_REGEX, ' <redacted date> ')
89
+ .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
90
+ .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
124
91
  end
125
92
 
126
- def replace_number_only_date
127
- string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
128
- .gsub(YMD_YDM_REGEX, ' <redacted date> ')
129
- .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
130
- .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
93
+ def redact_dow(text)
94
+ dow.each do |day|
95
+ months.map { |month| text = redact_date(text, day, month) }
96
+ months_abbr.map { |month| text = redact_date(text, day, month) }
97
+ end
98
+ text
131
99
  end
132
100
 
133
- private
101
+ def redact_dow_abbr(text)
102
+ dow_abbr.each do |day|
103
+ months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
104
+ months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
105
+ end
106
+ text
107
+ end
134
108
 
135
- def long_date
136
- match_found = false
109
+ def redact_date(text, day, month)
110
+ text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
111
+ .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
112
+ .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
113
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
114
+ .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
115
+ .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
116
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
117
+ .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
118
+ end
119
+
120
+ def includes_long_date?(text)
121
+ includes_long_date_1?(text) || includes_long_date_2?(text)
122
+ end
123
+
124
+ def includes_long_date_1?(text)
137
125
  dow.each do |day|
138
- months.each do |month|
139
- break if match_found
140
- match_found = check_for_matches(day, month)
141
- end
142
- months_abbr.each do |month|
143
- break if match_found
144
- match_found = check_for_matches(day, month)
145
- end
126
+ months.map { |month| return true if check_for_matches(day, month, text) }
127
+ months_abbr.map { |month| return true if check_for_matches(day, month, text) }
146
128
  end
129
+ false
130
+ end
131
+
132
+ def includes_long_date_2?(text)
147
133
  dow_abbr.each do |day|
148
- months.each do |month|
149
- break if match_found
150
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
151
- end
152
- months_abbr.each do |month|
153
- break if match_found
154
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
155
- end
134
+ months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
135
+ months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
156
136
  end
157
- match_found
137
+ false
158
138
  end
159
139
 
160
- def number_only_date
161
- !(string !~ DMY_MDY_REGEX) ||
162
- !(string !~ YMD_YDM_REGEX) ||
163
- !(string !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
164
- !(string !~ DIGIT_ONLY_YEAR_LAST_REGEX)
140
+ def includes_number_only_date?(text)
141
+ !(text !~ DMY_MDY_REGEX) ||
142
+ !(text !~ YMD_YDM_REGEX) ||
143
+ !(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
144
+ !(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
165
145
  end
166
146
 
167
- def check_for_matches(day, month)
168
- !(string !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
169
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
170
- !(string !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
171
- !(string !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
172
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
173
- !(string !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
147
+ def check_for_matches(day, month, text)
148
+ !(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
149
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
150
+ !(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
151
+ !(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
152
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
153
+ !(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
174
154
  end
175
155
  end
176
156
  end
@@ -4,10 +4,12 @@ module ConfidentialInfoRedactor
4
4
  # This class extracts proper nouns from a text
5
5
  class Extractor
6
6
  # Rubular: http://rubular.com/r/qE0g4r9zR7
7
- EXTRACT_REGEX = /(?<=\s|^|\s\")([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\")[i][A-Z][a-z]+/
8
- attr_reader :text, :language, :corpus
9
- def initialize(text:, **args)
10
- @text = text.gsub(/[’‘]/, "'")
7
+ EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
8
+
9
+ PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
10
+
11
+ attr_reader :language, :corpus
12
+ def initialize(**args)
11
13
  @language = args[:language] || 'en'
12
14
  case @language
13
15
  when 'en'
@@ -19,38 +21,69 @@ module ConfidentialInfoRedactor
19
21
  end
20
22
  end
21
23
 
22
- def extract
24
+ def extract(text)
23
25
  extracted_terms = []
24
- PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
25
- initial_extracted_terms = segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '')) }.compact
26
- in_corpus = true
27
- initial_extracted_terms.each do |ngram|
28
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
29
- unless corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip)
30
- in_corpus = false
31
- end
32
- end
33
- end
34
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && in_corpus
35
- initial_extracted_terms.each do |ngram|
36
- ngram.split(/[\?\)\(\!\\\/\"\:\;\,]/).each do |t|
37
- next if !(t !~ /.*\d+.*/)
38
- if corpus.include?(t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
39
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/\'$/, '').gsub(/”/,'').strip.split(' ')[1])
40
- else
41
- tracker = true
42
- unless t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
43
- t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip.split(' ').each do |token|
44
- tracker = false if corpus.include?(token.downcase)
45
- end
46
- end
47
- extracted_terms << t.gsub(/[\?\)\(\!\\\/\"\:\;\,]/, '').gsub(/\'$/, '').gsub(/”/,'').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(/[\?\.\)\(\!\\\/\"\:\;]/, '').gsub(/”/,'').gsub(/\'$/, '').strip[-1].eql?('n'))
48
- end
49
- end
50
- end
26
+ PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
27
+ initial_extracted_terms = extract_preliminary_terms(segment)
28
+ search_ngrams(initial_extracted_terms, extracted_terms)
29
+ end
30
+ extracted_terms.map { |t| t.gsub(/\{\}/, '') }.delete_if { |t| t.length == 1 }.uniq.reject(&:empty?)
31
+ end
32
+
33
+ private
34
+
35
+ def extract_preliminary_terms(segment)
36
+ segment.to_s.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
37
+ end
38
+
39
+ def clean_token(token)
40
+ token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
41
+ end
42
+
43
+ def non_confidential_token?(token, includes_confidential)
44
+ corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
45
+ end
46
+
47
+ def singular_in_corpus?(token)
48
+ corpus.include?(token[0...-1]) &&
49
+ token[-1].eql?('s') ||
50
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('en') ||
51
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('es') ||
52
+ corpus.include?(token[0...-2]) && token[-2..-1].eql?('er') ||
53
+ corpus.include?(token[0...-1]) && token[-1].eql?('n')
54
+ end
55
+
56
+ def includes_confidential?(token)
57
+ token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
58
+ true
59
+ end
60
+
61
+ def matching_first_token?(tokens)
62
+ corpus.include?(tokens[0]) &&
63
+ tokens[0] != 'the' &&
64
+ tokens[0] != 'deutsche' &&
65
+ tokens.length.eql?(2)
66
+ end
67
+
68
+ def find_extracted_terms(string, extracted_terms)
69
+ cleaned_token_downcased = clean_token(string.downcase)
70
+ cleaned_token = clean_token(string)
71
+ tokens = cleaned_token_downcased.split(' ')
72
+ if matching_first_token?(tokens)
73
+ extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
74
+ else
75
+ extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
51
76
  end
77
+ extracted_terms
78
+ end
52
79
 
53
- extracted_terms.uniq.reject(&:empty?)
80
+ def search_ngrams(tokens, extracted_terms)
81
+ tokens.each do |ngram|
82
+ ngram.split(PUNCTUATION_REGEX).each do |t|
83
+ next if !(t !~ /.*\d+.*/)
84
+ extracted_terms = find_extracted_terms(t, extracted_terms)
85
+ end
86
+ end
54
87
  end
55
88
  end
56
89
  end
@@ -7,25 +7,13 @@ module ConfidentialInfoRedactor
7
7
  # Rubular: http://rubular.com/r/fXa4lp0gfS
8
8
  HYPERLINK_REGEX = /(http|https|www)(\.|:)/
9
9
 
10
- attr_reader :string
11
- def initialize(string:)
12
- @string = string
10
+ def hyperlink?(text)
11
+ !(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
13
12
  end
14
13
 
15
- def hyperlink?
16
- !(string !~ URI.regexp) && string !~ NON_HYPERLINK_REGEX && !(string !~ HYPERLINK_REGEX)
17
- end
18
-
19
- def replace
20
- new_string = string.dup
21
- string.split(/\s+/).each do |token|
22
- if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">')
23
- new_string = new_string.gsub(/#{Regexp.escape(token.split('">')[0].gsub(/\.\z/, ''))}/, ' <redacted> ')
24
- elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX)
25
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ')
26
- end
27
- end
28
- new_string
14
+ def replace(text)
15
+ text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted> ') if !(token !~ HYPERLINK_REGEX) }
16
+ text
29
17
  end
30
18
  end
31
19
  end