confidential_info_redactor_lite 0.0.34 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 05d26fbe6fe84f3a1f695b05a47ff9ae555cbb06
4
- data.tar.gz: 9be6e91bf1e96e8f237820ac53de547ce534c9d6
3
+ metadata.gz: 9cc0f357a427f4cc05da05abd1d0c89544f18e34
4
+ data.tar.gz: f76b54f78599ac06388f649a95a48f3bafe9e248
5
5
  SHA512:
6
- metadata.gz: 528f42365aadf05514ec5d56d088e838392473c3cb80a8210302fc8a256bcbb9ed98817c2fd6d3034f99f0aef4b47f4ed67a393c654cdaecb84bc034f0eddd3d
7
- data.tar.gz: d1c900aefe94e6a45a1c7c28d8dd61ac23b39f46f6f0ea1076e126312148254aa84349527b0183221e3c4519997a6d865993ddb573e94ee3dfd37e45506042d4
6
+ metadata.gz: e79cdf4659e79523dccba90d68b64e6b963511eded9621c7db79f15864292f1508b2034354511199d5e140da71e3421ea3d1bbc5e617d67a0873aa4bb3ae6504
7
+ data.tar.gz: e2f843ce61d278521a4ebd75e3d71979947f9cd53d0fe4e825ca49d305fb86d53088d96c6bd43238e594ae77369ce4143fe61a2b19f5d704817d47d529c29f74
@@ -16,103 +16,33 @@ module ConfidentialInfoRedactorLite
16
16
 
17
17
  JA_DATE_REGEX_SHORT = /[0123456789]+月[0123456789]+日/
18
18
 
19
- attr_reader :string, :dow, :dow_abbr, :months, :months_abbr
20
- def initialize(string:, dow:, dow_abbr:, months:, months_abbr:)
21
- @string = string
19
+ attr_reader :dow, :dow_abbr, :months, :months_abbr
20
+ def initialize(dow:, dow_abbr:, months:, months_abbr:)
22
21
  @dow = dow
23
22
  @dow_abbr = dow_abbr
24
23
  @months = months
25
24
  @months_abbr = months_abbr
26
25
  end
27
26
 
28
- def includes_date?
29
- long_date || number_only_date
27
+ def includes_date?(text)
28
+ includes_long_date?(text) || includes_number_only_date?(text)
30
29
  end
31
30
 
32
- def replace
33
- return string unless dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
34
- new_string = string.dup
31
+ def replace(text)
32
+ return text unless is_an_array?
35
33
  counter = 0
36
- dow_abbr.each do |day|
37
- counter +=1 if string.include?('day')
38
- end
39
- new_string = new_string.gsub(JA_DATE_REGEX_LONG, '<redacted date>')
40
- new_string = new_string.gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
41
- if counter > 0
42
- dow_abbr.each do |day|
43
- months.each do |month|
44
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
45
- end
46
- months_abbr.each do |month|
47
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
48
- end
49
- end
50
- dow.each do |day|
51
- months.each do |month|
52
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
53
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
54
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
55
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
56
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
57
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
58
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
59
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
60
- end
61
- months_abbr.each do |month|
62
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
63
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
64
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
65
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
66
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
67
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
68
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
69
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
70
- end
71
- end
72
- else
73
- dow.each do |day|
74
- months.each do |month|
75
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
76
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
77
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
78
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
79
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
80
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
81
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
82
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
83
- end
84
- months_abbr.each do |month|
85
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
86
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
87
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
88
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
89
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
90
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
91
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
92
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
93
- end
94
- end
95
- dow_abbr.each do |day|
96
- months.each do |month|
97
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
98
- end
99
- months_abbr.each do |month|
100
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
101
- end
102
- end
103
- end
104
- new_string = new_string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
105
- .gsub(YMD_YDM_REGEX, ' <redacted date> ')
106
- .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
107
- .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
34
+ dow_abbr.map { |day| counter +=1 if text.include?('day') }
35
+ text = text.gsub(JA_DATE_REGEX_LONG, '<redacted date>').gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
36
+ text = redact_dates(counter, text)
37
+ redact_regex(text)
108
38
  end
109
39
 
110
- def occurences
111
- replace.scan(/<redacted date>/).size
40
+ def occurences(text)
41
+ replace(text).scan(/<redacted date>/).size
112
42
  end
113
43
 
114
- def replace_number_only_date
115
- string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
44
+ def replace_number_only_date(text)
45
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
116
46
  .gsub(YMD_YDM_REGEX, ' <redacted date> ')
117
47
  .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
118
48
  .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
@@ -120,45 +50,89 @@ module ConfidentialInfoRedactorLite
120
50
 
121
51
  private
122
52
 
123
- def long_date
124
- match_found = false
53
+ def is_an_array?
54
+ dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
55
+ end
56
+
57
+ def redact_dates(counter, text)
58
+ if counter > 0
59
+ text = redact_dow_abbr(text)
60
+ text = redact_dow(text)
61
+ else
62
+ text = redact_dow(text)
63
+ text = redact_dow_abbr(text)
64
+ end
65
+ text
66
+ end
67
+
68
+ def redact_regex(text)
69
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
70
+ .gsub(YMD_YDM_REGEX, ' <redacted date> ')
71
+ .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
72
+ .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
73
+ end
74
+
75
+ def redact_dow(text)
125
76
  dow.each do |day|
126
- months.each do |month|
127
- break if match_found
128
- match_found = check_for_matches(day, month)
129
- end
130
- months_abbr.each do |month|
131
- break if match_found
132
- match_found = check_for_matches(day, month)
133
- end
77
+ months.map { |month| text = redact_date(text, day, month) }
78
+ months_abbr.map { |month| text = redact_date(text, day, month) }
134
79
  end
80
+ text
81
+ end
82
+
83
+ def redact_dow_abbr(text)
84
+ dow_abbr.each do |day|
85
+ months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
86
+ months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
87
+ end
88
+ text
89
+ end
90
+
91
+ def redact_date(text, day, month)
92
+ text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
93
+ .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
94
+ .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
95
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
96
+ .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
97
+ .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
98
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
99
+ .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
100
+ end
101
+
102
+ def includes_long_date?(text)
103
+ includes_long_date_1?(text) || includes_long_date_2?(text)
104
+ end
105
+
106
+ def includes_long_date_1?(text)
107
+ dow.each do |day|
108
+ months.map { |month| return true if check_for_matches(day, month, text) }
109
+ months_abbr.map { |month| return true if check_for_matches(day, month, text) }
110
+ end
111
+ false
112
+ end
113
+
114
+ def includes_long_date_2?(text)
135
115
  dow_abbr.each do |day|
136
- months.each do |month|
137
- break if match_found
138
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
139
- end
140
- months_abbr.each do |month|
141
- break if match_found
142
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
143
- end
116
+ months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
117
+ months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
144
118
  end
145
- match_found
119
+ false
146
120
  end
147
121
 
148
- def number_only_date
149
- !(string !~ DMY_MDY_REGEX) ||
150
- !(string !~ YMD_YDM_REGEX) ||
151
- !(string !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
152
- !(string !~ DIGIT_ONLY_YEAR_LAST_REGEX)
122
+ def includes_number_only_date?(text)
123
+ !(text !~ DMY_MDY_REGEX) ||
124
+ !(text !~ YMD_YDM_REGEX) ||
125
+ !(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
126
+ !(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
153
127
  end
154
128
 
155
- def check_for_matches(day, month)
156
- !(string !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
157
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
158
- !(string !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
159
- !(string !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
160
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
161
- !(string !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
129
+ def check_for_matches(day, month, text)
130
+ !(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
131
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
132
+ !(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
133
+ !(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
134
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
135
+ !(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
162
136
  end
163
137
  end
164
138
  end
@@ -5,18 +5,17 @@ module ConfidentialInfoRedactorLite
5
5
  EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
6
6
 
7
7
  PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
8
- attr_reader :text, :language, :corpus
9
- def initialize(text:, corpus:, **args)
10
- @text = text.gsub(/[’‘]/, "'").freeze
8
+ attr_reader :language, :corpus
9
+ def initialize(corpus:, **args)
11
10
  @corpus = Set.new(corpus).freeze
12
11
  @language = args[:language] || 'en'
13
12
  end
14
13
 
15
- def extract
14
+ def extract(text)
16
15
  extracted_terms = []
17
- PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
16
+ PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
18
17
  initial_extracted_terms = extract_preliminary_terms(segment)
19
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && search_for_ngrams(initial_extracted_terms)
18
+ next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
20
19
  search_ngrams(initial_extracted_terms, extracted_terms)
21
20
  end
22
21
  extracted_terms.uniq.reject(&:empty?)
@@ -28,36 +27,54 @@ module ConfidentialInfoRedactorLite
28
27
  segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
29
28
  end
30
29
 
31
- def search_for_ngrams(tokens)
32
- in_corpus = true
33
- tokens.each do |ngram|
34
- ngram.split(PUNCTUATION_REGEX).each do |t|
35
- unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
36
- in_corpus = false
37
- end
38
- end
30
+ def in_corpus?(tokens)
31
+ tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
32
+ end
33
+
34
+ def clean_token(token)
35
+ token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
36
+ end
37
+
38
+ def non_confidential_token?(token, includes_confidential)
39
+ corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
40
+ end
41
+
42
+ def singular_in_corpus?(token)
43
+ corpus.include?(token[0...-1]) &&
44
+ token[-1].eql?('s')
45
+ end
46
+
47
+ def includes_confidential?(token)
48
+ token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
49
+ true
50
+ end
51
+
52
+ def matching_first_token?(tokens)
53
+ corpus.include?(tokens[0]) &&
54
+ tokens[0] != 'the' &&
55
+ tokens[0] != 'deutsche' &&
56
+ tokens.length.eql?(2)
57
+ end
58
+
59
+ def find_extracted_terms(string, extracted_terms)
60
+ cleaned_token_downcased = clean_token(string.downcase)
61
+ cleaned_token = clean_token(string)
62
+ tokens = cleaned_token_downcased.split(' ')
63
+ if matching_first_token?(tokens)
64
+ extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
65
+ else
66
+ extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
39
67
  end
40
- in_corpus
68
+ extracted_terms
41
69
  end
42
70
 
43
71
  def search_ngrams(tokens, extracted_terms)
44
72
  tokens.each do |ngram|
45
73
  ngram.split(PUNCTUATION_REGEX).each do |t|
46
74
  next if !(t !~ /.*\d+.*/)
47
- if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
48
- extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
49
- else
50
- tracker = true
51
- unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
52
- t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
53
- tracker = false if corpus.include?(token.downcase)
54
- end
55
- end
56
- extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
57
- end
75
+ extracted_terms = find_extracted_terms(t, extracted_terms)
58
76
  end
59
77
  end
60
- extracted_terms
61
78
  end
62
79
  end
63
80
  end
@@ -4,17 +4,9 @@ module ConfidentialInfoRedactorLite
4
4
  # Rubular: http://rubular.com/r/fXa4lp0gfS
5
5
  HYPERLINK_REGEX = /(http|https|www)(\.|:)/
6
6
 
7
- attr_reader :string
8
- def initialize(string:)
9
- @string = string
10
- end
11
-
12
- def replace
13
- new_string = string.dup
14
- string.split(/\s+/).each do |token|
15
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
16
- end
17
- new_string
7
+ def replace(text)
8
+ text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX) }
9
+ text
18
10
  end
19
11
  end
20
12
  end
@@ -9,9 +9,8 @@ module ConfidentialInfoRedactorLite
9
9
  # Rubular: http://rubular.com/r/mxcj2G0Jfa
10
10
  EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
11
11
 
12
- attr_reader :text, :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
13
- def initialize(text:, dow:, dow_abbr:, months:, months_abbr:, **args)
14
- @text = text
12
+ attr_reader :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
13
+ def initialize(dow:, dow_abbr:, months:, months_abbr:, **args)
15
14
  @language = args[:language] || 'en'
16
15
  @tokens = args[:tokens]
17
16
  @number_text = args[:number_text] || '<redacted number>'
@@ -29,52 +28,52 @@ module ConfidentialInfoRedactorLite
29
28
  @months_abbr = months_abbr
30
29
  end
31
30
 
32
- def dates
31
+ def dates(text)
33
32
  return '' if text.nil?
34
33
  redact_dates(text)
35
34
  end
36
35
 
37
- def dates_html
36
+ def dates_html(text)
38
37
  return [] if text.nil?
39
38
  redact_dates_html(text)
40
39
  end
41
40
 
42
- def numbers
41
+ def numbers(text)
43
42
  return '' if text.nil?
44
43
  redact_numbers(text)
45
44
  end
46
45
 
47
- def numbers_html
46
+ def numbers_html(text)
48
47
  return [] if text.nil?
49
48
  redact_numbers_html(text)
50
49
  end
51
50
 
52
- def emails
51
+ def emails(text)
53
52
  return '' if text.nil?
54
53
  redact_emails(text)
55
54
  end
56
55
 
57
- def emails_html
56
+ def emails_html(text)
58
57
  return [] if text.nil?
59
58
  redact_emails_html(text)
60
59
  end
61
60
 
62
- def hyperlinks
61
+ def hyperlinks(text)
63
62
  return '' if text.nil?
64
63
  redact_hyperlinks(text)
65
64
  end
66
65
 
67
- def hyperlinks_html
66
+ def hyperlinks_html(text)
68
67
  return [] if text.nil?
69
68
  redact_hyperlinks_html(text)
70
69
  end
71
70
 
72
- def proper_nouns
71
+ def proper_nouns(text)
73
72
  return '' if text.nil?
74
73
  redact_tokens(text)
75
74
  end
76
75
 
77
- def redact
76
+ def redact(text)
78
77
  return '' if text.nil?
79
78
  if ignore_emails
80
79
  redacted_text = text
@@ -87,7 +86,7 @@ module ConfidentialInfoRedactorLite
87
86
  redact_tokens(redacted_text)
88
87
  end
89
88
 
90
- def redact_html
89
+ def redact_html(text)
91
90
  return [] if text.nil?
92
91
  redacted_text = redact_dates_html(text)[0]
93
92
  redacted_text = redact_emails_html(redacted_text)[0]
@@ -183,11 +182,11 @@ module ConfidentialInfoRedactorLite
183
182
  end
184
183
 
185
184
  def redact_hyperlinks(txt)
186
- ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
185
+ ConfidentialInfoRedactorLite::Hyperlink.new.replace(txt).gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
187
186
  end
188
187
 
189
188
  def redact_dates(txt)
190
- ConfidentialInfoRedactorLite::Date.new(string: txt, dow: dow, dow_abbr: dow_abbr, months: months, months_abbr: months_abbr).replace.gsub(/<redacted date>/, "#{date_text}").gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ").gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ").gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
189
+ ConfidentialInfoRedactorLite::Date.new(dow: dow, dow_abbr: dow_abbr, months: months, months_abbr: months_abbr).replace(txt).gsub(/<redacted date>/, "#{date_text}").gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ").gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ").gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
191
190
  end
192
191
 
193
192
  def redact_numbers(txt)