confidential_info_redactor_lite 0.0.34 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 05d26fbe6fe84f3a1f695b05a47ff9ae555cbb06
4
- data.tar.gz: 9be6e91bf1e96e8f237820ac53de547ce534c9d6
3
+ metadata.gz: 9cc0f357a427f4cc05da05abd1d0c89544f18e34
4
+ data.tar.gz: f76b54f78599ac06388f649a95a48f3bafe9e248
5
5
  SHA512:
6
- metadata.gz: 528f42365aadf05514ec5d56d088e838392473c3cb80a8210302fc8a256bcbb9ed98817c2fd6d3034f99f0aef4b47f4ed67a393c654cdaecb84bc034f0eddd3d
7
- data.tar.gz: d1c900aefe94e6a45a1c7c28d8dd61ac23b39f46f6f0ea1076e126312148254aa84349527b0183221e3c4519997a6d865993ddb573e94ee3dfd37e45506042d4
6
+ metadata.gz: e79cdf4659e79523dccba90d68b64e6b963511eded9621c7db79f15864292f1508b2034354511199d5e140da71e3421ea3d1bbc5e617d67a0873aa4bb3ae6504
7
+ data.tar.gz: e2f843ce61d278521a4ebd75e3d71979947f9cd53d0fe4e825ca49d305fb86d53088d96c6bd43238e594ae77369ce4143fe61a2b19f5d704817d47d529c29f74
@@ -16,103 +16,33 @@ module ConfidentialInfoRedactorLite
16
16
 
17
17
  JA_DATE_REGEX_SHORT = /[0123456789]+月[0123456789]+日/
18
18
 
19
- attr_reader :string, :dow, :dow_abbr, :months, :months_abbr
20
- def initialize(string:, dow:, dow_abbr:, months:, months_abbr:)
21
- @string = string
19
+ attr_reader :dow, :dow_abbr, :months, :months_abbr
20
+ def initialize(dow:, dow_abbr:, months:, months_abbr:)
22
21
  @dow = dow
23
22
  @dow_abbr = dow_abbr
24
23
  @months = months
25
24
  @months_abbr = months_abbr
26
25
  end
27
26
 
28
- def includes_date?
29
- long_date || number_only_date
27
+ def includes_date?(text)
28
+ includes_long_date?(text) || includes_number_only_date?(text)
30
29
  end
31
30
 
32
- def replace
33
- return string unless dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
34
- new_string = string.dup
31
+ def replace(text)
32
+ return text unless is_an_array?
35
33
  counter = 0
36
- dow_abbr.each do |day|
37
- counter +=1 if string.include?('day')
38
- end
39
- new_string = new_string.gsub(JA_DATE_REGEX_LONG, '<redacted date>')
40
- new_string = new_string.gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
41
- if counter > 0
42
- dow_abbr.each do |day|
43
- months.each do |month|
44
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
45
- end
46
- months_abbr.each do |month|
47
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
48
- end
49
- end
50
- dow.each do |day|
51
- months.each do |month|
52
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
53
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
54
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
55
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
56
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
57
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
58
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
59
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
60
- end
61
- months_abbr.each do |month|
62
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
63
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
64
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
65
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
66
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
67
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
68
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
69
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
70
- end
71
- end
72
- else
73
- dow.each do |day|
74
- months.each do |month|
75
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
76
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
77
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
78
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
79
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
80
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
81
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
82
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
83
- end
84
- months_abbr.each do |month|
85
- new_string = new_string.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
86
- .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
87
- .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
88
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
89
- .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
90
- .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
91
- .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
92
- .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
93
- end
94
- end
95
- dow_abbr.each do |day|
96
- months.each do |month|
97
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
98
- end
99
- months_abbr.each do |month|
100
- new_string = new_string.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
101
- end
102
- end
103
- end
104
- new_string = new_string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
105
- .gsub(YMD_YDM_REGEX, ' <redacted date> ')
106
- .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
107
- .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
34
+ dow_abbr.map { |day| counter +=1 if text.include?('day') }
35
+ text = text.gsub(JA_DATE_REGEX_LONG, '<redacted date>').gsub(JA_DATE_REGEX_SHORT, '<redacted date>')
36
+ text = redact_dates(counter, text)
37
+ redact_regex(text)
108
38
  end
109
39
 
110
- def occurences
111
- replace.scan(/<redacted date>/).size
40
+ def occurences(text)
41
+ replace(text).scan(/<redacted date>/).size
112
42
  end
113
43
 
114
- def replace_number_only_date
115
- string.gsub(DMY_MDY_REGEX, ' <redacted date> ')
44
+ def replace_number_only_date(text)
45
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
116
46
  .gsub(YMD_YDM_REGEX, ' <redacted date> ')
117
47
  .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
118
48
  .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
@@ -120,45 +50,89 @@ module ConfidentialInfoRedactorLite
120
50
 
121
51
  private
122
52
 
123
- def long_date
124
- match_found = false
53
+ def is_an_array?
54
+ dow.kind_of?(Array) && dow_abbr.kind_of?(Array) && months.kind_of?(Array) && months_abbr.kind_of?(Array)
55
+ end
56
+
57
+ def redact_dates(counter, text)
58
+ if counter > 0
59
+ text = redact_dow_abbr(text)
60
+ text = redact_dow(text)
61
+ else
62
+ text = redact_dow(text)
63
+ text = redact_dow_abbr(text)
64
+ end
65
+ text
66
+ end
67
+
68
+ def redact_regex(text)
69
+ text.gsub(DMY_MDY_REGEX, ' <redacted date> ')
70
+ .gsub(YMD_YDM_REGEX, ' <redacted date> ')
71
+ .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' <redacted date> ')
72
+ .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' <redacted date> ')
73
+ end
74
+
75
+ def redact_dow(text)
125
76
  dow.each do |day|
126
- months.each do |month|
127
- break if match_found
128
- match_found = check_for_matches(day, month)
129
- end
130
- months_abbr.each do |month|
131
- break if match_found
132
- match_found = check_for_matches(day, month)
133
- end
77
+ months.map { |month| text = redact_date(text, day, month) }
78
+ months_abbr.map { |month| text = redact_date(text, day, month) }
134
79
  end
80
+ text
81
+ end
82
+
83
+ def redact_dow_abbr(text)
84
+ dow_abbr.each do |day|
85
+ months.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
86
+ months_abbr.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ') }
87
+ end
88
+ text
89
+ end
90
+
91
+ def redact_date(text, day, month)
92
+ text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
93
+ .gsub(/\d+\s+de\s+#{Regexp.escape(month)}\s\d{4}/i, ' <redacted date> ')
94
+ .gsub(/\d{2}(\.|-|\/)*\s?#{Regexp.escape(month)}(\.|-|\/)*\s?(\d{4}|\d{2})/i, ' <redacted date> ')
95
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i, ' <redacted date> ')
96
+ .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i, ' <redacted date> ')
97
+ .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i, ' <redacted date> ')
98
+ .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i, ' <redacted date> ')
99
+ .gsub(/#{Regexp.escape(month)}\sde\s\d+(rd|th|st)*/i, ' <redacted date> ')
100
+ end
101
+
102
+ def includes_long_date?(text)
103
+ includes_long_date_1?(text) || includes_long_date_2?(text)
104
+ end
105
+
106
+ def includes_long_date_1?(text)
107
+ dow.each do |day|
108
+ months.map { |month| return true if check_for_matches(day, month, text) }
109
+ months_abbr.map { |month| return true if check_for_matches(day, month, text) }
110
+ end
111
+ false
112
+ end
113
+
114
+ def includes_long_date_2?(text)
135
115
  dow_abbr.each do |day|
136
- months.each do |month|
137
- break if match_found
138
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i)
139
- end
140
- months_abbr.each do |month|
141
- break if match_found
142
- match_found = !(string !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i)
143
- end
116
+ months.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
117
+ months_abbr.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) }
144
118
  end
145
- match_found
119
+ false
146
120
  end
147
121
 
148
- def number_only_date
149
- !(string !~ DMY_MDY_REGEX) ||
150
- !(string !~ YMD_YDM_REGEX) ||
151
- !(string !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
152
- !(string !~ DIGIT_ONLY_YEAR_LAST_REGEX)
122
+ def includes_number_only_date?(text)
123
+ !(text !~ DMY_MDY_REGEX) ||
124
+ !(text !~ YMD_YDM_REGEX) ||
125
+ !(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) ||
126
+ !(text !~ DIGIT_ONLY_YEAR_LAST_REGEX)
153
127
  end
154
128
 
155
- def check_for_matches(day, month)
156
- !(string !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
157
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
158
- !(string !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
159
- !(string !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
160
- !(string !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
161
- !(string !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
129
+ def check_for_matches(day, month, text)
130
+ !(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
131
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) ||
132
+ !(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) ||
133
+ !(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) ||
134
+ !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) ||
135
+ !(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i)
162
136
  end
163
137
  end
164
138
  end
@@ -5,18 +5,17 @@ module ConfidentialInfoRedactorLite
5
5
  EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/
6
6
 
7
7
  PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/
8
- attr_reader :text, :language, :corpus
9
- def initialize(text:, corpus:, **args)
10
- @text = text.gsub(/[’‘]/, "'").freeze
8
+ attr_reader :language, :corpus
9
+ def initialize(corpus:, **args)
11
10
  @corpus = Set.new(corpus).freeze
12
11
  @language = args[:language] || 'en'
13
12
  end
14
13
 
15
- def extract
14
+ def extract(text)
16
15
  extracted_terms = []
17
- PragmaticSegmenter::Segmenter.new(text: text, language: language).segment.each do |segment|
16
+ PragmaticSegmenter::Segmenter.new(text: text.gsub(/[’‘]/, "'"), language: language).segment.each do |segment|
18
17
  initial_extracted_terms = extract_preliminary_terms(segment)
19
- next if initial_extracted_terms.length.eql?(segment.split(' ').length) && search_for_ngrams(initial_extracted_terms)
18
+ next if initial_extracted_terms.length.eql?(segment.split(' ').length) && !in_corpus?(initial_extracted_terms)
20
19
  search_ngrams(initial_extracted_terms, extracted_terms)
21
20
  end
22
21
  extracted_terms.uniq.reject(&:empty?)
@@ -28,36 +27,54 @@ module ConfidentialInfoRedactorLite
28
27
  segment.gsub(EXTRACT_REGEX).map { |match| match unless corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')) }.compact
29
28
  end
30
29
 
31
- def search_for_ngrams(tokens)
32
- in_corpus = true
33
- tokens.each do |ngram|
34
- ngram.split(PUNCTUATION_REGEX).each do |t|
35
- unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip)
36
- in_corpus = false
37
- end
38
- end
30
+ def in_corpus?(tokens)
31
+ tokens.map { |token| token.split(PUNCTUATION_REGEX).map { |t| return true if corpus.include?(clean_token(t.downcase)) } }
32
+ end
33
+
34
+ def clean_token(token)
35
+ token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
36
+ end
37
+
38
+ def non_confidential_token?(token, includes_confidential)
39
+ corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
40
+ end
41
+
42
+ def singular_in_corpus?(token)
43
+ corpus.include?(token[0...-1]) &&
44
+ token[-1].eql?('s')
45
+ end
46
+
47
+ def includes_confidential?(token)
48
+ token.split(' ').map { |t| return false if corpus.include?(t.downcase) } unless token.split(' ').length.eql?(2) && token.split(' ')[1].downcase.eql?('bank')
49
+ true
50
+ end
51
+
52
+ def matching_first_token?(tokens)
53
+ corpus.include?(tokens[0]) &&
54
+ tokens[0] != 'the' &&
55
+ tokens[0] != 'deutsche' &&
56
+ tokens.length.eql?(2)
57
+ end
58
+
59
+ def find_extracted_terms(string, extracted_terms)
60
+ cleaned_token_downcased = clean_token(string.downcase)
61
+ cleaned_token = clean_token(string)
62
+ tokens = cleaned_token_downcased.split(' ')
63
+ if matching_first_token?(tokens)
64
+ extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
65
+ else
66
+ extracted_terms << cleaned_token unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
39
67
  end
40
- in_corpus
68
+ extracted_terms
41
69
  end
42
70
 
43
71
  def search_ngrams(tokens, extracted_terms)
44
72
  tokens.each do |ngram|
45
73
  ngram.split(PUNCTUATION_REGEX).each do |t|
46
74
  next if !(t !~ /.*\d+.*/)
47
- if corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'the' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[0] != 'deutsche' && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2)
48
- extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1] unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip.split(' ')[1])
49
- else
50
- tracker = true
51
- unless t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').length.eql?(2) && t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ')[1].downcase.eql?('bank')
52
- t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip.split(' ').each do |token|
53
- tracker = false if corpus.include?(token.downcase)
54
- end
55
- end
56
- extracted_terms << t.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip unless corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip) || !tracker || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('en')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('es')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-2]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-2..-1].eql?('er')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('s')) || (corpus.include?(t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[0...-1]) && t.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').strip[-1].eql?('n'))
57
- end
75
+ extracted_terms = find_extracted_terms(t, extracted_terms)
58
76
  end
59
77
  end
60
- extracted_terms
61
78
  end
62
79
  end
63
80
  end
@@ -4,17 +4,9 @@ module ConfidentialInfoRedactorLite
4
4
  # Rubular: http://rubular.com/r/fXa4lp0gfS
5
5
  HYPERLINK_REGEX = /(http|https|www)(\.|:)/
6
6
 
7
- attr_reader :string
8
- def initialize(string:)
9
- @string = string
10
- end
11
-
12
- def replace
13
- new_string = string.dup
14
- string.split(/\s+/).each do |token|
15
- new_string = new_string.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX)
16
- end
17
- new_string
7
+ def replace(text)
8
+ text.split(/\s+/).map { |token| text = text.gsub(/#{Regexp.escape(token.gsub(/\.\z/, ''))}/, ' <redacted hyperlink> ') if !(token !~ HYPERLINK_REGEX) }
9
+ text
18
10
  end
19
11
  end
20
12
  end
@@ -9,9 +9,8 @@ module ConfidentialInfoRedactorLite
9
9
  # Rubular: http://rubular.com/r/mxcj2G0Jfa
10
10
  EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i
11
11
 
12
- attr_reader :text, :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
13
- def initialize(text:, dow:, dow_abbr:, months:, months_abbr:, **args)
14
- @text = text
12
+ attr_reader :language, :email_text, :hyperlink_text, :number_text, :date_text, :token_text, :tokens, :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks, :dow, :dow_abbr, :months, :months_abbr
13
+ def initialize(dow:, dow_abbr:, months:, months_abbr:, **args)
15
14
  @language = args[:language] || 'en'
16
15
  @tokens = args[:tokens]
17
16
  @number_text = args[:number_text] || '<redacted number>'
@@ -29,52 +28,52 @@ module ConfidentialInfoRedactorLite
29
28
  @months_abbr = months_abbr
30
29
  end
31
30
 
32
- def dates
31
+ def dates(text)
33
32
  return '' if text.nil?
34
33
  redact_dates(text)
35
34
  end
36
35
 
37
- def dates_html
36
+ def dates_html(text)
38
37
  return [] if text.nil?
39
38
  redact_dates_html(text)
40
39
  end
41
40
 
42
- def numbers
41
+ def numbers(text)
43
42
  return '' if text.nil?
44
43
  redact_numbers(text)
45
44
  end
46
45
 
47
- def numbers_html
46
+ def numbers_html(text)
48
47
  return [] if text.nil?
49
48
  redact_numbers_html(text)
50
49
  end
51
50
 
52
- def emails
51
+ def emails(text)
53
52
  return '' if text.nil?
54
53
  redact_emails(text)
55
54
  end
56
55
 
57
- def emails_html
56
+ def emails_html(text)
58
57
  return [] if text.nil?
59
58
  redact_emails_html(text)
60
59
  end
61
60
 
62
- def hyperlinks
61
+ def hyperlinks(text)
63
62
  return '' if text.nil?
64
63
  redact_hyperlinks(text)
65
64
  end
66
65
 
67
- def hyperlinks_html
66
+ def hyperlinks_html(text)
68
67
  return [] if text.nil?
69
68
  redact_hyperlinks_html(text)
70
69
  end
71
70
 
72
- def proper_nouns
71
+ def proper_nouns(text)
73
72
  return '' if text.nil?
74
73
  redact_tokens(text)
75
74
  end
76
75
 
77
- def redact
76
+ def redact(text)
78
77
  return '' if text.nil?
79
78
  if ignore_emails
80
79
  redacted_text = text
@@ -87,7 +86,7 @@ module ConfidentialInfoRedactorLite
87
86
  redact_tokens(redacted_text)
88
87
  end
89
88
 
90
- def redact_html
89
+ def redact_html(text)
91
90
  return [] if text.nil?
92
91
  redacted_text = redact_dates_html(text)[0]
93
92
  redacted_text = redact_emails_html(redacted_text)[0]
@@ -183,11 +182,11 @@ module ConfidentialInfoRedactorLite
183
182
  end
184
183
 
185
184
  def redact_hyperlinks(txt)
186
- ConfidentialInfoRedactorLite::Hyperlink.new(string: txt).replace.gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
185
+ ConfidentialInfoRedactorLite::Hyperlink.new.replace(txt).gsub(/<redacted hyperlink>/, "#{hyperlink_text}").gsub(/\s*#{Regexp.escape(hyperlink_text)}\s*/, " #{hyperlink_text} ").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\.{1}/, "#{hyperlink_text}.").gsub(/#{Regexp.escape(hyperlink_text)}\s{1}\,{1}/, "#{hyperlink_text},")
187
186
  end
188
187
 
189
188
  def redact_dates(txt)
190
- ConfidentialInfoRedactorLite::Date.new(string: txt, dow: dow, dow_abbr: dow_abbr, months: months, months_abbr: months_abbr).replace.gsub(/<redacted date>/, "#{date_text}").gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ").gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ").gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
189
+ ConfidentialInfoRedactorLite::Date.new(dow: dow, dow_abbr: dow_abbr, months: months, months_abbr: months_abbr).replace(txt).gsub(/<redacted date>/, "#{date_text}").gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ").gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ").gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
191
190
  end
192
191
 
193
192
  def redact_numbers(txt)