confidential_info_redactor_lite 0.0.34 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,27 +6,27 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
6
6
  context 'English (en)' do
7
7
  it 'extracts the proper nouns from a text #001' do
8
8
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.'
9
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['Coca-Cola', 'Pepsi'])
9
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi'])
10
10
  end
11
11
 
12
12
  it 'extracts the proper nouns from a text #002' do
13
13
  text = 'Coca-Cola announced a merger with Pepsi.'
14
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['Coca-Cola', 'Pepsi'])
14
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi'])
15
15
  end
16
16
 
17
17
  it 'extracts the proper nouns from a text #003' do
18
18
  text = 'Many employees of Deutsche Bank are looking for another job.'
19
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['Deutsche Bank'])
19
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank'])
20
20
  end
21
21
 
22
22
  it 'extracts the proper nouns from a text #004' do
23
23
  text = 'Many employees of Deutsche Bank are looking for another job while those from Pepsi are not.'
24
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['Deutsche Bank', 'Pepsi'])
24
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi'])
25
25
  end
26
26
 
27
27
  it 'extracts the proper nouns from a text #005' do
28
28
  text = 'There are many employees at Deutsche Bank. Some are thinking about drinking Pepsi, Coke, or Sprite.'
29
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['Deutsche Bank', 'Pepsi', 'Coke', 'Sprite'])
29
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi', 'Coke', 'Sprite'])
30
30
  end
31
31
 
32
32
  it 'extracts the proper nouns from a text #006' do
@@ -87,99 +87,104 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
87
87
 
88
88
  Don’t forget to use your imagination and creativity!
89
89
  EOF
90
- expect(described_class.new(text: text, corpus: corpus).extract).to eq(["PGA", "iTunes", "YouTube", "Flickr", "Picasa", "Photobucket"])
90
+ expect(described_class.new(corpus: corpus).extract(text)).to eq(["PGA", "iTunes", "YouTube", "Flickr", "Picasa", "Photobucket"])
91
91
  end
92
92
 
93
93
  it 'extracts the proper nouns from a text #007' do
94
94
  text = 'I learned that Apple has plans to release a new iPhone, iPad and iWatch.'
95
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['Apple', 'iPhone', 'iPad', 'iWatch'])
95
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Apple', 'iPhone', 'iPad', 'iWatch'])
96
96
  end
97
97
 
98
98
  it 'extracts the proper nouns from a text #008' do
99
99
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
100
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Coca-Cola", "Pepsi", "John Smith"])
100
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Coca-Cola", "Pepsi", "John Smith"])
101
101
  end
102
102
 
103
103
  it 'extracts the proper nouns from a text #009' do
104
104
  text = 'Then Peter went to the store.'
105
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Peter"])
105
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Peter"])
106
106
  end
107
107
 
108
108
  it 'extracts the proper nouns from a text #010' do
109
109
  text = 'HOW TO COOK VEGETABLES'
110
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
110
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
111
111
  end
112
112
 
113
113
  it 'extracts the proper nouns from a text #011' do
114
114
  text = 'All Natural Peanut Butter'
115
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
115
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
116
116
  end
117
117
 
118
118
  it 'extracts the proper nouns from a text #012' do
119
119
  text = 'GOOD CARBS VS. BAD CARBS'
120
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
120
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
121
121
  end
122
122
 
123
123
  it 'extracts the proper nouns from a text #013' do
124
124
  text = 'Reducing”'
125
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
125
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
126
126
  end
127
127
 
128
128
  it 'extracts the proper nouns from a text #014' do
129
129
  text = '”'
130
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
130
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
131
131
  end
132
132
 
133
133
  it 'extracts the proper nouns from a text #015' do
134
134
  text = '“Reducing'
135
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq([])
135
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
136
136
  end
137
137
 
138
138
  it 'extracts the proper nouns from a text #016' do
139
139
  text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
140
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
140
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
141
141
  end
142
142
 
143
143
  it 'extracts the proper nouns from a text #017' do
144
144
  text = 'John'
145
- expect(described_class.new(text: text, corpus: corpus, language: 'en').extract).to eq(['John'])
145
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['John'])
146
+ end
147
+
148
+ it 'extracts the proper nouns from a text #018' do
149
+ text = 'John and Jane Doe'
150
+ expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["John", "Jane Doe"])
146
151
  end
147
152
  end
148
153
 
149
154
  context 'German (de)' do
150
155
  it 'extracts the proper nouns from a text #001' do
151
156
  text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
152
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutschen Bank'])
157
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutschen Bank'])
153
158
  end
154
159
 
155
160
  it 'extracts the proper nouns from a text #002' do
156
161
  text = 'Viele Mitarbeiter der Deutsche Bank suchen eine andere Arbeitsstelle.'
157
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
162
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
158
163
  end
159
164
 
160
165
  it 'extracts the proper nouns from a text #003' do
161
166
  text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
162
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
167
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
163
168
  end
164
169
 
165
170
  it 'extracts the proper nouns from a text #004' do
166
171
  text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
167
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
172
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
168
173
  end
169
174
 
170
175
  it 'extracts the proper nouns from a text #005' do
171
176
  text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
172
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
177
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
173
178
  end
174
179
 
175
180
  it 'extracts the proper nouns from a text #006' do
176
181
  text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
177
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
182
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
178
183
  end
179
184
 
180
185
  it 'extracts the proper nouns from a text #007' do
181
186
  text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
182
- expect(described_class.new(text: text, corpus: corpus, language: 'de').extract).to eq(['Deutsche Bank'])
187
+ expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
183
188
  end
184
189
  end
185
190
  end
@@ -4,14 +4,14 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
4
4
  context '#replace' do
5
5
  it 'replaces the hyperlinks in a string with regular tokens #001' do
6
6
  string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
7
- ws = described_class.new(string: string)
8
- expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
7
+ ws = described_class.new
8
+ expect(ws.replace(string)).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
9
9
  end
10
10
 
11
11
  it 'replaces the hyperlinks in a string with regular tokens #002' do
12
12
  string = 'The file location is c:\Users\johndoe or d:\Users\john\www'
13
- ws = described_class.new(string: string)
14
- expect(ws.replace).to eq('The file location is c:\Users\johndoe or d:\Users\john\www')
13
+ ws = described_class.new
14
+ expect(ws.replace(string)).to eq('The file location is c:\Users\johndoe or d:\Users\john\www')
15
15
  end
16
16
  end
17
17
  end
@@ -12,29 +12,35 @@ describe ConfidentialInfoRedactorLite do
12
12
  en_months = %w(january february march april may june july august september october november december)
13
13
  en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
14
14
  benchmark do
15
- extraction = ConfidentialInfoRedactorLite::Extractor.new(text: text, corpus: CORPUS).extract
16
- expect(extraction).to eq(["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"])
15
+ extraction = ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS).extract(text)
17
16
  end
18
17
  benchmark do
19
- ConfidentialInfoRedactorLite::Redactor.new(text: text, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates
18
+ ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
20
19
  end
21
20
  benchmark do
22
- ConfidentialInfoRedactorLite::Redactor.new(text: text, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers
21
+ text2 = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
22
+ ci = ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS)
23
+ 100.times do
24
+ ci.extract(text2)
25
+ end
23
26
  end
24
27
  benchmark do
25
- ConfidentialInfoRedactorLite::Redactor.new(text: text, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails
28
+ ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)
26
29
  end
27
30
  benchmark do
28
- ConfidentialInfoRedactorLite::Redactor.new(text: text, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks
31
+ ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)
29
32
  end
30
33
  benchmark do
31
- ConfidentialInfoRedactorLite::Redactor.new(text: text, tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns
34
+ ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)
32
35
  end
33
36
  benchmark do
34
- ConfidentialInfoRedactorLite::Redactor.new(text: text, tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact
37
+ ConfidentialInfoRedactorLite::Redactor.new(tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)
38
+ end
39
+ benchmark do
40
+ ConfidentialInfoRedactorLite::Redactor.new(tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
35
41
  end
36
42
  # data = StackProf.run(mode: :cpu, interval: 1000) do
37
- # ConfidentialInfoRedactorLite::Extractor.new(text: text, corpus: CORPUS).extract
43
+ # ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS).extract(text)
38
44
  # end
39
45
  # puts StackProf::Report.new(data).print_text
40
46
  end
@@ -46,7 +52,7 @@ describe ConfidentialInfoRedactorLite do
46
52
  en_months = %w(january february march april may june july august september october november december)
47
53
  en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
48
54
  benchmark do
49
- ConfidentialInfoRedactorLite::Redactor.new(text: text, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates
55
+ ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
50
56
  end
51
57
  end
52
58
  end
@@ -9,149 +9,149 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
9
9
 
10
10
  describe '#dates' do
11
11
  it 'handles nil as a text argument' do
12
- expect(described_class.new(text: nil, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates).to eq('')
12
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(nil)).to eq('')
13
13
  end
14
14
 
15
15
  it 'redacts dates from a text #001' do
16
16
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.'
17
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.')
17
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.')
18
18
  end
19
19
 
20
20
  it 'redacts dates from a text #002' do
21
21
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
22
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date>.')
22
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date>.')
23
23
  end
24
24
 
25
25
  it 'redacts dates from a text #003' do
26
26
  text = 'December 5, 2010 - Coca-Cola announced a merger with Pepsi.'
27
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates).to eq('<redacted date> - Coca-Cola announced a merger with Pepsi.')
27
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('<redacted date> - Coca-Cola announced a merger with Pepsi.')
28
28
  end
29
29
 
30
30
  it 'redacts dates from a text #004' do
31
31
  text = 'The scavenger hunt ends on Dec. 31st, 2011.'
32
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates).to eq('The scavenger hunt ends on <redacted date>.')
32
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('The scavenger hunt ends on <redacted date>.')
33
33
  end
34
34
 
35
35
  it 'handles nil date objects' do
36
36
  text = 'The scavenger hunt ends on Dec. 31st, 2011.'
37
- expect(described_class.new(text: text, language: 'en', dow: nil, dow_abbr: nil, months: nil, months_abbr: nil).dates).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
37
+ expect(described_class.new(language: 'en', dow: nil, dow_abbr: nil, months: nil, months_abbr: nil).dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
38
38
  end
39
39
 
40
40
  it 'handles empty string date objects' do
41
41
  text = 'The scavenger hunt ends on Dec. 31st, 2011.'
42
- expect(described_class.new(text: text, language: 'en', dow: '', dow_abbr: '', months: '', months_abbr: '').dates).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
42
+ expect(described_class.new(language: 'en', dow: '', dow_abbr: '', months: '', months_abbr: '').dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
43
43
  end
44
44
 
45
45
  it 'handles empty array date objects' do
46
46
  text = 'The scavenger hunt ends on Dec. 31st, 2011.'
47
- expect(described_class.new(text: text, language: 'en', dow: [], dow_abbr: [], months: [], months_abbr: []).dates).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
47
+ expect(described_class.new(language: 'en', dow: [], dow_abbr: [], months: [], months_abbr: []).dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
48
48
  end
49
49
  end
50
50
 
51
51
  describe '#dates_html' do
52
52
  it 'handles nil as a text argument' do
53
- expect(described_class.new(text: nil, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html).to eq([])
53
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(nil)).to eq([])
54
54
  end
55
55
 
56
56
  it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
57
57
  text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
58
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
58
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(text)).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
59
59
  end
60
60
 
61
61
  it 'surrounds the redacted dates in spans and return the redacted dates from a text #002' do
62
62
  text = '2011年12月31日です。'
63
- expect(described_class.new(text: text, language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html).to eq(["<span class='confidentialDate'>*****</span> です。", ["2011年12月31日"]])
63
+ expect(described_class.new(language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(text)).to eq(["<span class='confidentialDate'>*****</span> です。", ["2011年12月31日"]])
64
64
  end
65
65
  end
66
66
 
67
67
  describe '#numbers' do
68
68
  it 'handles nil as a text argument' do
69
- expect(described_class.new(text: nil, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('')
69
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(nil)).to eq('')
70
70
  end
71
71
 
72
72
  it 'redacts numbers from a text #001' do
73
73
  text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
74
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for <redacted number>.')
74
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for <redacted number>.')
75
75
  end
76
76
 
77
77
  it 'redacts numbers from a text #002' do
78
78
  text = '200 years ago.'
79
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('<redacted number> years ago.')
79
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('<redacted number> years ago.')
80
80
  end
81
81
 
82
82
  it 'redacts numbers from a text #003' do
83
83
  text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick.'
84
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('It was his <redacted number> time, not yet his <redacted number>, not even his <redacted number>. The wood was <redacted number> thick.')
84
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('It was his <redacted number> time, not yet his <redacted number>, not even his <redacted number>. The wood was <redacted number> thick.')
85
85
  end
86
86
 
87
87
  it 'redacts numbers from a text #004' do
88
88
  text = 'Checking file of %2'
89
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('Checking file of <redacted number>')
89
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('Checking file of <redacted number>')
90
90
  end
91
91
 
92
92
  it 'redacts numbers from a text #005' do
93
93
  text = 'zawiera pliki skompresowane (%2).'
94
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('zawiera pliki skompresowane (<redacted number>).')
94
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('zawiera pliki skompresowane (<redacted number>).')
95
95
  end
96
96
 
97
97
  it 'redacts numbers from a text #006' do
98
98
  text = '2134か24か0'
99
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq("<redacted number> か <redacted number> か <redacted number>")
99
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq("<redacted number> か <redacted number> か <redacted number>")
100
100
  end
101
101
 
102
102
  it 'redacts numbers from a text #007' do
103
103
  text = '100'
104
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers).to eq('<redacted number>')
104
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('<redacted number>')
105
105
  end
106
106
  end
107
107
 
108
108
  describe '#numbers_html' do
109
109
  it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
110
110
  text = 'It was his 1st) time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
111
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html).to eq(["It was his <span class='confidentialNumber'>*****</span>) time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
111
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html(text)).to eq(["It was his <span class='confidentialNumber'>*****</span>) time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
112
112
  end
113
113
 
114
114
  it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #002' do
115
115
  text = 'プロのミニチュアゴルファー2人のサイン。2人の出身国は別であること。(45ポイント;それぞれが別の大陸出身だった場合、5ボーナスポイント。)'
116
- expect(described_class.new(text: text, language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html).to eq(["プロのミニチュアゴルファー <span class='confidentialNumber'>*****</span> 人のサイン。 <span class='confidentialNumber'>*****</span> 人の出身国は別であること。( <span class='confidentialNumber'>*****</span> ポイント;それぞれが別の大陸出身だった場合、 <span class='confidentialNumber'>*****</span> ボーナスポイント。)", ["2", "2", "45", "5"]])
116
+ expect(described_class.new(language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html(text)).to eq(["プロのミニチュアゴルファー <span class='confidentialNumber'>*****</span> 人のサイン。 <span class='confidentialNumber'>*****</span> 人の出身国は別であること。( <span class='confidentialNumber'>*****</span> ポイント;それぞれが別の大陸出身だった場合、 <span class='confidentialNumber'>*****</span> ボーナスポイント。)", ["2", "2", "45", "5"]])
117
117
  end
118
118
  end
119
119
 
120
120
  describe '#emails' do
121
121
  it 'redacts email addresses from a text #001' do
122
122
  text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
123
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is <redacted email> or you can try <redacted email>.')
123
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)).to eq('His email is <redacted email> or you can try <redacted email>.')
124
124
  end
125
125
 
126
126
  it 'redacts email addresses from a text #002' do
127
127
  text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
128
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
128
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
129
129
  end
130
130
  end
131
131
 
132
132
  describe '#emails_html' do
133
133
  it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
134
134
  text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
135
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
135
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html(text)).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
136
136
  end
137
137
  end
138
138
 
139
139
  describe '#hyperlinks' do
140
140
  it 'redacts hyperlinks from a text #001' do
141
141
  text = 'Visit https://www.tm-town.com for more info.'
142
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
142
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)).to eq('Visit <redacted hyperlink> for more info.')
143
143
  end
144
144
 
145
145
  it 'redacts hyperlinks from a text #002' do
146
146
  text = 'Visit www.tm-town.com for more info.'
147
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks).to eq('Visit <redacted hyperlink> for more info.')
147
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)).to eq('Visit <redacted hyperlink> for more info.')
148
148
  end
149
149
  end
150
150
 
151
151
  describe '#hyperlinks_html' do
152
152
  it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
153
153
  text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
154
- expect(described_class.new(text: text, language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
154
+ expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html(text)).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
155
155
  end
156
156
  end
157
157
 
@@ -159,13 +159,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
159
159
  it 'redacts tokens from a text #001' do
160
160
  tokens = ['Coca-Cola', 'Pepsi']
161
161
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
162
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns).to eq('<redacted> announced a merger with <redacted> that will happen on on December 15th, 2020 for $200,000,000,000.')
162
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)).to eq('<redacted> announced a merger with <redacted> that will happen on on December 15th, 2020 for $200,000,000,000.')
163
163
  end
164
164
 
165
165
  it 'redacts tokens from a text #002' do
166
166
  tokens = ['Coca-Cola', 'Pepsi']
167
167
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
168
- expect(described_class.new(text: text, language: 'en', tokens: tokens, token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns).to eq('***** announced a merger with ***** that will happen on on December 15th, 2020 for $200,000,000,000.')
168
+ expect(described_class.new(language: 'en', tokens: tokens, token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)).to eq('***** announced a merger with ***** that will happen on on December 15th, 2020 for $200,000,000,000.')
169
169
  end
170
170
  end
171
171
 
@@ -173,7 +173,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
173
173
  it 'redacts all confidential information from a text #001' do
174
174
  tokens = ['Coca-Cola', 'Pepsi']
175
175
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
176
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on on <redacted date> for <redacted number>.')
176
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on on <redacted date> for <redacted number>.')
177
177
  end
178
178
 
179
179
  it 'redacts all confidential information from a text #002' do
@@ -234,37 +234,37 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
234
234
 
235
235
  Don’t forget to use your imagination and creativity!
236
236
  EOF
237
- tokens = ConfidentialInfoRedactorLite::Extractor.new(text: text, corpus: corpus).extract
238
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq(" <redacted>\n\n <redacted> is hosting the <redacted number> <redacted>. So get out your putter and your camera and see if you have what it takes. Are you a <redacted>?\n\n <redacted>: <redacted number>) <redacted> of <redacted number> professional miniature golfers, each from a different country. (<redacted number> points; <redacted number> bonus points if the professional miniature golfers are also from <redacted number> different continents) <redacted number>) <redacted> of yourself next to each obstacle in our list of the Top <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points for each obstacle that exactly matches the one pictured in the article) <redacted number>) <redacted> your own full-size miniature golf hole. (<redacted number> points; up to <redacted number> bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) <redacted number>) <redacted> of yourself making a hole-in-one on two consecutive miniature golf holes. <redacted> video must be one continuous shot with no editing. (<redacted number> points) <redacted number>) <redacted> of yourself with the <redacted> mascot. (<redacted number> points; <redacted number> bonus points if you are wearing a <redacted> t-shirt) <redacted number>) <redacted> of yourself with the completed <redacted> wobblehead. (<redacted number> points; <redacted number> bonus points if the picture is taken at a miniature golf course) <redacted number>) <redacted> of a completed scorecard from a round of miniature golf. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> of completed scorecards from <redacted number> different miniature golf courses. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> an entry to the <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points if your entry gets more than <redacted number> votes) <redacted number>) <redacted> from the <redacted> app showing a 9-hole score below par. (<redacted number> points) <redacted number>) <redacted> from the <redacted> app showing that you have successfully unlocked all of the holes in the game. (<redacted number> points) <redacted number>) <redacted> of the <redacted> wobblehead at a <redacted>. (<redacted number> points) <redacted number>) <redacted> and submit the <redacted> ‘Practice <redacted>’ and ‘Final <redacted>’ for any one of the <redacted> math or physics lessons. (<redacted number> points; <redacted number> bonus points if you complete two lessons) <redacted number>) <redacted> of yourself with at least <redacted number> different colored miniature golf balls. (<redacted number> points; <redacted number> bonus points for each additional color {limit of <redacted number> bonus points}) <redacted number>) <redacted> of yourself with a famous golfer or miniature golfer. (<redacted number> points; <redacted number> bonus points if the golfer is on the <redacted> tour <redacted> you are wearing a <redacted> t-shirt in the picture) <redacted number>) <redacted> of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. (<redacted number> points) <redacted number>) <redacted> of yourself successfully making a trick miniature golf shot. (<redacted number> points; up to <redacted number> bonus points available depending on the difficulty and complexity of the trick shot)\n\n\n Prizes: <redacted number> <redacted> <redacted>\n\n <redacted>\n (<redacted number> <redacted number> <redacted> - <redacted>)\n\n <redacted> team will judge the scavenger hunt and all decisions will be final. <redacted> is sponsoring it. <redacted> scavenger hunt is open to anyone and everyone. <redacted> scavenger hunt ends on <redacted date>.\n\n <redacted> enter the scavenger hunt, send an email to info <redacted> putterking <redacted> com with the subject line: \"<redacted>\". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as <redacted>, <redacted>, <redacted>, <redacted>, etc. for your submissions.\n\n <redacted> entering the <redacted>, you allow <redacted> to use or link to any of the pictures or videos you submit for advertisements and promotions.\n\n Don’t forget to use your imagination and creativity!\n")
237
+ tokens = ConfidentialInfoRedactorLite::Extractor.new(corpus: corpus).extract(text)
238
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq(" <redacted>\n\n <redacted> is hosting the <redacted number> <redacted>. So get out your putter and your camera and see if you have what it takes. Are you a <redacted>?\n\n <redacted>: <redacted number>) <redacted> of <redacted number> professional miniature golfers, each from a different country. (<redacted number> points; <redacted number> bonus points if the professional miniature golfers are also from <redacted number> different continents) <redacted number>) <redacted> of yourself next to each obstacle in our list of the Top <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points for each obstacle that exactly matches the one pictured in the article) <redacted number>) <redacted> your own full-size miniature golf hole. (<redacted number> points; up to <redacted number> bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) <redacted number>) <redacted> of yourself making a hole-in-one on two consecutive miniature golf holes. <redacted> video must be one continuous shot with no editing. (<redacted number> points) <redacted number>) <redacted> of yourself with the <redacted> mascot. (<redacted number> points; <redacted number> bonus points if you are wearing a <redacted> t-shirt) <redacted number>) <redacted> of yourself with the completed <redacted> wobblehead. (<redacted number> points; <redacted number> bonus points if the picture is taken at a miniature golf course) <redacted number>) <redacted> of a completed scorecard from a round of miniature golf. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> of completed scorecards from <redacted number> different miniature golf courses. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> an entry to the <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points if your entry gets more than <redacted number> votes) <redacted number>) <redacted> from the <redacted> app showing a 9-hole score below par. (<redacted number> points) <redacted number>) <redacted> from the <redacted> app showing that you have successfully unlocked all of the holes in the game. (<redacted number> points) <redacted number>) <redacted> of the <redacted> wobblehead at a <redacted>. (<redacted number> points) <redacted number>) <redacted> and submit the <redacted> ‘Practice <redacted>’ and ‘Final <redacted>’ for any one of the <redacted> math or physics lessons. (<redacted number> points; <redacted number> bonus points if you complete two lessons) <redacted number>) <redacted> of yourself with at least <redacted number> different colored miniature golf balls. (<redacted number> points; <redacted number> bonus points for each additional color {limit of <redacted number> bonus points}) <redacted number>) <redacted> of yourself with a famous golfer or miniature golfer. (<redacted number> points; <redacted number> bonus points if the golfer is on the <redacted> tour <redacted> you are wearing a <redacted> t-shirt in the picture) <redacted number>) <redacted> of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. (<redacted number> points) <redacted number>) <redacted> of yourself successfully making a trick miniature golf shot. (<redacted number> points; up to <redacted number> bonus points available depending on the difficulty and complexity of the trick shot)\n\n\n Prizes: <redacted number> <redacted> <redacted>\n\n <redacted>\n (<redacted number> <redacted number> <redacted> - <redacted>)\n\n <redacted> team will judge the scavenger hunt and all decisions will be final. <redacted> is sponsoring it. <redacted> scavenger hunt is open to anyone and everyone. <redacted> scavenger hunt ends on <redacted date>.\n\n <redacted> enter the scavenger hunt, send an email to info <redacted> putterking <redacted> com with the subject line: \"<redacted>\". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as <redacted>, <redacted>, <redacted>, <redacted>, etc. for your submissions.\n\n <redacted> entering the <redacted>, you allow <redacted> to use or link to any of the pictures or videos you submit for advertisements and promotions.\n\n Don’t forget to use your imagination and creativity!\n")
239
239
  end
240
240
 
241
241
  it 'redacts all confidential information from a text #003' do
242
242
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
243
243
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
244
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
244
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
245
245
  end
246
246
 
247
247
  it 'redacts all confidential information from a text #004' do
248
248
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
249
249
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
250
- expect(described_class.new(text: text, language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
250
+ expect(described_class.new(language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
251
251
  end
252
252
 
253
253
  it 'redacts all confidential information from a text #005' do
254
254
  tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
255
255
  text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
256
- expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
256
+ expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
257
257
  end
258
258
 
259
259
  it 'redacts all confidential information from a text #006' do
260
260
  tokens = ['Trans']
261
261
  text = 'My Transformation - avoid Trans.'
262
- expect(described_class.new(text: text, language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact).to eq('My Transformation - avoid *****.')
262
+ expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('My Transformation - avoid *****.')
263
263
  end
264
264
 
265
265
  it 'redacts all confidential information from a text #007' do
266
266
  text = 'これはjohn@gmail.comかk.light@tuv.eu.usかhttps://www.tm-town.comです.'
267
- expect(described_class.new(text: text, language: 'ja', tokens: nil, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks_html[1]).to eq(["https://www.tm-town.com"])
267
+ expect(described_class.new(language: 'ja', tokens: nil, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks_html(text)[1]).to eq(["https://www.tm-town.com"])
268
268
  end
269
269
  end
270
270
 
@@ -272,25 +272,25 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
272
272
  it 'redacts all confidential information from a text #001' do
273
273
  tokens = ['Coca-Cola', 'Pepsi']
274
274
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
275
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
275
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html(text)).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
276
276
  end
277
277
 
278
278
  it 'redacts all confidential information from a text #002' do
279
279
  tokens = ['Coca-Cola', 'Pepsi']
280
280
  text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
281
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
281
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact(text)).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
282
282
  end
283
283
 
284
284
  it 'redacts all confidential information from a text #003' do
285
285
  tokens = ['CLA']
286
286
  text = 'LEGAL DISCLAIMER - CLA will not be held reponsible for changes.'
287
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact).to eq("LEGAL DISCLAIMER - ***** will not be held reponsible for changes.")
287
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact(text)).to eq("LEGAL DISCLAIMER - ***** will not be held reponsible for changes.")
288
288
  end
289
289
 
290
290
  it 'redacts all confidential information from a text #004' do
291
291
  tokens = []
292
292
  text = '1984 was a good year.'
293
- expect(described_class.new(text: text, language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact_html).to eq("<span class='confidentialNumber'>**number**</span> was a good year.")
293
+ expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact_html(text)).to eq("<span class='confidentialNumber'>**number**</span> was a good year.")
294
294
  end
295
295
  end
296
296
  end