confidential_info_redactor_lite 0.0.34 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/date.rb +89 -115
- data/lib/confidential_info_redactor_lite/extractor.rb +44 -27
- data/lib/confidential_info_redactor_lite/hyperlink.rb +3 -11
- data/lib/confidential_info_redactor_lite/redactor.rb +15 -16
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/date_spec.rb +184 -184
- data/spec/confidential_info_redactor_lite/extractor_spec.rb +29 -24
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +4 -4
- data/spec/confidential_info_redactor_lite/performance_spec.rb +16 -10
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +41 -41
- metadata +2 -2
@@ -6,27 +6,27 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
6
6
|
context 'English (en)' do
|
7
7
|
it 'extracts the proper nouns from a text #001' do
|
8
8
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.'
|
9
|
-
expect(described_class.new(
|
9
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi'])
|
10
10
|
end
|
11
11
|
|
12
12
|
it 'extracts the proper nouns from a text #002' do
|
13
13
|
text = 'Coca-Cola announced a merger with Pepsi.'
|
14
|
-
expect(described_class.new(
|
14
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi'])
|
15
15
|
end
|
16
16
|
|
17
17
|
it 'extracts the proper nouns from a text #003' do
|
18
18
|
text = 'Many employees of Deutsche Bank are looking for another job.'
|
19
|
-
expect(described_class.new(
|
19
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank'])
|
20
20
|
end
|
21
21
|
|
22
22
|
it 'extracts the proper nouns from a text #004' do
|
23
23
|
text = 'Many employees of Deutsche Bank are looking for another job while those from Pepsi are not.'
|
24
|
-
expect(described_class.new(
|
24
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi'])
|
25
25
|
end
|
26
26
|
|
27
27
|
it 'extracts the proper nouns from a text #005' do
|
28
28
|
text = 'There are many employees at Deutsche Bank. Some are thinking about drinking Pepsi, Coke, or Sprite.'
|
29
|
-
expect(described_class.new(
|
29
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi', 'Coke', 'Sprite'])
|
30
30
|
end
|
31
31
|
|
32
32
|
it 'extracts the proper nouns from a text #006' do
|
@@ -87,99 +87,104 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
87
87
|
|
88
88
|
Don’t forget to use your imagination and creativity!
|
89
89
|
EOF
|
90
|
-
expect(described_class.new(
|
90
|
+
expect(described_class.new(corpus: corpus).extract(text)).to eq(["PGA", "iTunes", "YouTube", "Flickr", "Picasa", "Photobucket"])
|
91
91
|
end
|
92
92
|
|
93
93
|
it 'extracts the proper nouns from a text #007' do
|
94
94
|
text = 'I learned that Apple has plans to release a new iPhone, iPad and iWatch.'
|
95
|
-
expect(described_class.new(
|
95
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Apple', 'iPhone', 'iPad', 'iWatch'])
|
96
96
|
end
|
97
97
|
|
98
98
|
it 'extracts the proper nouns from a text #008' do
|
99
99
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
100
|
-
expect(described_class.new(
|
100
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Coca-Cola", "Pepsi", "John Smith"])
|
101
101
|
end
|
102
102
|
|
103
103
|
it 'extracts the proper nouns from a text #009' do
|
104
104
|
text = 'Then Peter went to the store.'
|
105
|
-
expect(described_class.new(
|
105
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Peter"])
|
106
106
|
end
|
107
107
|
|
108
108
|
it 'extracts the proper nouns from a text #010' do
|
109
109
|
text = 'HOW TO COOK VEGETABLES'
|
110
|
-
expect(described_class.new(
|
110
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
111
111
|
end
|
112
112
|
|
113
113
|
it 'extracts the proper nouns from a text #011' do
|
114
114
|
text = 'All Natural Peanut Butter'
|
115
|
-
expect(described_class.new(
|
115
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
116
116
|
end
|
117
117
|
|
118
118
|
it 'extracts the proper nouns from a text #012' do
|
119
119
|
text = 'GOOD CARBS VS. BAD CARBS'
|
120
|
-
expect(described_class.new(
|
120
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
121
121
|
end
|
122
122
|
|
123
123
|
it 'extracts the proper nouns from a text #013' do
|
124
124
|
text = 'Reducing”'
|
125
|
-
expect(described_class.new(
|
125
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
126
126
|
end
|
127
127
|
|
128
128
|
it 'extracts the proper nouns from a text #014' do
|
129
129
|
text = '”'
|
130
|
-
expect(described_class.new(
|
130
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
131
131
|
end
|
132
132
|
|
133
133
|
it 'extracts the proper nouns from a text #015' do
|
134
134
|
text = '“Reducing'
|
135
|
-
expect(described_class.new(
|
135
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
136
136
|
end
|
137
137
|
|
138
138
|
it 'extracts the proper nouns from a text #016' do
|
139
139
|
text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
|
140
|
-
expect(described_class.new(
|
140
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
|
141
141
|
end
|
142
142
|
|
143
143
|
it 'extracts the proper nouns from a text #017' do
|
144
144
|
text = 'John'
|
145
|
-
expect(described_class.new(
|
145
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['John'])
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'extracts the proper nouns from a text #018' do
|
149
|
+
text = 'John and Jane Doe'
|
150
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["John", "Jane Doe"])
|
146
151
|
end
|
147
152
|
end
|
148
153
|
|
149
154
|
context 'German (de)' do
|
150
155
|
it 'extracts the proper nouns from a text #001' do
|
151
156
|
text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
|
152
|
-
expect(described_class.new(
|
157
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutschen Bank'])
|
153
158
|
end
|
154
159
|
|
155
160
|
it 'extracts the proper nouns from a text #002' do
|
156
161
|
text = 'Viele Mitarbeiter der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
157
|
-
expect(described_class.new(
|
162
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
158
163
|
end
|
159
164
|
|
160
165
|
it 'extracts the proper nouns from a text #003' do
|
161
166
|
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
162
|
-
expect(described_class.new(
|
167
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
163
168
|
end
|
164
169
|
|
165
170
|
it 'extracts the proper nouns from a text #004' do
|
166
171
|
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
167
|
-
expect(described_class.new(
|
172
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
168
173
|
end
|
169
174
|
|
170
175
|
it 'extracts the proper nouns from a text #005' do
|
171
176
|
text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
|
172
|
-
expect(described_class.new(
|
177
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
173
178
|
end
|
174
179
|
|
175
180
|
it 'extracts the proper nouns from a text #006' do
|
176
181
|
text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
|
177
|
-
expect(described_class.new(
|
182
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
178
183
|
end
|
179
184
|
|
180
185
|
it 'extracts the proper nouns from a text #007' do
|
181
186
|
text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
|
182
|
-
expect(described_class.new(
|
187
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
183
188
|
end
|
184
189
|
end
|
185
190
|
end
|
@@ -4,14 +4,14 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
|
|
4
4
|
context '#replace' do
|
5
5
|
it 'replaces the hyperlinks in a string with regular tokens #001' do
|
6
6
|
string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
|
7
|
-
ws = described_class.new
|
8
|
-
expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
|
7
|
+
ws = described_class.new
|
8
|
+
expect(ws.replace(string)).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'replaces the hyperlinks in a string with regular tokens #002' do
|
12
12
|
string = 'The file location is c:\Users\johndoe or d:\Users\john\www'
|
13
|
-
ws = described_class.new
|
14
|
-
expect(ws.replace).to eq('The file location is c:\Users\johndoe or d:\Users\john\www')
|
13
|
+
ws = described_class.new
|
14
|
+
expect(ws.replace(string)).to eq('The file location is c:\Users\johndoe or d:\Users\john\www')
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
@@ -12,29 +12,35 @@ describe ConfidentialInfoRedactorLite do
|
|
12
12
|
en_months = %w(january february march april may june july august september october november december)
|
13
13
|
en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
14
14
|
benchmark do
|
15
|
-
extraction = ConfidentialInfoRedactorLite::Extractor.new(
|
16
|
-
expect(extraction).to eq(["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"])
|
15
|
+
extraction = ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS).extract(text)
|
17
16
|
end
|
18
17
|
benchmark do
|
19
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
18
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
|
20
19
|
end
|
21
20
|
benchmark do
|
22
|
-
|
21
|
+
text2 = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
22
|
+
ci = ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS)
|
23
|
+
100.times do
|
24
|
+
ci.extract(text2)
|
25
|
+
end
|
23
26
|
end
|
24
27
|
benchmark do
|
25
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
28
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)
|
26
29
|
end
|
27
30
|
benchmark do
|
28
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
31
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)
|
29
32
|
end
|
30
33
|
benchmark do
|
31
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
34
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)
|
32
35
|
end
|
33
36
|
benchmark do
|
34
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
37
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)
|
38
|
+
end
|
39
|
+
benchmark do
|
40
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
35
41
|
end
|
36
42
|
# data = StackProf.run(mode: :cpu, interval: 1000) do
|
37
|
-
# ConfidentialInfoRedactorLite::Extractor.new(
|
43
|
+
# ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS).extract(text)
|
38
44
|
# end
|
39
45
|
# puts StackProf::Report.new(data).print_text
|
40
46
|
end
|
@@ -46,7 +52,7 @@ describe ConfidentialInfoRedactorLite do
|
|
46
52
|
en_months = %w(january february march april may june july august september october november december)
|
47
53
|
en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
48
54
|
benchmark do
|
49
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
55
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
|
50
56
|
end
|
51
57
|
end
|
52
58
|
end
|
@@ -9,149 +9,149 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
9
9
|
|
10
10
|
describe '#dates' do
|
11
11
|
it 'handles nil as a text argument' do
|
12
|
-
expect(described_class.new(
|
12
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(nil)).to eq('')
|
13
13
|
end
|
14
14
|
|
15
15
|
it 'redacts dates from a text #001' do
|
16
16
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.'
|
17
|
-
expect(described_class.new(
|
17
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.')
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'redacts dates from a text #002' do
|
21
21
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
|
22
|
-
expect(described_class.new(
|
22
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date>.')
|
23
23
|
end
|
24
24
|
|
25
25
|
it 'redacts dates from a text #003' do
|
26
26
|
text = 'December 5, 2010 - Coca-Cola announced a merger with Pepsi.'
|
27
|
-
expect(described_class.new(
|
27
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('<redacted date> - Coca-Cola announced a merger with Pepsi.')
|
28
28
|
end
|
29
29
|
|
30
30
|
it 'redacts dates from a text #004' do
|
31
31
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
32
|
-
expect(described_class.new(
|
32
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('The scavenger hunt ends on <redacted date>.')
|
33
33
|
end
|
34
34
|
|
35
35
|
it 'handles nil date objects' do
|
36
36
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
37
|
-
expect(described_class.new(
|
37
|
+
expect(described_class.new(language: 'en', dow: nil, dow_abbr: nil, months: nil, months_abbr: nil).dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
|
38
38
|
end
|
39
39
|
|
40
40
|
it 'handles empty string date objects' do
|
41
41
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
42
|
-
expect(described_class.new(
|
42
|
+
expect(described_class.new(language: 'en', dow: '', dow_abbr: '', months: '', months_abbr: '').dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
|
43
43
|
end
|
44
44
|
|
45
45
|
it 'handles empty array date objects' do
|
46
46
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
47
|
-
expect(described_class.new(
|
47
|
+
expect(described_class.new(language: 'en', dow: [], dow_abbr: [], months: [], months_abbr: []).dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
51
|
describe '#dates_html' do
|
52
52
|
it 'handles nil as a text argument' do
|
53
|
-
expect(described_class.new(
|
53
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(nil)).to eq([])
|
54
54
|
end
|
55
55
|
|
56
56
|
it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
|
57
57
|
text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
|
58
|
-
expect(described_class.new(
|
58
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(text)).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
|
59
59
|
end
|
60
60
|
|
61
61
|
it 'surrounds the redacted dates in spans and return the redacted dates from a text #002' do
|
62
62
|
text = '2011年12月31日です。'
|
63
|
-
expect(described_class.new(
|
63
|
+
expect(described_class.new(language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(text)).to eq(["<span class='confidentialDate'>*****</span> です。", ["2011年12月31日"]])
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
67
|
describe '#numbers' do
|
68
68
|
it 'handles nil as a text argument' do
|
69
|
-
expect(described_class.new(
|
69
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(nil)).to eq('')
|
70
70
|
end
|
71
71
|
|
72
72
|
it 'redacts numbers from a text #001' do
|
73
73
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
|
74
|
-
expect(described_class.new(
|
74
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for <redacted number>.')
|
75
75
|
end
|
76
76
|
|
77
77
|
it 'redacts numbers from a text #002' do
|
78
78
|
text = '200 years ago.'
|
79
|
-
expect(described_class.new(
|
79
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('<redacted number> years ago.')
|
80
80
|
end
|
81
81
|
|
82
82
|
it 'redacts numbers from a text #003' do
|
83
83
|
text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick.'
|
84
|
-
expect(described_class.new(
|
84
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('It was his <redacted number> time, not yet his <redacted number>, not even his <redacted number>. The wood was <redacted number> thick.')
|
85
85
|
end
|
86
86
|
|
87
87
|
it 'redacts numbers from a text #004' do
|
88
88
|
text = 'Checking file of %2'
|
89
|
-
expect(described_class.new(
|
89
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('Checking file of <redacted number>')
|
90
90
|
end
|
91
91
|
|
92
92
|
it 'redacts numbers from a text #005' do
|
93
93
|
text = 'zawiera pliki skompresowane (%2).'
|
94
|
-
expect(described_class.new(
|
94
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('zawiera pliki skompresowane (<redacted number>).')
|
95
95
|
end
|
96
96
|
|
97
97
|
it 'redacts numbers from a text #006' do
|
98
98
|
text = '2134か24か0'
|
99
|
-
expect(described_class.new(
|
99
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq("<redacted number> か <redacted number> か <redacted number>")
|
100
100
|
end
|
101
101
|
|
102
102
|
it 'redacts numbers from a text #007' do
|
103
103
|
text = '100'
|
104
|
-
expect(described_class.new(
|
104
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('<redacted number>')
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
108
108
|
describe '#numbers_html' do
|
109
109
|
it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
|
110
110
|
text = 'It was his 1st) time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
|
111
|
-
expect(described_class.new(
|
111
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html(text)).to eq(["It was his <span class='confidentialNumber'>*****</span>) time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
|
112
112
|
end
|
113
113
|
|
114
114
|
it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #002' do
|
115
115
|
text = 'プロのミニチュアゴルファー2人のサイン。2人の出身国は別であること。(45ポイント;それぞれが別の大陸出身だった場合、5ボーナスポイント。)'
|
116
|
-
expect(described_class.new(
|
116
|
+
expect(described_class.new(language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html(text)).to eq(["プロのミニチュアゴルファー <span class='confidentialNumber'>*****</span> 人のサイン。 <span class='confidentialNumber'>*****</span> 人の出身国は別であること。( <span class='confidentialNumber'>*****</span> ポイント;それぞれが別の大陸出身だった場合、 <span class='confidentialNumber'>*****</span> ボーナスポイント。)", ["2", "2", "45", "5"]])
|
117
117
|
end
|
118
118
|
end
|
119
119
|
|
120
120
|
describe '#emails' do
|
121
121
|
it 'redacts email addresses from a text #001' do
|
122
122
|
text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
|
123
|
-
expect(described_class.new(
|
123
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)).to eq('His email is <redacted email> or you can try <redacted email>.')
|
124
124
|
end
|
125
125
|
|
126
126
|
it 'redacts email addresses from a text #002' do
|
127
127
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
128
|
-
expect(described_class.new(
|
128
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
|
129
129
|
end
|
130
130
|
end
|
131
131
|
|
132
132
|
describe '#emails_html' do
|
133
133
|
it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
|
134
134
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
135
|
-
expect(described_class.new(
|
135
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html(text)).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
|
136
136
|
end
|
137
137
|
end
|
138
138
|
|
139
139
|
describe '#hyperlinks' do
|
140
140
|
it 'redacts hyperlinks from a text #001' do
|
141
141
|
text = 'Visit https://www.tm-town.com for more info.'
|
142
|
-
expect(described_class.new(
|
142
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)).to eq('Visit <redacted hyperlink> for more info.')
|
143
143
|
end
|
144
144
|
|
145
145
|
it 'redacts hyperlinks from a text #002' do
|
146
146
|
text = 'Visit www.tm-town.com for more info.'
|
147
|
-
expect(described_class.new(
|
147
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)).to eq('Visit <redacted hyperlink> for more info.')
|
148
148
|
end
|
149
149
|
end
|
150
150
|
|
151
151
|
describe '#hyperlinks_html' do
|
152
152
|
it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
|
153
153
|
text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
|
154
|
-
expect(described_class.new(
|
154
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html(text)).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
155
155
|
end
|
156
156
|
end
|
157
157
|
|
@@ -159,13 +159,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
159
159
|
it 'redacts tokens from a text #001' do
|
160
160
|
tokens = ['Coca-Cola', 'Pepsi']
|
161
161
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
|
162
|
-
expect(described_class.new(
|
162
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)).to eq('<redacted> announced a merger with <redacted> that will happen on on December 15th, 2020 for $200,000,000,000.')
|
163
163
|
end
|
164
164
|
|
165
165
|
it 'redacts tokens from a text #002' do
|
166
166
|
tokens = ['Coca-Cola', 'Pepsi']
|
167
167
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
|
168
|
-
expect(described_class.new(
|
168
|
+
expect(described_class.new(language: 'en', tokens: tokens, token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)).to eq('***** announced a merger with ***** that will happen on on December 15th, 2020 for $200,000,000,000.')
|
169
169
|
end
|
170
170
|
end
|
171
171
|
|
@@ -173,7 +173,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
173
173
|
it 'redacts all confidential information from a text #001' do
|
174
174
|
tokens = ['Coca-Cola', 'Pepsi']
|
175
175
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
|
176
|
-
expect(described_class.new(
|
176
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on on <redacted date> for <redacted number>.')
|
177
177
|
end
|
178
178
|
|
179
179
|
it 'redacts all confidential information from a text #002' do
|
@@ -234,37 +234,37 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
234
234
|
|
235
235
|
Don’t forget to use your imagination and creativity!
|
236
236
|
EOF
|
237
|
-
tokens = ConfidentialInfoRedactorLite::Extractor.new(
|
238
|
-
expect(described_class.new(
|
237
|
+
tokens = ConfidentialInfoRedactorLite::Extractor.new(corpus: corpus).extract(text)
|
238
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq(" <redacted>\n\n <redacted> is hosting the <redacted number> <redacted>. So get out your putter and your camera and see if you have what it takes. Are you a <redacted>?\n\n <redacted>: <redacted number>) <redacted> of <redacted number> professional miniature golfers, each from a different country. (<redacted number> points; <redacted number> bonus points if the professional miniature golfers are also from <redacted number> different continents) <redacted number>) <redacted> of yourself next to each obstacle in our list of the Top <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points for each obstacle that exactly matches the one pictured in the article) <redacted number>) <redacted> your own full-size miniature golf hole. (<redacted number> points; up to <redacted number> bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) <redacted number>) <redacted> of yourself making a hole-in-one on two consecutive miniature golf holes. <redacted> video must be one continuous shot with no editing. (<redacted number> points) <redacted number>) <redacted> of yourself with the <redacted> mascot. (<redacted number> points; <redacted number> bonus points if you are wearing a <redacted> t-shirt) <redacted number>) <redacted> of yourself with the completed <redacted> wobblehead. (<redacted number> points; <redacted number> bonus points if the picture is taken at a miniature golf course) <redacted number>) <redacted> of a completed scorecard from a round of miniature golf. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> of completed scorecards from <redacted number> different miniature golf courses. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> an entry to the <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points if your entry gets more than <redacted number> votes) <redacted number>) <redacted> from the <redacted> app showing a 9-hole score below par. (<redacted number> points) <redacted number>) <redacted> from the <redacted> app showing that you have successfully unlocked all of the holes in the game. (<redacted number> points) <redacted number>) <redacted> of the <redacted> wobblehead at a <redacted>. (<redacted number> points) <redacted number>) <redacted> and submit the <redacted> ‘Practice <redacted>’ and ‘Final <redacted>’ for any one of the <redacted> math or physics lessons. (<redacted number> points; <redacted number> bonus points if you complete two lessons) <redacted number>) <redacted> of yourself with at least <redacted number> different colored miniature golf balls. (<redacted number> points; <redacted number> bonus points for each additional color {limit of <redacted number> bonus points}) <redacted number>) <redacted> of yourself with a famous golfer or miniature golfer. (<redacted number> points; <redacted number> bonus points if the golfer is on the <redacted> tour <redacted> you are wearing a <redacted> t-shirt in the picture) <redacted number>) <redacted> of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. (<redacted number> points) <redacted number>) <redacted> of yourself successfully making a trick miniature golf shot. (<redacted number> points; up to <redacted number> bonus points available depending on the difficulty and complexity of the trick shot)\n\n\n Prizes: <redacted number> <redacted> <redacted>\n\n <redacted>\n (<redacted number> <redacted number> <redacted> - <redacted>)\n\n <redacted> team will judge the scavenger hunt and all decisions will be final. <redacted> is sponsoring it. <redacted> scavenger hunt is open to anyone and everyone. <redacted> scavenger hunt ends on <redacted date>.\n\n <redacted> enter the scavenger hunt, send an email to info <redacted> putterking <redacted> com with the subject line: \"<redacted>\". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as <redacted>, <redacted>, <redacted>, <redacted>, etc. for your submissions.\n\n <redacted> entering the <redacted>, you allow <redacted> to use or link to any of the pictures or videos you submit for advertisements and promotions.\n\n Don’t forget to use your imagination and creativity!\n")
|
239
239
|
end
|
240
240
|
|
241
241
|
it 'redacts all confidential information from a text #003' do
|
242
242
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
243
243
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
244
|
-
expect(described_class.new(
|
244
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
245
245
|
end
|
246
246
|
|
247
247
|
it 'redacts all confidential information from a text #004' do
|
248
248
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
249
249
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
250
|
-
expect(described_class.new(
|
250
|
+
expect(described_class.new(language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
251
251
|
end
|
252
252
|
|
253
253
|
it 'redacts all confidential information from a text #005' do
|
254
254
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
255
255
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
256
|
-
expect(described_class.new(
|
256
|
+
expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
257
257
|
end
|
258
258
|
|
259
259
|
it 'redacts all confidential information from a text #006' do
|
260
260
|
tokens = ['Trans']
|
261
261
|
text = 'My Transformation - avoid Trans.'
|
262
|
-
expect(described_class.new(
|
262
|
+
expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('My Transformation - avoid *****.')
|
263
263
|
end
|
264
264
|
|
265
265
|
it 'redacts all confidential information from a text #007' do
|
266
266
|
text = 'これはjohn@gmail.comかk.light@tuv.eu.usかhttps://www.tm-town.comです.'
|
267
|
-
expect(described_class.new(
|
267
|
+
expect(described_class.new(language: 'ja', tokens: nil, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks_html(text)[1]).to eq(["https://www.tm-town.com"])
|
268
268
|
end
|
269
269
|
end
|
270
270
|
|
@@ -272,25 +272,25 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
272
272
|
it 'redacts all confidential information from a text #001' do
|
273
273
|
tokens = ['Coca-Cola', 'Pepsi']
|
274
274
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
275
|
-
expect(described_class.new(
|
275
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html(text)).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
276
276
|
end
|
277
277
|
|
278
278
|
it 'redacts all confidential information from a text #002' do
|
279
279
|
tokens = ['Coca-Cola', 'Pepsi']
|
280
280
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
281
|
-
expect(described_class.new(
|
281
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact(text)).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
|
282
282
|
end
|
283
283
|
|
284
284
|
it 'redacts all confidential information from a text #003' do
|
285
285
|
tokens = ['CLA']
|
286
286
|
text = 'LEGAL DISCLAIMER - CLA will not be held reponsible for changes.'
|
287
|
-
expect(described_class.new(
|
287
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact(text)).to eq("LEGAL DISCLAIMER - ***** will not be held reponsible for changes.")
|
288
288
|
end
|
289
289
|
|
290
290
|
it 'redacts all confidential information from a text #004' do
|
291
291
|
tokens = []
|
292
292
|
text = '1984 was a good year.'
|
293
|
-
expect(described_class.new(
|
293
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact_html(text)).to eq("<span class='confidentialNumber'>**number**</span> was a good year.")
|
294
294
|
end
|
295
295
|
end
|
296
296
|
end
|