confidential_info_redactor_lite 0.0.34 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/confidential_info_redactor_lite/date.rb +89 -115
- data/lib/confidential_info_redactor_lite/extractor.rb +44 -27
- data/lib/confidential_info_redactor_lite/hyperlink.rb +3 -11
- data/lib/confidential_info_redactor_lite/redactor.rb +15 -16
- data/lib/confidential_info_redactor_lite/version.rb +1 -1
- data/spec/confidential_info_redactor_lite/date_spec.rb +184 -184
- data/spec/confidential_info_redactor_lite/extractor_spec.rb +29 -24
- data/spec/confidential_info_redactor_lite/hyperlink_spec.rb +4 -4
- data/spec/confidential_info_redactor_lite/performance_spec.rb +16 -10
- data/spec/confidential_info_redactor_lite/redactor_spec.rb +41 -41
- metadata +2 -2
@@ -6,27 +6,27 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
6
6
|
context 'English (en)' do
|
7
7
|
it 'extracts the proper nouns from a text #001' do
|
8
8
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.'
|
9
|
-
expect(described_class.new(
|
9
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi'])
|
10
10
|
end
|
11
11
|
|
12
12
|
it 'extracts the proper nouns from a text #002' do
|
13
13
|
text = 'Coca-Cola announced a merger with Pepsi.'
|
14
|
-
expect(described_class.new(
|
14
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi'])
|
15
15
|
end
|
16
16
|
|
17
17
|
it 'extracts the proper nouns from a text #003' do
|
18
18
|
text = 'Many employees of Deutsche Bank are looking for another job.'
|
19
|
-
expect(described_class.new(
|
19
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank'])
|
20
20
|
end
|
21
21
|
|
22
22
|
it 'extracts the proper nouns from a text #004' do
|
23
23
|
text = 'Many employees of Deutsche Bank are looking for another job while those from Pepsi are not.'
|
24
|
-
expect(described_class.new(
|
24
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi'])
|
25
25
|
end
|
26
26
|
|
27
27
|
it 'extracts the proper nouns from a text #005' do
|
28
28
|
text = 'There are many employees at Deutsche Bank. Some are thinking about drinking Pepsi, Coke, or Sprite.'
|
29
|
-
expect(described_class.new(
|
29
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi', 'Coke', 'Sprite'])
|
30
30
|
end
|
31
31
|
|
32
32
|
it 'extracts the proper nouns from a text #006' do
|
@@ -87,99 +87,104 @@ RSpec.describe ConfidentialInfoRedactorLite::Extractor do
|
|
87
87
|
|
88
88
|
Don’t forget to use your imagination and creativity!
|
89
89
|
EOF
|
90
|
-
expect(described_class.new(
|
90
|
+
expect(described_class.new(corpus: corpus).extract(text)).to eq(["PGA", "iTunes", "YouTube", "Flickr", "Picasa", "Photobucket"])
|
91
91
|
end
|
92
92
|
|
93
93
|
it 'extracts the proper nouns from a text #007' do
|
94
94
|
text = 'I learned that Apple has plans to release a new iPhone, iPad and iWatch.'
|
95
|
-
expect(described_class.new(
|
95
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['Apple', 'iPhone', 'iPad', 'iWatch'])
|
96
96
|
end
|
97
97
|
|
98
98
|
it 'extracts the proper nouns from a text #008' do
|
99
99
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
100
|
-
expect(described_class.new(
|
100
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Coca-Cola", "Pepsi", "John Smith"])
|
101
101
|
end
|
102
102
|
|
103
103
|
it 'extracts the proper nouns from a text #009' do
|
104
104
|
text = 'Then Peter went to the store.'
|
105
|
-
expect(described_class.new(
|
105
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Peter"])
|
106
106
|
end
|
107
107
|
|
108
108
|
it 'extracts the proper nouns from a text #010' do
|
109
109
|
text = 'HOW TO COOK VEGETABLES'
|
110
|
-
expect(described_class.new(
|
110
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
111
111
|
end
|
112
112
|
|
113
113
|
it 'extracts the proper nouns from a text #011' do
|
114
114
|
text = 'All Natural Peanut Butter'
|
115
|
-
expect(described_class.new(
|
115
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
116
116
|
end
|
117
117
|
|
118
118
|
it 'extracts the proper nouns from a text #012' do
|
119
119
|
text = 'GOOD CARBS VS. BAD CARBS'
|
120
|
-
expect(described_class.new(
|
120
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
121
121
|
end
|
122
122
|
|
123
123
|
it 'extracts the proper nouns from a text #013' do
|
124
124
|
text = 'Reducing”'
|
125
|
-
expect(described_class.new(
|
125
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
126
126
|
end
|
127
127
|
|
128
128
|
it 'extracts the proper nouns from a text #014' do
|
129
129
|
text = '”'
|
130
|
-
expect(described_class.new(
|
130
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
131
131
|
end
|
132
132
|
|
133
133
|
it 'extracts the proper nouns from a text #015' do
|
134
134
|
text = '“Reducing'
|
135
|
-
expect(described_class.new(
|
135
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq([])
|
136
136
|
end
|
137
137
|
|
138
138
|
it 'extracts the proper nouns from a text #016' do
|
139
139
|
text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management'
|
140
|
-
expect(described_class.new(
|
140
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["Corrigendum", "Council Regulation", "No", "January", "Regulation", "European Regional Development Fund", "European Social Fund", "Cohesion Fund"])
|
141
141
|
end
|
142
142
|
|
143
143
|
it 'extracts the proper nouns from a text #017' do
|
144
144
|
text = 'John'
|
145
|
-
expect(described_class.new(
|
145
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(['John'])
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'extracts the proper nouns from a text #018' do
|
149
|
+
text = 'John and Jane Doe'
|
150
|
+
expect(described_class.new(corpus: corpus, language: 'en').extract(text)).to eq(["John", "Jane Doe"])
|
146
151
|
end
|
147
152
|
end
|
148
153
|
|
149
154
|
context 'German (de)' do
|
150
155
|
it 'extracts the proper nouns from a text #001' do
|
151
156
|
text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.'
|
152
|
-
expect(described_class.new(
|
157
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutschen Bank'])
|
153
158
|
end
|
154
159
|
|
155
160
|
it 'extracts the proper nouns from a text #002' do
|
156
161
|
text = 'Viele Mitarbeiter der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
157
|
-
expect(described_class.new(
|
162
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
158
163
|
end
|
159
164
|
|
160
165
|
it 'extracts the proper nouns from a text #003' do
|
161
166
|
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
162
|
-
expect(described_class.new(
|
167
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
163
168
|
end
|
164
169
|
|
165
170
|
it 'extracts the proper nouns from a text #004' do
|
166
171
|
text = 'Viele de Mitarbeiters der Deutsche Bank suchen eine andere Arbeitsstelle.'
|
167
|
-
expect(described_class.new(
|
172
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
168
173
|
end
|
169
174
|
|
170
175
|
it 'extracts the proper nouns from a text #005' do
|
171
176
|
text = 'Viele de Mitarbeiters der «Deutsche Bank» suchen eine andere Arbeitsstelle.'
|
172
|
-
expect(described_class.new(
|
177
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
173
178
|
end
|
174
179
|
|
175
180
|
it 'extracts the proper nouns from a text #006' do
|
176
181
|
text = 'Viele de Mitarbeiters der ‹Deutsche Bank› suchen eine andere Arbeitsstelle.'
|
177
|
-
expect(described_class.new(
|
182
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
178
183
|
end
|
179
184
|
|
180
185
|
it 'extracts the proper nouns from a text #007' do
|
181
186
|
text = 'Viele de Mitarbeiters der “Deutsche Bank” suchen eine andere Arbeitsstelle.'
|
182
|
-
expect(described_class.new(
|
187
|
+
expect(described_class.new(corpus: corpus, language: 'de').extract(text)).to eq(['Deutsche Bank'])
|
183
188
|
end
|
184
189
|
end
|
185
190
|
end
|
@@ -4,14 +4,14 @@ RSpec.describe ConfidentialInfoRedactorLite::Hyperlink do
|
|
4
4
|
context '#replace' do
|
5
5
|
it 'replaces the hyperlinks in a string with regular tokens #001' do
|
6
6
|
string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
|
7
|
-
ws = described_class.new
|
8
|
-
expect(ws.replace).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
|
7
|
+
ws = described_class.new
|
8
|
+
expect(ws.replace(string)).to eq("Today the date is: Jan 1. Visit <redacted hyperlink> or <redacted hyperlink> ")
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'replaces the hyperlinks in a string with regular tokens #002' do
|
12
12
|
string = 'The file location is c:\Users\johndoe or d:\Users\john\www'
|
13
|
-
ws = described_class.new
|
14
|
-
expect(ws.replace).to eq('The file location is c:\Users\johndoe or d:\Users\john\www')
|
13
|
+
ws = described_class.new
|
14
|
+
expect(ws.replace(string)).to eq('The file location is c:\Users\johndoe or d:\Users\john\www')
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
@@ -12,29 +12,35 @@ describe ConfidentialInfoRedactorLite do
|
|
12
12
|
en_months = %w(january february march april may june july august september october november december)
|
13
13
|
en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
14
14
|
benchmark do
|
15
|
-
extraction = ConfidentialInfoRedactorLite::Extractor.new(
|
16
|
-
expect(extraction).to eq(["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"])
|
15
|
+
extraction = ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS).extract(text)
|
17
16
|
end
|
18
17
|
benchmark do
|
19
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
18
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
|
20
19
|
end
|
21
20
|
benchmark do
|
22
|
-
|
21
|
+
text2 = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
22
|
+
ci = ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS)
|
23
|
+
100.times do
|
24
|
+
ci.extract(text2)
|
25
|
+
end
|
23
26
|
end
|
24
27
|
benchmark do
|
25
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
28
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)
|
26
29
|
end
|
27
30
|
benchmark do
|
28
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
31
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)
|
29
32
|
end
|
30
33
|
benchmark do
|
31
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
34
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)
|
32
35
|
end
|
33
36
|
benchmark do
|
34
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
37
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)
|
38
|
+
end
|
39
|
+
benchmark do
|
40
|
+
ConfidentialInfoRedactorLite::Redactor.new(tokens: ["Rabbit-Hole", "Alice", "Firnever", "She", "s", "ALICE", "ESQ", "HEARTHRUG", "m", "It", "Rabdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdeepdand", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliwoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlame AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlptAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlerAliAliAliAliAliAl", "Lof", "Iof", "He", "ve", "Cd", "SHE", "We", "Lory", "Ma", "Dinah", "d", "Mary Ann", "ll", "Blitnurse", "Onlys. Thernurse", "Iss", "st", "B", "se", "Bira", "Billte", "Sas", "Solong-", "Bilfu", "BHo", "Bi", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLaste", "LastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLastLalLastLastLastLastLastll Luced", "They", "Ithing", "Why", "Wha Wha", "Wha", "Sa Wha Wha", "Whe Wha Wha", "Whdo Wha Wha", "Wha Whshe", "L Wha Wha", "Wha Whaht", "Fish-Footman", "Cheshire", "Iwhich", "Marcch", "Iwhichwaswhich", "Marse", "Marc", "March--just", "HE", "Dormouse", "Sh", "Hare", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTinto", "IThis", "THEN--she", "VIII", "s Croquet-Ground", "t", "Que", "ThSo", "Rab", "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCseCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCarCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", "AliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlperAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliA DAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlithoAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAliAonAliAliAliAliAliAliAliAliAliAlifouAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAlouAlimiAliAliAliAliAliAliAliAliAliAliAliAliAliAli-suAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliAliA", "DAliAliAliAliAlremnding", "S", "Turtle", "WASHIN", "FurseFrench", "FurseFren", "FurseI", "ThFrench", "WAoth", "iFrench", "YouFrench", "ANE", "FurseFrencouFrench", "FurseFr", "FurseFrenff", "England", "France--", "Rabbit", "OLDtheir", "Turtheir", "How", "H", "W", "Hve", "Hr", "Hrf", "H--e", "XI", "Q", "Rabb", "Qher", "WhaThe Kingrts", "Whdddddddddddddddddddddddddddddddddddistdddddddddddddddddddddddto", "M", "it", "e", "Tcccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcccccccnocccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghccclicccccccccccccccccccccccccmacccccccccccccccccccccccccmaccccccccghcco", "Wonderland", "Wderivative", "Gutenberg-tm", "Gutenberg", "Prand", "Gand", "D", "E", "Gphrase", "Projecarsphrase", "Proje", "Projes", "Projecno", "Projecarsphrthphrase", "Projecarsphrliphrase", "Projecag-phrase", "Projeontaiphrase", "Projcopphrase", "Projecarsphephrase", "Projrediphrasetinphrase", "Projerg", "Projecwith", "Projecarsphrr", "Projecarsphh", "ASCII", "Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Li Lexpneed", "Mississippi", "Service", "s EIN", "Foutiat", "Fioat", "Foidat", "Fouincat", "Fou", "Foby", "Foreat", "U.S", "PG"], ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)
|
35
41
|
end
|
36
42
|
# data = StackProf.run(mode: :cpu, interval: 1000) do
|
37
|
-
# ConfidentialInfoRedactorLite::Extractor.new(
|
43
|
+
# ConfidentialInfoRedactorLite::Extractor.new(corpus: CORPUS).extract(text)
|
38
44
|
# end
|
39
45
|
# puts StackProf::Report.new(data).print_text
|
40
46
|
end
|
@@ -46,7 +52,7 @@ describe ConfidentialInfoRedactorLite do
|
|
46
52
|
en_months = %w(january february march april may june july august september october november december)
|
47
53
|
en_month_abbr = %w(jan feb mar apr jun jul aug sep sept oct nov dec)
|
48
54
|
benchmark do
|
49
|
-
ConfidentialInfoRedactorLite::Redactor.new(
|
55
|
+
ConfidentialInfoRedactorLite::Redactor.new(dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)
|
50
56
|
end
|
51
57
|
end
|
52
58
|
end
|
@@ -9,149 +9,149 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
9
9
|
|
10
10
|
describe '#dates' do
|
11
11
|
it 'handles nil as a text argument' do
|
12
|
-
expect(described_class.new(
|
12
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(nil)).to eq('')
|
13
13
|
end
|
14
14
|
|
15
15
|
it 'redacts dates from a text #001' do
|
16
16
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.'
|
17
|
-
expect(described_class.new(
|
17
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.')
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'redacts dates from a text #002' do
|
21
21
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
|
22
|
-
expect(described_class.new(
|
22
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date>.')
|
23
23
|
end
|
24
24
|
|
25
25
|
it 'redacts dates from a text #003' do
|
26
26
|
text = 'December 5, 2010 - Coca-Cola announced a merger with Pepsi.'
|
27
|
-
expect(described_class.new(
|
27
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('<redacted date> - Coca-Cola announced a merger with Pepsi.')
|
28
28
|
end
|
29
29
|
|
30
30
|
it 'redacts dates from a text #004' do
|
31
31
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
32
|
-
expect(described_class.new(
|
32
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).dates(text)).to eq('The scavenger hunt ends on <redacted date>.')
|
33
33
|
end
|
34
34
|
|
35
35
|
it 'handles nil date objects' do
|
36
36
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
37
|
-
expect(described_class.new(
|
37
|
+
expect(described_class.new(language: 'en', dow: nil, dow_abbr: nil, months: nil, months_abbr: nil).dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
|
38
38
|
end
|
39
39
|
|
40
40
|
it 'handles empty string date objects' do
|
41
41
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
42
|
-
expect(described_class.new(
|
42
|
+
expect(described_class.new(language: 'en', dow: '', dow_abbr: '', months: '', months_abbr: '').dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
|
43
43
|
end
|
44
44
|
|
45
45
|
it 'handles empty array date objects' do
|
46
46
|
text = 'The scavenger hunt ends on Dec. 31st, 2011.'
|
47
|
-
expect(described_class.new(
|
47
|
+
expect(described_class.new(language: 'en', dow: [], dow_abbr: [], months: [], months_abbr: []).dates(text)).to eq('The scavenger hunt ends on Dec. 31st, 2011.')
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
51
|
describe '#dates_html' do
|
52
52
|
it 'handles nil as a text argument' do
|
53
|
-
expect(described_class.new(
|
53
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(nil)).to eq([])
|
54
54
|
end
|
55
55
|
|
56
56
|
it 'surrounds the redacted dates in spans and return the redacted dates from a text #001' do
|
57
57
|
text = 'On May 1st, 2000 Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.'
|
58
|
-
expect(described_class.new(
|
58
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(text)).to eq(["On <span class='confidentialDate'>*****</span> Coca-Cola announced a merger with Pepsi that will happen on <span class='confidentialDate'>*****</span>.", ['May 1st, 2000', 'December 15th, 2020']])
|
59
59
|
end
|
60
60
|
|
61
61
|
it 'surrounds the redacted dates in spans and return the redacted dates from a text #002' do
|
62
62
|
text = '2011年12月31日です。'
|
63
|
-
expect(described_class.new(
|
63
|
+
expect(described_class.new(language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, date_text: "*****").dates_html(text)).to eq(["<span class='confidentialDate'>*****</span> です。", ["2011年12月31日"]])
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
67
|
describe '#numbers' do
|
68
68
|
it 'handles nil as a text argument' do
|
69
|
-
expect(described_class.new(
|
69
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(nil)).to eq('')
|
70
70
|
end
|
71
71
|
|
72
72
|
it 'redacts numbers from a text #001' do
|
73
73
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for $200,000,000,000.'
|
74
|
-
expect(described_class.new(
|
74
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on <redacted date> for <redacted number>.')
|
75
75
|
end
|
76
76
|
|
77
77
|
it 'redacts numbers from a text #002' do
|
78
78
|
text = '200 years ago.'
|
79
|
-
expect(described_class.new(
|
79
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('<redacted number> years ago.')
|
80
80
|
end
|
81
81
|
|
82
82
|
it 'redacts numbers from a text #003' do
|
83
83
|
text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick.'
|
84
|
-
expect(described_class.new(
|
84
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('It was his <redacted number> time, not yet his <redacted number>, not even his <redacted number>. The wood was <redacted number> thick.')
|
85
85
|
end
|
86
86
|
|
87
87
|
it 'redacts numbers from a text #004' do
|
88
88
|
text = 'Checking file of %2'
|
89
|
-
expect(described_class.new(
|
89
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('Checking file of <redacted number>')
|
90
90
|
end
|
91
91
|
|
92
92
|
it 'redacts numbers from a text #005' do
|
93
93
|
text = 'zawiera pliki skompresowane (%2).'
|
94
|
-
expect(described_class.new(
|
94
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('zawiera pliki skompresowane (<redacted number>).')
|
95
95
|
end
|
96
96
|
|
97
97
|
it 'redacts numbers from a text #006' do
|
98
98
|
text = '2134か24か0'
|
99
|
-
expect(described_class.new(
|
99
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq("<redacted number> か <redacted number> か <redacted number>")
|
100
100
|
end
|
101
101
|
|
102
102
|
it 'redacts numbers from a text #007' do
|
103
103
|
text = '100'
|
104
|
-
expect(described_class.new(
|
104
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).numbers(text)).to eq('<redacted number>')
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
108
108
|
describe '#numbers_html' do
|
109
109
|
it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #001' do
|
110
110
|
text = 'It was his 1st) time, not yet his 10th, not even his 2nd. The wood was 3/4" thick. It cost $200,000.'
|
111
|
-
expect(described_class.new(
|
111
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html(text)).to eq(["It was his <span class='confidentialNumber'>*****</span>) time, not yet his <span class='confidentialNumber'>*****</span>, not even his <span class='confidentialNumber'>*****</span>. The wood was <span class='confidentialNumber'>*****</span> thick. It cost <span class='confidentialNumber'>*****</span>.", ["1st", "10th,", "2nd", "3/4\"", "$200,000"]])
|
112
112
|
end
|
113
113
|
|
114
114
|
it 'surrounds the redacted numbers in spans and return the redacted numbers from a text #002' do
|
115
115
|
text = 'プロのミニチュアゴルファー2人のサイン。2人の出身国は別であること。(45ポイント;それぞれが別の大陸出身だった場合、5ボーナスポイント。)'
|
116
|
-
expect(described_class.new(
|
116
|
+
expect(described_class.new(language: 'ja', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: "*****").numbers_html(text)).to eq(["プロのミニチュアゴルファー <span class='confidentialNumber'>*****</span> 人のサイン。 <span class='confidentialNumber'>*****</span> 人の出身国は別であること。( <span class='confidentialNumber'>*****</span> ポイント;それぞれが別の大陸出身だった場合、 <span class='confidentialNumber'>*****</span> ボーナスポイント。)", ["2", "2", "45", "5"]])
|
117
117
|
end
|
118
118
|
end
|
119
119
|
|
120
120
|
describe '#emails' do
|
121
121
|
it 'redacts email addresses from a text #001' do
|
122
122
|
text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.'
|
123
|
-
expect(described_class.new(
|
123
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)).to eq('His email is <redacted email> or you can try <redacted email>.')
|
124
124
|
end
|
125
125
|
|
126
126
|
it 'redacts email addresses from a text #002' do
|
127
127
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
128
|
-
expect(described_class.new(
|
128
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).emails(text)).to eq('His email is (<redacted email>) or you can try (<redacted email>).')
|
129
129
|
end
|
130
130
|
end
|
131
131
|
|
132
132
|
describe '#emails_html' do
|
133
133
|
it 'surrounds the redacted emails in spans and return the redacted emails from a text #001' do
|
134
134
|
text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).'
|
135
|
-
expect(described_class.new(
|
135
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****").emails_html(text)).to eq(["His email is (<span class='confidentialEmail'><redacted email></span>) or you can try (<span class='confidentialEmail'><redacted email></span>).", ["john@gmail.com", "k.light@tuv.eu.us"]])
|
136
136
|
end
|
137
137
|
end
|
138
138
|
|
139
139
|
describe '#hyperlinks' do
|
140
140
|
it 'redacts hyperlinks from a text #001' do
|
141
141
|
text = 'Visit https://www.tm-town.com for more info.'
|
142
|
-
expect(described_class.new(
|
142
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)).to eq('Visit <redacted hyperlink> for more info.')
|
143
143
|
end
|
144
144
|
|
145
145
|
it 'redacts hyperlinks from a text #002' do
|
146
146
|
text = 'Visit www.tm-town.com for more info.'
|
147
|
-
expect(described_class.new(
|
147
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks(text)).to eq('Visit <redacted hyperlink> for more info.')
|
148
148
|
end
|
149
149
|
end
|
150
150
|
|
151
151
|
describe '#hyperlinks_html' do
|
152
152
|
it 'surrounds the redacted hyperlinks in spans and return the redacted hyperlinks from a text #001' do
|
153
153
|
text = 'Visit https://www.tm-town.com for more info or https://www.google.com.'
|
154
|
-
expect(described_class.new(
|
154
|
+
expect(described_class.new(language: 'en', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, token_text: "*****", hyperlink_text: "*****", email_text: "*****").hyperlinks_html(text)).to eq(["Visit <span class='confidentialHyperlinks'>*****</span> for more info or <span class='confidentialHyperlinks'>*****</span>.", ["https://www.tm-town.com", "https://www.google.com"]])
|
155
155
|
end
|
156
156
|
end
|
157
157
|
|
@@ -159,13 +159,13 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
159
159
|
it 'redacts tokens from a text #001' do
|
160
160
|
tokens = ['Coca-Cola', 'Pepsi']
|
161
161
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
|
162
|
-
expect(described_class.new(
|
162
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)).to eq('<redacted> announced a merger with <redacted> that will happen on on December 15th, 2020 for $200,000,000,000.')
|
163
163
|
end
|
164
164
|
|
165
165
|
it 'redacts tokens from a text #002' do
|
166
166
|
tokens = ['Coca-Cola', 'Pepsi']
|
167
167
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
|
168
|
-
expect(described_class.new(
|
168
|
+
expect(described_class.new(language: 'en', tokens: tokens, token_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).proper_nouns(text)).to eq('***** announced a merger with ***** that will happen on on December 15th, 2020 for $200,000,000,000.')
|
169
169
|
end
|
170
170
|
end
|
171
171
|
|
@@ -173,7 +173,7 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
173
173
|
it 'redacts all confidential information from a text #001' do
|
174
174
|
tokens = ['Coca-Cola', 'Pepsi']
|
175
175
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.'
|
176
|
-
expect(described_class.new(
|
176
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on on <redacted date> for <redacted number>.')
|
177
177
|
end
|
178
178
|
|
179
179
|
it 'redacts all confidential information from a text #002' do
|
@@ -234,37 +234,37 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
234
234
|
|
235
235
|
Don’t forget to use your imagination and creativity!
|
236
236
|
EOF
|
237
|
-
tokens = ConfidentialInfoRedactorLite::Extractor.new(
|
238
|
-
expect(described_class.new(
|
237
|
+
tokens = ConfidentialInfoRedactorLite::Extractor.new(corpus: corpus).extract(text)
|
238
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq(" <redacted>\n\n <redacted> is hosting the <redacted number> <redacted>. So get out your putter and your camera and see if you have what it takes. Are you a <redacted>?\n\n <redacted>: <redacted number>) <redacted> of <redacted number> professional miniature golfers, each from a different country. (<redacted number> points; <redacted number> bonus points if the professional miniature golfers are also from <redacted number> different continents) <redacted number>) <redacted> of yourself next to each obstacle in our list of the Top <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points for each obstacle that exactly matches the one pictured in the article) <redacted number>) <redacted> your own full-size miniature golf hole. (<redacted number> points; up to <redacted number> bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) <redacted number>) <redacted> of yourself making a hole-in-one on two consecutive miniature golf holes. <redacted> video must be one continuous shot with no editing. (<redacted number> points) <redacted number>) <redacted> of yourself with the <redacted> mascot. (<redacted number> points; <redacted number> bonus points if you are wearing a <redacted> t-shirt) <redacted number>) <redacted> of yourself with the completed <redacted> wobblehead. (<redacted number> points; <redacted number> bonus points if the picture is taken at a miniature golf course) <redacted number>) <redacted> of a completed scorecard from a round of miniature golf. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> of completed scorecards from <redacted number> different miniature golf courses. <redacted> round of golf must have taken place after the start of this scavenger hunt. (<redacted number> points) <redacted number>) <redacted> an entry to the <redacted number> <redacted>. (<redacted number> points; <redacted number> bonus points if your entry gets more than <redacted number> votes) <redacted number>) <redacted> from the <redacted> app showing a 9-hole score below par. (<redacted number> points) <redacted number>) <redacted> from the <redacted> app showing that you have successfully unlocked all of the holes in the game. (<redacted number> points) <redacted number>) <redacted> of the <redacted> wobblehead at a <redacted>. (<redacted number> points) <redacted number>) <redacted> and submit the <redacted> ‘Practice <redacted>’ and ‘Final <redacted>’ for any one of the <redacted> math or physics lessons. (<redacted number> points; <redacted number> bonus points if you complete two lessons) <redacted number>) <redacted> of yourself with at least <redacted number> different colored miniature golf balls. (<redacted number> points; <redacted number> bonus points for each additional color {limit of <redacted number> bonus points}) <redacted number>) <redacted> of yourself with a famous golfer or miniature golfer. (<redacted number> points; <redacted number> bonus points if the golfer is on the <redacted> tour <redacted> you are wearing a <redacted> t-shirt in the picture) <redacted number>) <redacted> of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. (<redacted number> points) <redacted number>) <redacted> of yourself successfully making a trick miniature golf shot. (<redacted number> points; up to <redacted number> bonus points available depending on the difficulty and complexity of the trick shot)\n\n\n Prizes: <redacted number> <redacted> <redacted>\n\n <redacted>\n (<redacted number> <redacted number> <redacted> - <redacted>)\n\n <redacted> team will judge the scavenger hunt and all decisions will be final. <redacted> is sponsoring it. <redacted> scavenger hunt is open to anyone and everyone. <redacted> scavenger hunt ends on <redacted date>.\n\n <redacted> enter the scavenger hunt, send an email to info <redacted> putterking <redacted> com with the subject line: \"<redacted>\". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as <redacted>, <redacted>, <redacted>, <redacted>, etc. for your submissions.\n\n <redacted> entering the <redacted>, you allow <redacted> to use or link to any of the pictures or videos you submit for advertisements and promotions.\n\n Don’t forget to use your imagination and creativity!\n")
|
239
239
|
end
|
240
240
|
|
241
241
|
it 'redacts all confidential information from a text #003' do
|
242
242
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
243
243
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
244
|
-
expect(described_class.new(
|
244
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for <redacted number>. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
245
245
|
end
|
246
246
|
|
247
247
|
it 'redacts all confidential information from a text #004' do
|
248
248
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
249
249
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
250
|
-
expect(described_class.new(
|
250
|
+
expect(described_class.new(language: 'en', tokens: tokens, ignore_numbers: true, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('<redacted> announced a merger with <redacted> that will happen on <redacted date> for $200,000,000,000. Please contact <redacted> at <redacted email> or visit <redacted hyperlink>.')
|
251
251
|
end
|
252
252
|
|
253
253
|
it 'redacts all confidential information from a text #005' do
|
254
254
|
tokens = ['Coca-Cola', 'Pepsi', 'John Smith']
|
255
255
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.'
|
256
|
-
expect(described_class.new(
|
256
|
+
expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.')
|
257
257
|
end
|
258
258
|
|
259
259
|
it 'redacts all confidential information from a text #006' do
|
260
260
|
tokens = ['Trans']
|
261
261
|
text = 'My Transformation - avoid Trans.'
|
262
|
-
expect(described_class.new(
|
262
|
+
expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).redact(text)).to eq('My Transformation - avoid *****.')
|
263
263
|
end
|
264
264
|
|
265
265
|
it 'redacts all confidential information from a text #007' do
|
266
266
|
text = 'これはjohn@gmail.comかk.light@tuv.eu.usかhttps://www.tm-town.comです.'
|
267
|
-
expect(described_class.new(
|
267
|
+
expect(described_class.new(language: 'ja', tokens: nil, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****', dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr).hyperlinks_html(text)[1]).to eq(["https://www.tm-town.com"])
|
268
268
|
end
|
269
269
|
end
|
270
270
|
|
@@ -272,25 +272,25 @@ RSpec.describe ConfidentialInfoRedactorLite::Redactor do
|
|
272
272
|
it 'redacts all confidential information from a text #001' do
|
273
273
|
tokens = ['Coca-Cola', 'Pepsi']
|
274
274
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
275
|
-
expect(described_class.new(
|
275
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, number_text: '*****', hyperlink_text: '*****', email_text: '*****', date_text: '*****', token_text: '*****').redact_html(text)).to eq("Coca-Cola announced a merger with Pepsi that will happen on on <span class='confidentialDate'>*****</span> for <span class='confidentialNumber'>*****</span>. Find out more at <span class='confidentialHyperlinks'>*****</span> or contact <span class='confidentialEmail'>*****</span>.")
|
276
276
|
end
|
277
277
|
|
278
278
|
it 'redacts all confidential information from a text #002' do
|
279
279
|
tokens = ['Coca-Cola', 'Pepsi']
|
280
280
|
text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000. Find out more at https://www.merger.com or contact john@merger.com.'
|
281
|
-
expect(described_class.new(
|
281
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact(text)).to eq("***** announced a merger with ***** that will happen on on **date** for **number**. Find out more at **url** or contact **email**.")
|
282
282
|
end
|
283
283
|
|
284
284
|
it 'redacts all confidential information from a text #003' do
|
285
285
|
tokens = ['CLA']
|
286
286
|
text = 'LEGAL DISCLAIMER - CLA will not be held reponsible for changes.'
|
287
|
-
expect(described_class.new(
|
287
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact(text)).to eq("LEGAL DISCLAIMER - ***** will not be held reponsible for changes.")
|
288
288
|
end
|
289
289
|
|
290
290
|
it 'redacts all confidential information from a text #004' do
|
291
291
|
tokens = []
|
292
292
|
text = '1984 was a good year.'
|
293
|
-
expect(described_class.new(
|
293
|
+
expect(described_class.new(language: 'en', tokens: tokens, dow: en_dow, dow_abbr: en_dow_abbr, months: en_months, months_abbr: en_month_abbr, email_text: '**email**', number_text: '**number**', date_text: '**date**', hyperlink_text: '**url**', token_text: '*****').redact_html(text)).to eq("<span class='confidentialNumber'>**number**</span> was a good year.")
|
294
294
|
end
|
295
295
|
end
|
296
296
|
end
|