top_secret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +402 -0
- data/Rakefile +10 -0
- data/lib/top_secret/constants.rb +25 -0
- data/lib/top_secret/error.rb +5 -0
- data/lib/top_secret/filters/ner.rb +37 -0
- data/lib/top_secret/filters/regex.rb +32 -0
- data/lib/top_secret/result.rb +24 -0
- data/lib/top_secret/text.rb +89 -0
- data/lib/top_secret/version.rb +5 -0
- data/lib/top_secret.rb +44 -0
- data/sig/top_secret.rbs +4 -0
- metadata +92 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d570b8ecd9cb5ab35f59dc688a6a13a749cebf1abfcbce36906f12d1d8452189
|
4
|
+
data.tar.gz: 8b5d639506a7ecd6bbb548e245db54384a2d16c0b47bbd564230c4140639c05d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fef5120cc93ac1772270816788ee9bd3a7141779f300a839ecb7d8d35d228c7539e33bb55cac1ff97d975b35251d46913020879d985dad75655de385a47b31ca
|
7
|
+
data.tar.gz: 5acc9bb4f77210d0ae804ee50472ec75952f12812892c6d304feb4209ee152a5d3a3740fe938d17dd47cb664945d90dc73ddfa1578b2c4f1400a1ef23d8e93bb
|
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) Steve Polito and thoughtbot, inc.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,402 @@
|
|
1
|
+
# Top Secret
|
2
|
+
|
3
|
+
Filter sensitive information from free text before sending it to external services or APIs, such as Chatbots.
|
4
|
+
|
5
|
+
By default it filters the following:
|
6
|
+
|
7
|
+
- Credit cards
|
8
|
+
- Emails
|
9
|
+
- Phone numbers
|
10
|
+
- Social security numbers
|
11
|
+
- People's names
|
12
|
+
- Locations
|
13
|
+
|
14
|
+
However, you can add your own [custom filters](#custom-filters).
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
Install the gem and add to the application's Gemfile by executing:
|
19
|
+
|
20
|
+
```bash
|
21
|
+
bundle add top_secret
|
22
|
+
```
|
23
|
+
|
24
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
25
|
+
|
26
|
+
```bash
|
27
|
+
gem install top_secret
|
28
|
+
```
|
29
|
+
|
30
|
+
> [!IMPORTANT]
|
31
|
+
> Top Secret depends on [MITIE Ruby][], which depends on [MITIE][].
|
32
|
+
>
|
33
|
+
> You'll need to download and extract [ner_model.dat][] first.
|
34
|
+
|
35
|
+
By default, Top Secret assumes the file will live at the root of your project, but this can be configured.
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
TopSecret.configure do |config|
|
39
|
+
config.model_path = "path/to/ner_model.dat"
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
## Default Filters
|
44
|
+
|
45
|
+
Top Secret ships with a set of filters to detect and redact the most common types of sensitive information.
|
46
|
+
|
47
|
+
You can [override](#overriding-the-default-filters-1), [disable](#disabling-a-default-filter-1), or [add](#adding-new-default-filters) to this list as needed.
|
48
|
+
|
49
|
+
By default, the following filters are enabled:
|
50
|
+
|
51
|
+
**`credit_card_filter`**
|
52
|
+
|
53
|
+
Matches common credit card formats
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
result = TopSecret::Text.filter("My card number is 4242-4242-4242-4242")
|
57
|
+
result.output
|
58
|
+
|
59
|
+
# => "My card number is [CREDIT_CARD_1]"
|
60
|
+
```
|
61
|
+
|
62
|
+
**`email_filter`**
|
63
|
+
|
64
|
+
Matches email addresses
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
result = TopSecret::Text.filter("Email me at ralph@thoughtbot.com")
|
68
|
+
result.output
|
69
|
+
|
70
|
+
# => "Email me at [EMAIL_1]"
|
71
|
+
```
|
72
|
+
|
73
|
+
**`phone_number_filter`**
|
74
|
+
|
75
|
+
Matches phone numbers
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
result = TopSecret::Text.filter("Call me at 555-555-5555")
|
79
|
+
result.output
|
80
|
+
|
81
|
+
# => "Call me at [PHONE_NUMBER_1]"
|
82
|
+
```
|
83
|
+
|
84
|
+
**`ssn_filter`**
|
85
|
+
|
86
|
+
Matches U.S. Social Security numbers
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
result = TopSecret::Text.filter("My SSN is 123-45-6789")
|
90
|
+
result.output
|
91
|
+
|
92
|
+
# => "My SSN is [SSN_1]"
|
93
|
+
```
|
94
|
+
|
95
|
+
**`people_filter`**
|
96
|
+
|
97
|
+
Detects names of people (NER-based)
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
result = TopSecret::Text.filter("Ralph is joining the meeting")
|
101
|
+
result.output
|
102
|
+
|
103
|
+
# => "[PERSON_1] is joining the meeting"
|
104
|
+
```
|
105
|
+
|
106
|
+
**`location_filter`**
|
107
|
+
|
108
|
+
Detects location names (NER-based)
|
109
|
+
|
110
|
+
```ruby
|
111
|
+
result = TopSecret::Text.filter("Let's meet in Boston")
|
112
|
+
result.output
|
113
|
+
|
114
|
+
# => "Let's meet in [LOCATION_1]"
|
115
|
+
```
|
116
|
+
|
117
|
+
## Usage
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
TopSecret::Text.filter("Ralph can be reached at ralph@thoughtbot.com")
|
121
|
+
```
|
122
|
+
|
123
|
+
This will return
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
<TopSecret::Result
|
127
|
+
@input="Ralph can be reached at ralph@thoughtbot.com",
|
128
|
+
@mapping={:EMAIL_1=>"ralph@thoughtbot.com", :PERSON_1=>"Ralph"},
|
129
|
+
@output="[PERSON_1] can be reached at [EMAIL_1]"
|
130
|
+
>
|
131
|
+
```
|
132
|
+
|
133
|
+
View the original text
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
result.input
|
137
|
+
|
138
|
+
# => "Ralph can be reached at ralph@thoughtbot.com"
|
139
|
+
```
|
140
|
+
|
141
|
+
View the filtered text
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
result.output
|
145
|
+
|
146
|
+
# => "[PERSON_1] can be reached at [EMAIL_1]"
|
147
|
+
```
|
148
|
+
|
149
|
+
View the mapping
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
result.mapping
|
153
|
+
|
154
|
+
# => {:EMAIL_1=>"ralph@thoughtbot.com", :PERSON_1=>"Ralph"}
|
155
|
+
```
|
156
|
+
|
157
|
+
### Advanced Examples
|
158
|
+
|
159
|
+
#### Overriding the default filters
|
160
|
+
|
161
|
+
When overriding or [disabling](#disabling-a-default-filter-1) a [default filter](#default-filters), you must map to the correct key.
|
162
|
+
|
163
|
+
```ruby
|
164
|
+
regex_filter = TopSecret::Filters::Regex.new(label: "EMAIL_ADDRESS", regex: /\b\w+\[at\]\w+\.\w+\b/)
|
165
|
+
ner_filter = TopSecret::Filters::NER.new(label: "NAME", tag: :person, min_confidence_score: 0.25)
|
166
|
+
|
167
|
+
TopSecret::Text.filter("Ralph can be reached at ralph[at]thoughtbot.com", filters: {
|
168
|
+
email_filter: regex_filter,
|
169
|
+
people_filter: ner_filter
|
170
|
+
})
|
171
|
+
```
|
172
|
+
|
173
|
+
This will return
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
<TopSecret::Result
|
177
|
+
@input="Ralph can be reached at ralph[at]thoughtbot.com",
|
178
|
+
@mapping={:EMAIL_ADDRESS_1=>"ralph[at]thoughtbot.com", :NAME_1=>"Ralph", :NAME_2=>"ralph["},
|
179
|
+
@output="[NAME_1] can be reached at [EMAIL_ADDRESS_1]"
|
180
|
+
>
|
181
|
+
```
|
182
|
+
|
183
|
+
#### Disabling a default filter
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
TopSecret::Text.filter("Ralph can be reached at ralph@thoughtbot.com", filters: {
|
187
|
+
email_filter: nil,
|
188
|
+
people_filter: nil
|
189
|
+
})
|
190
|
+
```
|
191
|
+
|
192
|
+
This will return
|
193
|
+
|
194
|
+
```ruby
|
195
|
+
<TopSecret::Result
|
196
|
+
@input="Ralph can be reached at ralph@thoughtbot.com",
|
197
|
+
@mapping={},
|
198
|
+
@output="Ralph can be reached at ralph@thoughtbot.com"
|
199
|
+
>
|
200
|
+
```
|
201
|
+
|
202
|
+
### Custom Filters
|
203
|
+
|
204
|
+
#### Adding new [Regex filters][]
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
ip_address_filter = TopSecret::Filters::Regex.new(
|
208
|
+
label: "IP_ADDRESS",
|
209
|
+
regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
|
210
|
+
)
|
211
|
+
|
212
|
+
TopSecret::Text.filter("Ralph's IP address is 192.168.1.1", filters: {
|
213
|
+
ip_address_filter: ip_address_filter
|
214
|
+
})
|
215
|
+
```
|
216
|
+
|
217
|
+
This will return
|
218
|
+
|
219
|
+
```ruby
|
220
|
+
<TopSecret::Result
|
221
|
+
@input="Ralph's IP address is 192.168.1.1",
|
222
|
+
@mapping={:PERSON_1=>"Ralph", :IP_ADDRESS_1=>"192.168.1.1"},
|
223
|
+
@output="[PERSON_1]'s IP address is [IP_ADDRESS_1]"
|
224
|
+
>
|
225
|
+
```
|
226
|
+
|
227
|
+
#### Adding new [NER filters][]
|
228
|
+
|
229
|
+
Since [MITIE Ruby][] has an API for [training][train] a model, you're free to add new NER filters.
|
230
|
+
|
231
|
+
```ruby
|
232
|
+
language_filter = TopSecret::Filters::NER.new(
|
233
|
+
label: "LANGUAGE",
|
234
|
+
tag: :language,
|
235
|
+
min_confidence_score: 0.75
|
236
|
+
)
|
237
|
+
|
238
|
+
TopSecret::Text.filter("Ralph's favorite programming language is Ruby.", filters: {
|
239
|
+
language_filter: language_filter
|
240
|
+
})
|
241
|
+
```
|
242
|
+
|
243
|
+
This will return
|
244
|
+
|
245
|
+
```ruby
|
246
|
+
<TopSecret::Result
|
247
|
+
@input="Ralph's favorite programming language is Ruby.",
|
248
|
+
@mapping={:PERSON_1=>"Ralph", :LANGUAGE_1=>"Ruby"},
|
249
|
+
@output="[PERSON_1]'s favorite programming language is [LANGUAGE_1]"
|
250
|
+
>
|
251
|
+
```
|
252
|
+
|
253
|
+
## How Filters Work
|
254
|
+
|
255
|
+
Top Secret uses two types of filters to detect and redact sensitive information:
|
256
|
+
|
257
|
+
### `TopSecret::Filters::Regex`
|
258
|
+
|
259
|
+
`Regex` filters use regular expressions to find patterns in text.
|
260
|
+
They are useful for structured data like credit card numbers, emails, or IP addresses.
|
261
|
+
|
262
|
+
```ruby
|
263
|
+
regex_filter = TopSecret::Filters::Regex.new(
|
264
|
+
label: "IP_ADDRESS",
|
265
|
+
regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
|
266
|
+
)
|
267
|
+
|
268
|
+
result = TopSecret::Text.filter("Server IP: 192.168.1.1", filters: {
|
269
|
+
ip_address_filter: regex_filter
|
270
|
+
})
|
271
|
+
|
272
|
+
result.output
|
273
|
+
# => "Server IP: [IP_ADDRESS_1]"
|
274
|
+
```
|
275
|
+
|
276
|
+
### `TopSecret::Filters::NER`
|
277
|
+
|
278
|
+
`NER` (Named Entity Recognition) filters use the [MITIE][] library to detect entities like people, locations, and other categories based on trained language models.
|
279
|
+
They are ideal for free-form text where patterns are less predictable.
|
280
|
+
|
281
|
+
```ruby
|
282
|
+
ner_filter = TopSecret::Filters::NER.new(
|
283
|
+
label: "PERSON",
|
284
|
+
tag: :person,
|
285
|
+
min_confidence_score: 0.25
|
286
|
+
)
|
287
|
+
|
288
|
+
result = TopSecret::Text.filter("Ralph and Ruby work at thoughtbot.", filters: {
|
289
|
+
people_filter: ner_filter
|
290
|
+
})
|
291
|
+
|
292
|
+
result.output
|
293
|
+
# => "[PERSON_1] and [PERSON_2] work at thoughtbot."
|
294
|
+
```
|
295
|
+
|
296
|
+
`NER` filters match based on the tag you specify (`:person`, `:location`, etc.) and only include matches with a confidence score above `min_confidence_score`.
|
297
|
+
|
298
|
+
#### Supported NER Tags
|
299
|
+
|
300
|
+
By default, Top Secret only ships with `NER` filters for two entity types:
|
301
|
+
|
302
|
+
- `:person`
|
303
|
+
- `:location`
|
304
|
+
|
305
|
+
If you need other tags, you can [train your own MITIE model][train] and add custom NER filters.
|
306
|
+
|
307
|
+
## Configuration
|
308
|
+
|
309
|
+
### Overriding the model path
|
310
|
+
|
311
|
+
```ruby
|
312
|
+
TopSecret.configure do |config|
|
313
|
+
config.model_path = "path/to/ner_model.dat"
|
314
|
+
end
|
315
|
+
```
|
316
|
+
|
317
|
+
### Overriding the confidence score
|
318
|
+
|
319
|
+
```ruby
|
320
|
+
TopSecret.configure do |config|
|
321
|
+
config.min_confidence_score = 0.75
|
322
|
+
end
|
323
|
+
```
|
324
|
+
|
325
|
+
### Overriding the default filters
|
326
|
+
|
327
|
+
```ruby
|
328
|
+
TopSecret.configure do |config|
|
329
|
+
config.default_filters.email_filter = TopSecret::Filters::Regex.new(
|
330
|
+
label: "EMAIL_ADDRESS",
|
331
|
+
regex: /\b\w+\[at\]\w+\.\w+\b/
|
332
|
+
)
|
333
|
+
end
|
334
|
+
```
|
335
|
+
|
336
|
+
### Disabling a default filter
|
337
|
+
|
338
|
+
```ruby
|
339
|
+
TopSecret.configure do |config|
|
340
|
+
config.default_filters.email_filter = nil
|
341
|
+
end
|
342
|
+
```
|
343
|
+
|
344
|
+
### Adding new default filters
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
TopSecret.configure do |config|
|
348
|
+
config.default_filters.ip_address_filter = TopSecret::Filters::Regex.new(
|
349
|
+
label: "IP_ADDRESS",
|
350
|
+
regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
|
351
|
+
)
|
352
|
+
end
|
353
|
+
```
|
354
|
+
|
355
|
+
## Development
|
356
|
+
|
357
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
358
|
+
|
359
|
+
> [!IMPORTANT]
|
360
|
+
> Top Secret depends on [MITIE Ruby][], which depends on [MITIE][].
|
361
|
+
>
|
362
|
+
> You'll need to download and extract [ner_model.dat][] first, and place it in the root of this project.
|
363
|
+
|
364
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
365
|
+
|
366
|
+
## Contributing
|
367
|
+
|
368
|
+
Bug reports and pull requests are welcome on GitHub at [https://github.com/thoughtbot/top_secret](https://github.com/thoughtbot/top_secret). This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/thoughtbot/top_secret/blob/main/CODE_OF_CONDUCT.md).
|
369
|
+
|
370
|
+
## License
|
371
|
+
|
372
|
+
Open source templates are Copyright (c) thoughtbot, inc.
|
373
|
+
It contains free software that may be redistributed under the terms specified in the [LICENSE](https://github.com/thoughtbot/top_secret/blob/main/LICENSE.txt) file.
|
374
|
+
|
375
|
+
## Code of Conduct
|
376
|
+
|
377
|
+
Everyone interacting in the TopSecret project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/thoughtbot/top_secret/blob/main/CODE_OF_CONDUCT.md).
|
378
|
+
|
379
|
+
<!-- START /templates/footer.md -->
|
380
|
+
|
381
|
+
## About thoughtbot
|
382
|
+
|
383
|
+

|
384
|
+
|
385
|
+
This repo is maintained and funded by thoughtbot, inc.
|
386
|
+
The names and logos for thoughtbot are trademarks of thoughtbot, inc.
|
387
|
+
|
388
|
+
We love open source software!
|
389
|
+
See [our other projects][community].
|
390
|
+
We are [available for hire][hire].
|
391
|
+
|
392
|
+
[community]: https://thoughtbot.com/community?utm_source=github
|
393
|
+
[hire]: https://thoughtbot.com/hire-us?utm_source=github
|
394
|
+
|
395
|
+
<!-- END /templates/footer.md -->
|
396
|
+
|
397
|
+
[MITIE Ruby]: https://github.com/ankane/mitie-ruby
|
398
|
+
[MITIE]: https://github.com/mit-nlp/MITIE
|
399
|
+
[ner_model.dat]: https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2
|
400
|
+
[train]: https://github.com/ankane/mitie-ruby?tab=readme-ov-file#training
|
401
|
+
[Regex filters]: https://github.com/thoughtbot/top_secret/blob/main/lib/top_secret/filters/regex.rb
|
402
|
+
[NER filters]: https://github.com/thoughtbot/top_secret/blob/main/lib/top_secret/filters/ner.rb
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
  # @return [Regexp] Matches 16-digit card numbers whose first digit is 3-6,
  #   written either contiguously or as four groups of four digits separated
  #   by a whitespace character, "+", or "-"
  CREDIT_CARD_REGEX = /
    \b[3456]\d{15}\b |
    \b[3456]\d{3}(?:[\s+-]\d{4}){3}\b
  /x

  # @return [Regexp] Matches email addresses (local part, "@", dot-separated
  #   domain labels of up to 63 characters each)
  EMAIL_REGEX = %r{
    [a-zA-Z0-9.!\#$%&'*+/=?^_`{|}~-]+@
    [a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
    (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
  }x

  # @return [Regexp] Matches phone numbers with optional country code
  #
  # The leading anchor is `(?:\b|(?<!\w))` rather than a bare `\b`: a word
  # boundary never exists between whitespace (or the start of the string) and
  # "(" or "+", so `\b` alone dropped the opening parenthesis of
  # "(555) 555-5555" and the whole "+1 " prefix. Keeping `\b` as an
  # alternative means everything the previous pattern matched still matches.
  PHONE_REGEX = /(?:\b|(?<!\w))(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s+.-]\d{3}[\s+.-]\d{4}\b/

  # @return [Regexp] Matches Social Security Numbers written as 3-2-4 digit
  #   groups separated by a whitespace character, "+", or "-"
  SSN_REGEX = /\b\d{3}[\s+-]\d{2}[\s+-]\d{4}\b/

  # @return [Float] The minimum confidence score for NER filtering
  MIN_CONFIDENCE_SCORE = 0.5
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
module Filters
|
5
|
+
# Applies Named Entity Recognition (NER) filtering based on tag and confidence score.
|
6
|
+
class NER
  # @return [String] The label applied to matching entities
  attr_reader :label

  # @param label [String] The label for redacted entities
  # @param tag [Symbol, String] The NER tag to match (e.g., :person, :location);
  #   it is upcased and stringified before being compared with entity tags
  # @param min_confidence_score [Float, nil] Minimum score required for a match.
  #   When nil, TopSecret.min_confidence_score is consulted each time the
  #   filter runs, so configuration changes made after this filter was built
  #   are still honored.
  def initialize(label:, tag:, min_confidence_score: nil)
    @label = label
    @tag = tag.upcase.to_s
    @min_confidence_score = min_confidence_score
  end

  # Filters and extracts entity texts matching the tag and score threshold.
  #
  # @param entities [Array<Hash>] List of entity hashes with keys :tag, :score, and :text
  # @return [Array<String>] Matched entity texts
  # @raise [KeyError] If an entity lacks :tag or :score, or a matching entity lacks :text
  def call(entities)
    # Resolve the threshold once per call rather than once per entity.
    threshold = min_confidence_score

    entities.filter_map do |entity|
      entity.fetch(:text) if entity.fetch(:tag) == tag && entity.fetch(:score) >= threshold
    end
  end

  private

  # @return [String] The expected tag (uppercased)
  attr_reader :tag

  # @return [Float] The explicit threshold given at construction, or the
  #   current TopSecret.min_confidence_score when none was given
  def min_confidence_score
    @min_confidence_score || TopSecret.min_confidence_score
  end
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
module Filters
|
5
|
+
# Applies regex-based filtering to extract matching text from input.
|
6
|
+
class Regex
  # @return [String] The label applied to matching content
  attr_reader :label

  # @param label [String] The label for redacted content
  # @param regex [Regexp] The regular expression used to match content
  def initialize(label:, regex:)
    @label = label
    @regex = regex
  end

  # Applies the regex to the input and returns the full text of every match.
  #
  # A bare `input.scan(regex)` returns arrays of capture groups as soon as the
  # pattern contains a group, which is not the documented Array<String>.
  # Collecting the whole match keeps the result uniform for any pattern.
  #
  # @param input [String] The input text to scan
  # @return [Array<String>] All matches found, in order of appearance
  def call(input)
    matches = []
    input.scan(regex) { matches << Regexp.last_match(0) }
    matches
  end

  private

  # @return [Regexp] The regular expression used for matching
  # @private
  attr_reader :regex
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
# Holds the result of a redaction operation.
|
5
|
+
class Result
|
6
|
+
# @return [String] The original, unredacted input exactly as passed to the constructor
|
7
|
+
attr_reader :input
|
8
|
+
|
9
|
+
# @return [String] The redacted output exactly as passed to the constructor
|
10
|
+
attr_reader :output
|
11
|
+
|
12
|
+
# @return [Hash] Mapping of redaction labels to the original matched values
|
13
|
+
attr_reader :mapping
|
14
|
+
|
15
|
+
# Stores the three values as given; nothing is copied, frozen, or validated here.
#
# @param input [String] The original text
|
16
|
+
# @param output [String] The redacted text
|
17
|
+
# @param mapping [Hash] Map of redaction labels to the original matched values
|
18
|
+
def initialize(input, output, mapping)
|
19
|
+
@input = input
|
20
|
+
@output = output
|
21
|
+
@mapping = mapping
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
# Processes text to identify and redact sensitive information using configured filters.
|
5
|
+
class Text
  # Stores the input and runs MITIE entity extraction over a copy of it.
  #
  # @param input [String] The original text to be filtered
  # @param filters [Hash, nil] Filters merged over TopSecret.default_filters
  #   when {#filter} runs. A nil value under a key disables that filter; nil
  #   for the whole argument is treated as "no overrides".
  def initialize(input, filters: TopSecret.default_filters)
    @input = input
    @output = input.dup
    @mapping = {}

    @model = Mitie::NER.new(TopSecret.model_path)
    @doc = @model.doc(@output)
    @entities = @doc.entities

    # Hash#merge rejects nil, so normalize it here.
    @filters = filters || {}
  end

  # Convenience method to create an instance and filter input
  #
  # @param input [String] The text to filter
  # @param filters [Hash] Optional filters to override defaults
  # @return [Result] The filtered result
  def self.filter(input, filters: {})
    new(input, filters:).filter
  end

  # Applies configured filters to the input, redacting matches and building a mapping.
  #
  # @return [Result] Contains original input, redacted output, and mapping of labels to values
  # @raise [Error] If an unsupported filter is encountered
  def filter
    # compact drops filters that were disabled by mapping their key to nil.
    TopSecret.default_filters.merge(filters).compact.each_value do |filter|
      values = case filter
      when TopSecret::Filters::Regex
        filter.call(input)
      when TopSecret::Filters::NER
        filter.call(entities)
      else
        raise Error, "Unsupported filter. Expected TopSecret::Filters::Regex or TopSecret::Filters::NER, but got #{filter.class}"
      end
      build_mapping(values, label: filter.label)
    end

    substitute_text

    Result.new(input, output, mapping)
  end

  private

  # @return [String] Original unredacted input text
  attr_reader :input

  # @return [String] Output with sensitive information redacted
  attr_reader :output

  # @return [Hash] Mapping from redaction labels to original values
  attr_reader :mapping

  # @return [Array<Hash>] Named entities extracted by MITIE
  attr_reader :entities

  # @return [Hash] Active filters used for redaction
  attr_reader :filters

  # Builds the mapping of label keys to matched values, indexed uniquely.
  #
  # Duplicate values share one entry. Empty matches are skipped: they carry
  # no information and substituting "" would insert a placeholder between
  # every character of the output.
  #
  # @param values [Array<String>] Values matched by a filter
  # @param label [String] Label identifying the filter type
  # @return [void]
  def build_mapping(values, label:)
    values.uniq.reject(&:empty?).each.with_index(1) do |value, index|
      mapping[:"#{label}_#{index}"] = value
    end
  end

  # Substitutes matched values in the output text with their label placeholders.
  #
  # All values are replaced in a single pass, trying longer values first, so
  # that a short value ("Ann") cannot corrupt a longer one that contains it
  # ("Annapolis") and text inside an already-inserted placeholder is never
  # rewritten. When the same value is mapped under several labels, the first
  # label in the mapping is the one written to the output.
  #
  # @return [void]
  def substitute_text
    return if mapping.empty?

    placeholders = mapping.each_with_object({}) do |(label, value), lookup|
      lookup[value] ||= "[#{label}]"
    end
    # Alternation takes the first branch that matches, hence longest first.
    pattern = Regexp.union(placeholders.keys.sort_by { |value| -value.length })

    output.gsub!(pattern, placeholders)
  end
end
|
89
|
+
end
|
data/lib/top_secret.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# dependencies
|
4
|
+
require "active_support/configurable"
|
5
|
+
require "active_support/ordered_options"
|
6
|
+
require "mitie"
|
7
|
+
|
8
|
+
# modules
|
9
|
+
require_relative "top_secret/version"
|
10
|
+
require_relative "top_secret/constants"
|
11
|
+
require_relative "top_secret/filters/ner"
|
12
|
+
require_relative "top_secret/filters/regex"
|
13
|
+
require_relative "top_secret/error"
|
14
|
+
require_relative "top_secret/result"
|
15
|
+
require_relative "top_secret/text"
|
16
|
+
|
17
|
+
# TopSecret filters sensitive information from free text before it's sent to external services or APIs, such as Chatbots.
|
18
|
+
#
|
19
|
+
# @!attribute [rw] model_path
|
20
|
+
# @return [String] the path to the MITIE NER model
|
21
|
+
#
|
22
|
+
# @!attribute [rw] min_confidence_score
|
23
|
+
# @return [Float] the minimum confidence score required for NER matches
|
24
|
+
#
|
25
|
+
# @!attribute [rw] default_filters
|
26
|
+
# @return [ActiveSupport::OrderedOptions] a set of default filters used to identify sensitive data
|
27
|
+
module TopSecret
|
28
|
+
include ActiveSupport::Configurable
|
29
|
+
|
30
|
+
# Relative path; the README states the model file is expected at the project root by default.
config_accessor :model_path, default: "ner_model.dat"
|
31
|
+
config_accessor :min_confidence_score, default: MIN_CONFIDENCE_SCORE
|
32
|
+
|
33
|
+
# The option names assigned below (e.g. email_filter) are the keys the README
# tells callers to use when overriding or disabling a default filter.
config_accessor :default_filters do
|
34
|
+
options = ActiveSupport::OrderedOptions.new
|
35
|
+
options.credit_card_filter = TopSecret::Filters::Regex.new(label: "CREDIT_CARD", regex: CREDIT_CARD_REGEX)
|
36
|
+
options.email_filter = TopSecret::Filters::Regex.new(label: "EMAIL", regex: EMAIL_REGEX)
|
37
|
+
options.phone_number_filter = TopSecret::Filters::Regex.new(label: "PHONE_NUMBER", regex: PHONE_REGEX)
|
38
|
+
options.ssn_filter = TopSecret::Filters::Regex.new(label: "SSN", regex: SSN_REGEX)
|
39
|
+
# No min_confidence_score is passed to the NER filters, so they rely on TopSecret.min_confidence_score.
options.people_filter = TopSecret::Filters::NER.new(label: "PERSON", tag: :person)
|
40
|
+
options.location_filter = TopSecret::Filters::NER.new(label: "LOCATION", tag: :location)
|
41
|
+
|
42
|
+
options
|
43
|
+
end
|
44
|
+
end
|
data/sig/top_secret.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: top_secret
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Steve Polito
|
8
|
+
bindir: exe
|
9
|
+
cert_chain: []
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: activesupport
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '8.0'
|
19
|
+
- - ">="
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 8.0.2
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
requirements:
|
26
|
+
- - "~>"
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: '8.0'
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 8.0.2
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: mitie
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - "~>"
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 0.3.2
|
39
|
+
type: :runtime
|
40
|
+
prerelease: false
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - "~>"
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.3.2
|
46
|
+
description: Filter sensitive information from free text before sending it to external
|
47
|
+
services or APIs, such as Chatbots.
|
48
|
+
email:
|
49
|
+
- stevepolito@hey.com
|
50
|
+
executables: []
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- CHANGELOG.md
|
55
|
+
- CODE_OF_CONDUCT.md
|
56
|
+
- LICENSE.txt
|
57
|
+
- README.md
|
58
|
+
- Rakefile
|
59
|
+
- lib/top_secret.rb
|
60
|
+
- lib/top_secret/constants.rb
|
61
|
+
- lib/top_secret/error.rb
|
62
|
+
- lib/top_secret/filters/ner.rb
|
63
|
+
- lib/top_secret/filters/regex.rb
|
64
|
+
- lib/top_secret/result.rb
|
65
|
+
- lib/top_secret/text.rb
|
66
|
+
- lib/top_secret/version.rb
|
67
|
+
- sig/top_secret.rbs
|
68
|
+
homepage: https://github.com/thoughtbot/top_secret
|
69
|
+
licenses:
|
70
|
+
- MIT
|
71
|
+
metadata:
|
72
|
+
homepage_uri: https://github.com/thoughtbot/top_secret
|
73
|
+
source_code_uri: https://github.com/thoughtbot/top_secret
|
74
|
+
changelog_uri: https://github.com/thoughtbot/top_secret/blob/main/CHANGELOG.md
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 3.2.0
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubygems_version: 3.6.9
|
90
|
+
specification_version: 4
|
91
|
+
summary: Filter sensitive information from free text.
|
92
|
+
test_files: []
|