top_secret 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/README.md +168 -3
- data/lib/top_secret/constants.rb +3 -0
- data/lib/top_secret/mapping.rb +15 -0
- data/lib/top_secret/null_model.rb +32 -0
- data/lib/top_secret/text/batch_result.rb +15 -18
- data/lib/top_secret/text/global_mapping.rb +63 -0
- data/lib/top_secret/text/result.rb +34 -1
- data/lib/top_secret/text/scan_result.rb +18 -0
- data/lib/top_secret/text.rb +77 -42
- data/lib/top_secret/version.rb +3 -1
- data/lib/top_secret.rb +2 -1
- metadata +13 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cef5c9c267dd71870fd244408a3f9a020d19978381810f99e7ed5defc67f12a7
|
4
|
+
data.tar.gz: ec20793792721a47371cc6a7cf3c687ad4284b9c9f1f9b14c57e002118c5532a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0eea9a1087e5082e245b73cdbb434543dbfcff277b6d58b7a89e463b1162215e0058f5190df3863a73fd32223fdf42fa65a253781193ea7e44ecc6b68c359e45
|
7
|
+
data.tar.gz: 6626056578ec3feecf27d843a994f782c144b363a6b7473e6e0be0fa80044c2283c329ae068c4f0f66af13f021743039864e13f220174489bd822e0c162872a7
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,26 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.3.0] - 2025-09-19
|
4
|
+
|
5
|
+
### Added
|
6
|
+
|
7
|
+
- Added `TopSecret::Text.scan` method for detecting sensitive information without redacting text
|
8
|
+
- Added `TopSecret::Text::ScanResult` class to hold scan operation results with `mapping` and `sensitive?` methods
|
9
|
+
- Added `TopSecret::Text::GlobalMapping` class to manage consistent labeling across multiple filtering operations
|
10
|
+
- Added factory methods to domain objects: `BatchResult.from_messages`, `Result.from_messages`, and `Result.with_global_labels`
|
11
|
+
- Added support for disabling NER filtering by setting `model_path` to `nil` for improved performance and deployment flexibility
|
12
|
+
- Added support for Rails 7.0 and newer
|
13
|
+
- Added `#safe?` predicate method as the logical opposite of `#sensitive?` for `BatchResult`, `Result` and `ScanResult` classes
|
14
|
+
|
15
|
+
### Changed
|
16
|
+
|
17
|
+
- **BREAKING:** `TopSecret::Text.filter_all` now returns `TopSecret::Text::Result` objects instead of `TopSecret::Text::BatchResult::Item` objects for individual items
|
18
|
+
- Each item in `BatchResult#items` now includes an individual `mapping` attribute containing only the sensitive information found in that specific message
|
19
|
+
- `TopSecret::Text.filter_all` now only processes sensitive results when building global mappings, improving efficiency
|
20
|
+
- Refactored `TopSecret::Text.filter_all` to use domain objects with better separation of concerns and testability
|
21
|
+
- Improved performance by implementing lazy loading of MITIE model and document processing
|
22
|
+
- NER filtering now gracefully falls back when MITIE model is unavailable, continuing with regex-based filters only
|
23
|
+
|
3
24
|
## [0.2.0] - 2025-08-18
|
4
25
|
|
5
26
|
### Added
|
data/README.md
CHANGED
@@ -34,6 +34,13 @@ gem install top_secret
|
|
34
34
|
>
|
35
35
|
> You'll need to download and extract [ner_model.dat][] first.
|
36
36
|
|
37
|
+
> [!TIP]
|
38
|
+
> Due to its large size, you'll likely want to avoid committing [ner_model.dat][] into version control.
|
39
|
+
>
|
40
|
+
> You'll need to ensure the file exists in deployed environments. See relevant [discussion][discussions_60] for details.
|
41
|
+
>
|
42
|
+
> Alternatively, you can disable NER filtering entirely by setting `model_path` to `nil` if you only need regex-based filters (credit cards, emails, phone numbers, SSNs). This improves performance and eliminates the model file dependency.
|
43
|
+
|
37
44
|
By default, Top Secret assumes the file will live at the root of your project, but this can be configured.
|
38
45
|
|
39
46
|
```ruby
|
@@ -156,6 +163,81 @@ result.mapping
|
|
156
163
|
# => {:EMAIL_1=>"ralph@thoughtbot.com", :PERSON_1=>"Ralph"}
|
157
164
|
```
|
158
165
|
|
166
|
+
Check if sensitive information was found
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
result.sensitive?
|
170
|
+
|
171
|
+
# => true
|
172
|
+
|
173
|
+
result.safe?
|
174
|
+
|
175
|
+
# => false
|
176
|
+
```
|
177
|
+
|
178
|
+
### Scanning for Sensitive Information
|
179
|
+
|
180
|
+
Use `TopSecret::Text.scan` to detect sensitive information without redacting the text. This is useful when you only need to check if sensitive data exists or get a mapping of what was found:
|
181
|
+
|
182
|
+
```ruby
|
183
|
+
TopSecret::Text.scan("Ralph can be reached at ralph@thoughtbot.com")
|
184
|
+
```
|
185
|
+
|
186
|
+
This will return
|
187
|
+
|
188
|
+
```ruby
|
189
|
+
<TopSecret::Text::ScanResult
|
190
|
+
@mapping={:EMAIL_1=>"ralph@thoughtbot.com", :PERSON_1=>"Ralph"}
|
191
|
+
>
|
192
|
+
```
|
193
|
+
|
194
|
+
Check if sensitive information was found
|
195
|
+
|
196
|
+
```ruby
|
197
|
+
result.sensitive?
|
198
|
+
|
199
|
+
# => true
|
200
|
+
|
201
|
+
result.safe?
|
202
|
+
|
203
|
+
# => false
|
204
|
+
```
|
205
|
+
|
206
|
+
View the mapping of found sensitive information
|
207
|
+
|
208
|
+
```ruby
|
209
|
+
result.mapping
|
210
|
+
|
211
|
+
# => {:EMAIL_1=>"ralph@thoughtbot.com", :PERSON_1=>"Ralph"}
|
212
|
+
```
|
213
|
+
|
214
|
+
The `scan` method accepts the same filter options as `filter`:
|
215
|
+
|
216
|
+
```ruby
|
217
|
+
# Override default filters
|
218
|
+
email_filter = TopSecret::Filters::Regex.new(
|
219
|
+
label: "EMAIL_ADDRESS",
|
220
|
+
regex: /\w+\[at\]\w+\.\w+/
|
221
|
+
)
|
222
|
+
result = TopSecret::Text.scan("Contact user[at]example.com", email_filter:)
|
223
|
+
result.mapping
|
224
|
+
# => {:EMAIL_ADDRESS_1=>"user[at]example.com"}
|
225
|
+
|
226
|
+
# Disable specific filters
|
227
|
+
result = TopSecret::Text.scan("Ralph works in Boston", people_filter: nil)
|
228
|
+
result.mapping
|
229
|
+
# => {:LOCATION_1=>"Boston"}
|
230
|
+
|
231
|
+
# Add custom filters
|
232
|
+
ip_filter = TopSecret::Filters::Regex.new(
|
233
|
+
label: "IP_ADDRESS",
|
234
|
+
regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
|
235
|
+
)
|
236
|
+
result = TopSecret::Text.scan("Server IP is 192.168.1.1", custom_filters: [ip_filter])
|
237
|
+
result.mapping
|
238
|
+
# => {:IP_ADDRESS_1=>"192.168.1.1"}
|
239
|
+
```
|
240
|
+
|
159
241
|
### Batch Processing
|
160
242
|
|
161
243
|
When processing multiple messages, use `filter_all` to ensure consistent redaction labels across all messages:
|
@@ -176,9 +258,9 @@ This will return
|
|
176
258
|
<TopSecret::Text::BatchResult
|
177
259
|
@mapping={:EMAIL_1=>"ralph@thoughtbot.com", :EMAIL_2=>"ruby@thoughtbot.com"},
|
178
260
|
@items=[
|
179
|
-
<TopSecret::Text::
|
180
|
-
<TopSecret::Text::
|
181
|
-
<TopSecret::Text::
|
261
|
+
<TopSecret::Text::Result @input="Contact ralph@thoughtbot.com for details", @output="Contact [EMAIL_1] for details", @mapping={:EMAIL_1=>"ralph@thoughtbot.com"}>,
|
262
|
+
<TopSecret::Text::Result @input="Email ralph@thoughtbot.com again if needed", @output="Email [EMAIL_1] again if needed", @mapping={:EMAIL_1=>"ralph@thoughtbot.com"}>,
|
263
|
+
<TopSecret::Text::Result @input="Also CC ruby@thoughtbot.com on the thread", @output="Also CC [EMAIL_2] on the thread", @mapping={:EMAIL_2=>"ruby@thoughtbot.com"}>
|
182
264
|
]
|
183
265
|
>
|
184
266
|
```
|
@@ -199,10 +281,21 @@ result.items[0].input
|
|
199
281
|
|
200
282
|
result.items[0].output
|
201
283
|
# => "Contact [EMAIL_1] for details"
|
284
|
+
|
285
|
+
result.items[0].mapping
|
286
|
+
# => {:EMAIL_1=>"ralph@thoughtbot.com"}
|
287
|
+
|
288
|
+
result.items[0].sensitive?
|
289
|
+
# => true
|
290
|
+
|
291
|
+
result.items[0].safe?
|
292
|
+
# => false
|
202
293
|
```
|
203
294
|
|
204
295
|
The key benefit is that identical values receive the same labels across all messages - notice how `ralph@thoughtbot.com` becomes `[EMAIL_1]` in both the first and second messages.
|
205
296
|
|
297
|
+
Each item also maintains its own mapping containing only the sensitive information found in that specific message, while the batch result provides a global mapping of all sensitive information across all messages.
|
298
|
+
|
206
299
|
### Restoring Filtered Text
|
207
300
|
|
208
301
|
When external services (like LLMs) return responses containing filter placeholders, use `TopSecret::FilteredText.restore` to substitute them back with original values:
|
@@ -249,6 +342,61 @@ restore_result.unrestored
|
|
249
342
|
|
250
343
|
The restoration process tracks both successful and failed placeholder substitutions, allowing you to handle cases where the LLM response contains placeholders not found in your mapping.
|
251
344
|
|
345
|
+
### Working with LLMs
|
346
|
+
|
347
|
+
When sending filtered information to LLMs, they'll likely need to be instructed on how to handle those filters. Otherwise, we risk them not being returned in the response, which would break the restoration process.
|
348
|
+
|
349
|
+
Here's a recommended approach:
|
350
|
+
|
351
|
+
```ruby
|
352
|
+
instructions = <<~TEXT
|
353
|
+
I'm going to send filtered information to you in the form of free text.
|
354
|
+
If you need to refer to the filtered information in a response, just reference it by the filter.
|
355
|
+
TEXT
|
356
|
+
```
|
357
|
+
|
358
|
+
Complete example:
|
359
|
+
|
360
|
+
```ruby
|
361
|
+
require "openai"
|
362
|
+
require "top_secret"
|
363
|
+
|
364
|
+
openai = OpenAI::Client.new(
|
365
|
+
api_key: Rails.application.credentials.openai.api_key!
|
366
|
+
)
|
367
|
+
|
368
|
+
original_messages = [
|
369
|
+
"Ralph lives in Boston.",
|
370
|
+
"You can reach them at ralph@thoughtbot.com or 877-976-2687"
|
371
|
+
]
|
372
|
+
|
373
|
+
# Filter all messages
|
374
|
+
result = TopSecret::Text.filter_all(original_messages)
|
375
|
+
filtered_messages = result.items.map(&:output)
|
376
|
+
|
377
|
+
user_messages = filtered_messages.map { {role: "user", content: it} }
|
378
|
+
|
379
|
+
# Instruct LLM how to handle filtered messages
|
380
|
+
instructions = <<~TEXT
|
381
|
+
I'm going to send filtered information to you in the form of free text.
|
382
|
+
If you need to refer to the filtered information in a response, just reference it by the filter.
|
383
|
+
TEXT
|
384
|
+
|
385
|
+
messages = [
|
386
|
+
{role: "system", content: instructions},
|
387
|
+
*user_messages
|
388
|
+
]
|
389
|
+
|
390
|
+
chat_completion = openai.chat.completions.create(messages:, model: :"gpt-5")
|
391
|
+
response = chat_completion.choices.last.message.content
|
392
|
+
|
393
|
+
# Restore the response from the mapping
|
394
|
+
mapping = result.mapping
|
395
|
+
restored_response = TopSecret::FilteredText.restore(response, mapping:).output
|
396
|
+
|
397
|
+
puts(restored_response)
|
398
|
+
```
|
399
|
+
|
252
400
|
### Advanced Examples
|
253
401
|
|
254
402
|
#### Overriding the default filters
|
@@ -420,6 +568,22 @@ TopSecret.configure do |config|
|
|
420
568
|
end
|
421
569
|
```
|
422
570
|
|
571
|
+
### Disabling NER filtering
|
572
|
+
|
573
|
+
For improved performance or when the MITIE model file cannot be deployed, you can disable NER-based filtering entirely. This will disable people and location detection but retain all regex-based filters (credit cards, emails, phone numbers, SSNs):
|
574
|
+
|
575
|
+
```ruby
|
576
|
+
TopSecret.configure do |config|
|
577
|
+
config.model_path = nil
|
578
|
+
end
|
579
|
+
```
|
580
|
+
|
581
|
+
This is useful in environments where:
|
582
|
+
|
583
|
+
- The model file cannot be deployed due to size constraints
|
584
|
+
- You only need regex-based filtering
|
585
|
+
- You want to optimize for performance over NER capabilities
|
586
|
+
|
423
587
|
### Overriding the confidence score
|
424
588
|
|
425
589
|
```ruby
|
@@ -523,3 +687,4 @@ We are [available for hire][hire].
|
|
523
687
|
[train]: https://github.com/ankane/mitie-ruby?tab=readme-ov-file#training
|
524
688
|
[Regex filters]: https://github.com/thoughtbot/top_secret/blob/main/lib/top_secret/filters/regex.rb
|
525
689
|
[NER filters]: https://github.com/thoughtbot/top_secret/blob/main/lib/top_secret/filters/ner.rb
|
690
|
+
[discussions_60]: https://github.com/thoughtbot/top_secret/discussions/60
|
data/lib/top_secret/constants.rb
CHANGED
data/lib/top_secret/mapping.rb
CHANGED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
module Mapping
|
5
|
+
# @return [Boolean] Whether sensitive information was found
|
6
|
+
def sensitive?
|
7
|
+
mapping.any?
|
8
|
+
end
|
9
|
+
|
10
|
+
# @return [Boolean] Whether sensitive information was not found
|
11
|
+
def safe?
|
12
|
+
!sensitive?
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/top_secret/null_model.rb
CHANGED
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
# A null object implementation that provides a no-op interface compatible with Mitie::NER.
|
5
|
+
# Used when NER filtering is disabled (model_path is nil) to eliminate conditional checks
|
6
|
+
# throughout the codebase.
|
7
|
+
#
|
8
|
+
# @example
|
9
|
+
# model = TopSecret::NullModel.new
|
10
|
+
# doc = model.doc("some text")
|
11
|
+
# doc.entities # => []
|
12
|
+
class NullModel
|
13
|
+
# A null document implementation that provides an empty entities array.
|
14
|
+
# Used as the return value from NullModel#doc to maintain interface compatibility.
|
15
|
+
class NullDoc
|
16
|
+
# Returns an empty array of entities.
|
17
|
+
#
|
18
|
+
# @return [Array] Always returns an empty array
|
19
|
+
def entities
|
20
|
+
[]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Creates a null document that returns empty entities.
|
25
|
+
#
|
26
|
+
# @param input [String] The input text (ignored)
|
27
|
+
# @return [NullDoc] A document-like object with empty entities
|
28
|
+
def doc(input)
|
29
|
+
NullDoc.new
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/top_secret/text/batch_result.rb
CHANGED
@@ -5,7 +5,9 @@ module TopSecret
|
|
5
5
|
# Holds the result of a batch redaction operation on multiple messages.
|
6
6
|
# Contains a global mapping that ensures consistent labeling across all messages
|
7
7
|
# and a collection of individual input/output pairs.
|
8
|
-
class BatchResult
|
8
|
+
class BatchResult # TODO Rename to FilterBatchResult
|
9
|
+
include Mapping
|
10
|
+
|
9
11
|
# @return [Hash] Global mapping of redaction labels to original values across all messages
|
10
12
|
attr_reader :mapping
|
11
13
|
|
@@ -21,24 +23,19 @@ module TopSecret
|
|
21
23
|
@items = items
|
22
24
|
end
|
23
25
|
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
26
|
+
# Creates a BatchResult from multiple messages with consistent global labeling
|
27
|
+
#
|
28
|
+
# @param messages [Array<String>] Array of text messages to filter
|
29
|
+
# @param custom_filters [Array] Additional custom filters to apply
|
30
|
+
# @param filters [Hash] Optional filters to override defaults (only valid filter keys accepted)
|
31
|
+
# @return [BatchResult] Contains global mapping and array of Result objects with individual mappings
|
32
|
+
# @raise [ArgumentError] If invalid filter keys are provided
|
33
|
+
def self.from_messages(messages, custom_filters: [], **filters)
|
34
|
+
individual_results = TopSecret::Text::Result.from_messages(messages, custom_filters:, **filters)
|
35
|
+
mapping = TopSecret::Text::GlobalMapping.from_results(individual_results)
|
36
|
+
items = TopSecret::Text::Result.with_global_labels(individual_results, mapping)
|
33
37
|
|
34
|
-
|
35
|
-
#
|
36
|
-
# @param input [String] The original text
|
37
|
-
# @param output [String] The redacted text
|
38
|
-
def initialize(input, output)
|
39
|
-
@input = input
|
40
|
-
@output = output
|
41
|
-
end
|
38
|
+
Text::BatchResult.new(mapping:, items:)
|
42
39
|
end
|
43
40
|
end
|
44
41
|
end
|
data/lib/top_secret/text/global_mapping.rb
CHANGED
@@ -0,0 +1,63 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
class Text
|
5
|
+
# Manages consistent labeling across multiple filtering operations by ensuring
|
6
|
+
# identical sensitive values receive the same redaction labels globally.
|
7
|
+
class GlobalMapping
|
8
|
+
# Creates a global mapping from individual filter results
|
9
|
+
#
|
10
|
+
# @param individual_results [Array<Result>] Array of individual filter results
|
11
|
+
# @return [Hash] Inverted mapping from filter labels to original values
|
12
|
+
def self.from_results(individual_results)
|
13
|
+
new.build_from_results(individual_results)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Creates a new GlobalMapping instance
|
17
|
+
def initialize
|
18
|
+
@mapping = {}
|
19
|
+
@label_counters = {}
|
20
|
+
end
|
21
|
+
|
22
|
+
# Builds the global mapping by processing all individual results
|
23
|
+
#
|
24
|
+
# @param individual_results [Array<Result>] Array of individual filter results
|
25
|
+
# @return [Hash] Inverted mapping from filter labels to original values
|
26
|
+
def build_from_results(individual_results)
|
27
|
+
individual_results.each { |result| process_result(result) if result.sensitive? }
|
28
|
+
|
29
|
+
mapping.invert
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
attr_reader :mapping
|
35
|
+
attr_reader :label_counters
|
36
|
+
|
37
|
+
# Processes a single result, adding new values to the global mapping
|
38
|
+
#
|
39
|
+
# @param result [Result] Individual filter result to process
|
40
|
+
def process_result(result)
|
41
|
+
result.mapping.each do |individual_key, value|
|
42
|
+
next if mapping.key?(value)
|
43
|
+
|
44
|
+
mapping[value] = generate_global_key(individual_key)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Generates a consistent global key for a given individual key
|
49
|
+
#
|
50
|
+
# @param individual_key [Symbol] The individual key from a filter result
|
51
|
+
# @return [Symbol] The global key with consistent numbering
|
52
|
+
def generate_global_key(individual_key)
|
53
|
+
# TODO: This assumes labels are formatted consistently.
|
54
|
+
# We need to account for the following for the case where a label could begin with an "_"
|
55
|
+
label_type = individual_key.to_s.rpartition("_").first
|
56
|
+
|
57
|
+
label_counters[label_type] ||= 0
|
58
|
+
label_counters[label_type] += 1
|
59
|
+
:"#{label_type}_#{label_counters[label_type]}"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
data/lib/top_secret/text/result.rb
CHANGED
@@ -3,7 +3,9 @@
|
|
3
3
|
module TopSecret
|
4
4
|
class Text
|
5
5
|
# Holds the result of a redaction operation.
|
6
|
-
class Result
|
6
|
+
class Result # TODO: Rename to FilterResult
|
7
|
+
include Mapping
|
8
|
+
|
7
9
|
# @return [String] The original unredacted input
|
8
10
|
attr_reader :input
|
9
11
|
|
@@ -21,6 +23,37 @@ module TopSecret
|
|
21
23
|
@output = output
|
22
24
|
@mapping = mapping
|
23
25
|
end
|
26
|
+
|
27
|
+
# Filters multiple messages individually using a shared model for performance
|
28
|
+
#
|
29
|
+
# @param messages [Array<String>] Array of text messages to filter
|
30
|
+
# @param custom_filters [Array] Additional custom filters to apply
|
31
|
+
# @param filters [Hash] Optional filters to override defaults (only valid filter keys accepted)
|
32
|
+
# @return [Array<Result>] Array of individual Result objects for each message
|
33
|
+
# @raise [ArgumentError] If invalid filter keys are provided
|
34
|
+
def self.from_messages(messages, custom_filters: [], **filters)
|
35
|
+
shared_model = TopSecret.model_path ? Mitie::NER.new(TopSecret.model_path) : nil
|
36
|
+
|
37
|
+
messages.map do |message|
|
38
|
+
TopSecret::Text.new(message, filters:, custom_filters:, model: shared_model).filter
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Creates Result objects with globally consistent labels applied to text
|
43
|
+
#
|
44
|
+
# @param individual_results [Array<Result>] Array of individual filter results
|
45
|
+
# @param global_mapping [Hash] Global mapping from filter labels to original values
|
46
|
+
# @return [Array<Result>] Array of Result objects with globally consistent redaction and individual mappings
|
47
|
+
def self.with_global_labels(individual_results, global_mapping)
|
48
|
+
individual_results.map do |result|
|
49
|
+
output = global_mapping.reduce(result.input.dup) do |text, (filter, value)|
|
50
|
+
text.gsub(value, "[#{filter}]")
|
51
|
+
end
|
52
|
+
filter_keys = output.scan(/\[([^\]]+)\]/).flatten.map(&:to_sym)
|
53
|
+
mapping = global_mapping.slice(*filter_keys)
|
54
|
+
new(result.input, output, mapping)
|
55
|
+
end
|
56
|
+
end
|
24
57
|
end
|
25
58
|
end
|
26
59
|
end
|
data/lib/top_secret/text/scan_result.rb
CHANGED
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module TopSecret
|
4
|
+
class Text
|
5
|
+
# Holds the result of a scan operation.
|
6
|
+
class ScanResult
|
7
|
+
include Mapping
|
8
|
+
|
9
|
+
# @return [Hash] Mapping of redacted labels to matched values
|
10
|
+
attr_reader :mapping
|
11
|
+
|
12
|
+
# @param mapping [Hash] Map of labels to matched values
|
13
|
+
def initialize(mapping)
|
14
|
+
@mapping = mapping
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/top_secret/text.rb
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require "active_support/core_ext/hash/keys"
|
4
|
+
require_relative "null_model"
|
4
5
|
require_relative "text/result"
|
5
6
|
require_relative "text/batch_result"
|
7
|
+
require_relative "text/scan_result"
|
8
|
+
require_relative "text/global_mapping"
|
6
9
|
|
7
10
|
module TopSecret
|
8
11
|
# Processes text to identify and redact sensitive information using configured filters.
|
@@ -16,9 +19,7 @@ module TopSecret
|
|
16
19
|
@output = input.dup
|
17
20
|
@mapping = {}
|
18
21
|
|
19
|
-
@model = model ||
|
20
|
-
@doc = @model.doc(@output)
|
21
|
-
@entities = @doc.entities
|
22
|
+
@model = model || default_model
|
22
23
|
|
23
24
|
@filters = filters
|
24
25
|
@custom_filters = custom_filters
|
@@ -44,7 +45,7 @@ module TopSecret
|
|
44
45
|
# @param messages [Array<String>] Array of text messages to filter
|
45
46
|
# @param custom_filters [Array] Additional custom filters to apply
|
46
47
|
# @param filters [Hash] Optional filters to override defaults (only valid filter keys accepted)
|
47
|
-
# @return [BatchResult] Contains global mapping and array of
|
48
|
+
# @return [BatchResult] Contains global mapping and array of Result objects with individual mappings
|
48
49
|
# @raise [ArgumentError] If invalid filter keys are provided
|
49
50
|
#
|
50
51
|
# @example Basic usage
|
@@ -52,54 +53,59 @@ module TopSecret
|
|
52
53
|
# result = TopSecret::Text.filter_all(messages)
|
53
54
|
# result.items[0].output # => "Contact [EMAIL_1]"
|
54
55
|
# result.items[1].output # => "Email [EMAIL_1] again"
|
56
|
+
# result.items[0].mapping # => { EMAIL_1: "john@test.com" }
|
55
57
|
# result.mapping # => { EMAIL_1: "john@test.com" }
|
56
58
|
#
|
57
59
|
# @example With custom filters
|
58
60
|
# ip_filter = TopSecret::Filters::Regex.new(label: "IP", regex: /\d+\.\d+\.\d+\.\d+/)
|
59
61
|
# result = TopSecret::Text.filter_all(messages, custom_filters: [ip_filter])
|
60
62
|
def self.filter_all(messages, custom_filters: [], **filters)
|
61
|
-
|
62
|
-
|
63
|
-
individual_results = messages.map do |message|
|
64
|
-
new(message, filters:, custom_filters:, model: shared_model).filter
|
65
|
-
end
|
66
|
-
|
67
|
-
global_mapping = {}
|
68
|
-
label_counters = {}
|
69
|
-
|
70
|
-
individual_results.each do |result|
|
71
|
-
result.mapping.each do |individual_key, value|
|
72
|
-
next if global_mapping.key?(value)
|
73
|
-
|
74
|
-
# TODO: This assumes labels are formatted consistently.
|
75
|
-
# We need to account for the following for the case where a label could begin with an "_"
|
76
|
-
label_type = individual_key.to_s.rpartition("_").first
|
77
|
-
|
78
|
-
label_counters[label_type] ||= 0
|
79
|
-
label_counters[label_type] += 1
|
80
|
-
global_key = :"#{label_type}_#{label_counters[label_type]}"
|
81
|
-
|
82
|
-
global_mapping[value] = global_key
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
inverted_global_mapping = global_mapping.invert
|
87
|
-
|
88
|
-
items = individual_results.map do |result|
|
89
|
-
output = result.input.dup
|
90
|
-
inverted_global_mapping.each { |filter, value| output.gsub!(value, "[#{filter}]") }
|
91
|
-
Text::BatchResult::Item.new(result.input, output)
|
92
|
-
end
|
63
|
+
Text::BatchResult.from_messages(messages, custom_filters:, **filters)
|
64
|
+
end
|
93
65
|
|
94
|
-
|
66
|
+
# Convenience method to scan input text for sensitive information without redacting it
|
67
|
+
#
|
68
|
+
# This method detects sensitive information using configured filters but does not modify
|
69
|
+
# the original text. Use this when you only need to check if sensitive data exists or
|
70
|
+
# get a mapping of what was found.
|
71
|
+
#
|
72
|
+
# @param input [String] The text to scan for sensitive information
|
73
|
+
# @param filters [Hash] Optional filters to override defaults (only valid filter keys accepted)
|
74
|
+
# @param custom_filters [Array] Additional custom filters to apply
|
75
|
+
# @return [ScanResult] Contains mapping of found sensitive information and sensitive? flag
|
76
|
+
# @raise [ArgumentError] If invalid filter keys are provided
|
77
|
+
#
|
78
|
+
# @example Basic scanning
|
79
|
+
# result = TopSecret::Text.scan("Contact john@example.com")
|
80
|
+
# result.sensitive? # => true
|
81
|
+
# result.mapping # => {:EMAIL_1=>"john@example.com"}
|
82
|
+
#
|
83
|
+
# @example With custom filters
|
84
|
+
# ip_filter = TopSecret::Filters::Regex.new(label: "IP", regex: /\d+\.\d+\.\d+\.\d+/)
|
85
|
+
# result = TopSecret::Text.scan("Server IP: 192.168.1.1", custom_filters: [ip_filter])
|
86
|
+
# result.mapping # => {:IP_1=>"192.168.1.1"}
|
87
|
+
#
|
88
|
+
# @example Overriding default filters
|
89
|
+
# custom_email = TopSecret::Filters::Regex.new(label: "EMAIL_ADDR", regex: /\w+@\w+/)
|
90
|
+
# result = TopSecret::Text.scan("user@test.com", email_filter: custom_email)
|
91
|
+
# result.mapping # => {:EMAIL_ADDR_1=>"user@test.com"}
|
92
|
+
def self.scan(input, custom_filters: [], **filters)
|
93
|
+
new(input, filters:, custom_filters:).scan
|
95
94
|
end
|
96
95
|
|
97
|
-
#
|
96
|
+
# Scans the input text for sensitive information using configured filters
|
98
97
|
#
|
99
|
-
#
|
98
|
+
# This method applies all active filters to detect sensitive information but does not
|
99
|
+
# redact the original text. It builds a mapping of found values and returns whether
|
100
|
+
# any sensitive information was detected.
|
101
|
+
#
|
102
|
+
# @return [ScanResult] Contains mapping of found sensitive information and sensitive? flag
|
100
103
|
# @raise [Error] If an unsupported filter is encountered
|
101
104
|
# @raise [ArgumentError] If invalid filter keys are provided
|
102
|
-
def filter
|
105
|
+
def scan
|
106
|
+
@doc ||= model.doc(@output) if model
|
107
|
+
@entities ||= doc.entities if model
|
108
|
+
|
103
109
|
validate_filters!
|
104
110
|
|
105
111
|
all_filters.each do |filter|
|
@@ -116,9 +122,20 @@ module TopSecret
|
|
116
122
|
build_mapping(values, label: filter.label)
|
117
123
|
end
|
118
124
|
|
119
|
-
|
125
|
+
ScanResult.new(mapping)
|
126
|
+
end
|
120
127
|
|
121
|
-
|
128
|
+
# Applies configured filters to the input, redacting matches and building a mapping.
|
129
|
+
#
|
130
|
+
# @return [Result] Contains original input, redacted output, and mapping of labels to values
|
131
|
+
# @raise [Error] If an unsupported filter is encountered
|
132
|
+
# @raise [ArgumentError] If invalid filter keys are provided
|
133
|
+
def filter
|
134
|
+
scan_result = scan
|
135
|
+
|
136
|
+
substitute_text if scan_result.sensitive?
|
137
|
+
|
138
|
+
Text::Result.new(input, output, scan_result.mapping)
|
122
139
|
end
|
123
140
|
|
124
141
|
private
|
@@ -132,6 +149,12 @@ module TopSecret
|
|
132
149
|
# @return [Hash] Mapping from redaction labels to original values
|
133
150
|
attr_reader :mapping
|
134
151
|
|
152
|
+
# @return [Object] The NER model (typically Mitie::NER or a test double)
|
153
|
+
attr_reader :model
|
154
|
+
|
155
|
+
# @return [Object] The document created from the output text (typically Mitie::Document or a test double)
|
156
|
+
attr_reader :doc
|
157
|
+
|
135
158
|
# @return [Array<Hash>] Named entities extracted by MITIE
|
136
159
|
attr_reader :entities
|
137
160
|
|
@@ -199,5 +222,17 @@ module TopSecret
|
|
199
222
|
location_filter: TopSecret.location_filter
|
200
223
|
}
|
201
224
|
end
|
225
|
+
|
226
|
+
# Creates the default model based on configuration.
|
227
|
+
# Returns a MITIE NER model if a model path is configured, otherwise returns a null model.
|
228
|
+
#
|
229
|
+
# @return [Mitie::NER, NullModel] The model instance to use for NER processing
|
230
|
+
def default_model
|
231
|
+
if TopSecret.model_path
|
232
|
+
Mitie::NER.new(TopSecret.model_path)
|
233
|
+
else
|
234
|
+
NullModel.new
|
235
|
+
end
|
236
|
+
end
|
202
237
|
end
|
203
238
|
end
|
data/lib/top_secret/version.rb
CHANGED
data/lib/top_secret.rb
CHANGED
@@ -8,6 +8,7 @@ require "mitie"
|
|
8
8
|
# modules
|
9
9
|
require_relative "top_secret/version"
|
10
10
|
require_relative "top_secret/constants"
|
11
|
+
require_relative "top_secret/mapping"
|
11
12
|
require_relative "top_secret/filters/ner"
|
12
13
|
require_relative "top_secret/filters/regex"
|
13
14
|
require_relative "top_secret/error"
|
@@ -45,7 +46,7 @@ require_relative "top_secret/filtered_text"
|
|
45
46
|
module TopSecret
|
46
47
|
include ActiveSupport::Configurable
|
47
48
|
|
48
|
-
config_accessor :model_path, default:
|
49
|
+
config_accessor :model_path, default: MODEL_PATH
|
49
50
|
config_accessor :min_confidence_score, default: MIN_CONFIDENCE_SCORE
|
50
51
|
|
51
52
|
config_accessor :custom_filters, default: []
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: top_secret
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steve Polito
|
@@ -13,22 +13,22 @@ dependencies:
|
|
13
13
|
name: activesupport
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
15
15
|
requirements:
|
16
|
-
- - "~>"
|
17
|
-
- !ruby/object:Gem::Version
|
18
|
-
version: '8.0'
|
19
16
|
- - ">="
|
20
17
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
18
|
+
version: 7.0.8
|
19
|
+
- - "<"
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '9'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
25
|
requirements:
|
26
|
-
- - "~>"
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
version: '8.0'
|
29
26
|
- - ">="
|
30
27
|
- !ruby/object:Gem::Version
|
31
|
-
version:
|
28
|
+
version: 7.0.8
|
29
|
+
- - "<"
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: '9'
|
32
32
|
- !ruby/object:Gem::Dependency
|
33
33
|
name: mitie
|
34
34
|
requirement: !ruby/object:Gem::Requirement
|
@@ -64,9 +64,13 @@ files:
|
|
64
64
|
- lib/top_secret/filtered_text/result.rb
|
65
65
|
- lib/top_secret/filters/ner.rb
|
66
66
|
- lib/top_secret/filters/regex.rb
|
67
|
+
- lib/top_secret/mapping.rb
|
68
|
+
- lib/top_secret/null_model.rb
|
67
69
|
- lib/top_secret/text.rb
|
68
70
|
- lib/top_secret/text/batch_result.rb
|
71
|
+
- lib/top_secret/text/global_mapping.rb
|
69
72
|
- lib/top_secret/text/result.rb
|
73
|
+
- lib/top_secret/text/scan_result.rb
|
70
74
|
- lib/top_secret/version.rb
|
71
75
|
- sig/top_secret.rbs
|
72
76
|
homepage: https://github.com/thoughtbot/top_secret
|