top_secret 0.3.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +40 -0
- data/README.md +156 -3
- data/lib/top_secret/category.rb +120 -0
- data/lib/top_secret/constants.rb +3 -0
- data/lib/top_secret/error.rb +3 -1
- data/lib/top_secret/mapping.rb +72 -0
- data/lib/top_secret/text/global_mapping.rb +4 -17
- data/lib/top_secret/text/label_sequence.rb +28 -0
- data/lib/top_secret/text.rb +70 -6
- data/lib/top_secret/version.rb +1 -1
- data/lib/top_secret.rb +16 -12
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e4bff6502396266520ec0ff99e2f413fd70301aa44c7a113e23d231466072860
|
|
4
|
+
data.tar.gz: 3fdd3c37f08538cce48f49cd90faa85717d8d06b5701ecbb763c353e08ad3060
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7630a299ea11ef420819088aa4edc1d4c12314f286b0ecc9fe4eecaa4f6392306856de65489bc6b8c609275dd044a406d172b58fcf04b9d7fa537890131f653c
|
|
7
|
+
data.tar.gz: 9c21a7ca0c5138da5132b002c4cb4e34d6824cc27b148a253bc1e9f01efdcd3152e8362455d909c42cf26eb0be5a58c695f26918cdc8d4222dcccd9ffdd660b2
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,45 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [1.0.0] - 2026-03-25
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Added category methods to `Result` for querying specific types of sensitive information (e.g., `emails`, `emails?`, `email_mapping`)
|
|
8
|
+
- Category methods are automatically generated for all default filter types and custom labels
|
|
9
|
+
- Category methods always return empty arrays/hashes when no data of that type is found, ensuring they're safe to call without checking
|
|
10
|
+
- Custom labels are matched exactly — `EMAIL_ADDRESS` methods are distinct from default `EMAIL` methods
|
|
11
|
+
- Added `categories` method to `Result` for listing category types that have matches in the mapping
|
|
12
|
+
|
|
13
|
+
### Changed
|
|
14
|
+
|
|
15
|
+
- **BREAKING:** Added strict label validation for custom filters. Labels must now start and end with letters and contain only alphabetic characters and single underscores (no consecutive underscores, digits, or special characters). Malformed labels that were previously accepted will now raise `Error::MalformedLabel`.
|
|
16
|
+
- Replaced `ActiveSupport::Configurable` with `mattr_accessor` for upcoming `Configurable` deprecation in Rails 8.2.
|
|
17
|
+
|
|
18
|
+
### Migration Guide
|
|
19
|
+
|
|
20
|
+
If you have custom filters with malformed labels, update them to meet the new requirements:
|
|
21
|
+
|
|
22
|
+
```ruby
|
|
23
|
+
# Before (invalid)
|
|
24
|
+
TopSecret::Filters::Regex.new(label: "EMAIL_ADDRESS_1", regex: /.../)
|
|
25
|
+
TopSecret::Filters::Regex.new(label: "_EMAIL", regex: /.../)
|
|
26
|
+
TopSecret::Filters::Regex.new(label: "EMAIL1", regex: /.../)
|
|
27
|
+
|
|
28
|
+
# After (valid)
|
|
29
|
+
TopSecret::Filters::Regex.new(label: "EMAIL_ADDRESS", regex: /.../)
|
|
30
|
+
TopSecret::Filters::Regex.new(label: "EMAIL", regex: /.../)
|
|
31
|
+
TopSecret::Filters::Regex.new(label: "EMAIL", regex: /.../)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Note: The `_N` suffix is appended automatically by the system during mapping.
|
|
35
|
+
|
|
36
|
+
## [0.4.0] - 2025-10-31
|
|
37
|
+
|
|
38
|
+
### Added
|
|
39
|
+
|
|
40
|
+
- Added automatic caching of MITIE NER model to improve performance by avoiding expensive reinitialization
|
|
41
|
+
- Added `TopSecret::Text.clear_model_cache!` method to clear the cached model when needed
|
|
42
|
+
|
|
3
43
|
## [0.3.0] - 2025-09-19
|
|
4
44
|
|
|
5
45
|
### Added
|
data/README.md
CHANGED
|
@@ -35,9 +35,7 @@ gem install top_secret
|
|
|
35
35
|
> You'll need to download and extract [ner_model.dat][] first.
|
|
36
36
|
|
|
37
37
|
> [!TIP]
|
|
38
|
-
> Due to its large size, you'll likely want to avoid committing [ner_model.dat][] into version control.
|
|
39
|
-
>
|
|
40
|
-
> You'll need to ensure the file exists in deployed environments. See relevant [discussion][discussions_60] for details.
|
|
38
|
+
> Due to its large size, you'll likely want to avoid committing [ner_model.dat][] into version control. See the [Production](#production) section for details on using [Trove][] to deploy the model file.
|
|
41
39
|
>
|
|
42
40
|
> Alternatively, you can disable NER filtering entirely by setting `model_path` to `nil` if you only need regex-based filters (credit cards, emails, phone numbers, SSNs). This improves performance and eliminates the model file dependency.
|
|
43
41
|
|
|
@@ -175,6 +173,104 @@ result.safe?
|
|
|
175
173
|
# => false
|
|
176
174
|
```
|
|
177
175
|
|
|
176
|
+
### Category Methods
|
|
177
|
+
|
|
178
|
+
Query the result for specific types of sensitive information using category methods:
|
|
179
|
+
|
|
180
|
+
```ruby
|
|
181
|
+
result = TopSecret::Text.filter("Ralph can be reached at ralph@example.com or 555-1234")
|
|
182
|
+
|
|
183
|
+
# Check if emails were found
|
|
184
|
+
result.emails?
|
|
185
|
+
# => true
|
|
186
|
+
|
|
187
|
+
# Get all emails
|
|
188
|
+
result.emails
|
|
189
|
+
# => ["ralph@example.com"]
|
|
190
|
+
|
|
191
|
+
# Get email mapping
|
|
192
|
+
result.email_mapping
|
|
193
|
+
# => {:EMAIL_1=>"ralph@example.com"}
|
|
194
|
+
|
|
195
|
+
# Similarly for other types
|
|
196
|
+
result.people? # => true
|
|
197
|
+
result.people # => ["Ralph"]
|
|
198
|
+
result.person_mapping # => {:PERSON_1=>"Ralph"}
|
|
199
|
+
|
|
200
|
+
result.phone_numbers? # => true
|
|
201
|
+
result.phone_numbers # => ["555-1234"]
|
|
202
|
+
result.phone_number_mapping # => {:PHONE_NUMBER_1=>"555-1234"}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Available category methods for all default filters:
|
|
206
|
+
|
|
207
|
+
- `emails`, `emails?`, `email_mapping`
|
|
208
|
+
- `credit_cards`, `credit_cards?`, `credit_card_mapping`
|
|
209
|
+
- `phone_numbers`, `phone_numbers?`, `phone_number_mapping`
|
|
210
|
+
- `ssns`, `ssns?`, `ssn_mapping`
|
|
211
|
+
- `people`, `people?`, `person_mapping`
|
|
212
|
+
- `locations`, `locations?`, `location_mapping`
|
|
213
|
+
|
|
214
|
+
Use `categories` to see which types were found in the result:
|
|
215
|
+
|
|
216
|
+
```ruby
|
|
217
|
+
result = TopSecret::Text.filter("Ralph can be reached at ralph@example.com")
|
|
218
|
+
|
|
219
|
+
result.categories
|
|
220
|
+
# => [:email, :person]
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
These methods are always available and return empty arrays/hashes when no sensitive information of that type is found:
|
|
224
|
+
|
|
225
|
+
```ruby
|
|
226
|
+
result = TopSecret::Text.filter("No sensitive data here")
|
|
227
|
+
|
|
228
|
+
result.emails? # => false
|
|
229
|
+
result.emails # => []
|
|
230
|
+
result.email_mapping # => {}
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
When using custom labels, methods are generated based on the label name. Each label is matched exactly — a custom label like `EMAIL_ADDRESS` produces its own set of methods, separate from the default `EMAIL` methods:
|
|
234
|
+
|
|
235
|
+
```ruby
|
|
236
|
+
result = TopSecret::Text.filter(
|
|
237
|
+
"user[at]example.com",
|
|
238
|
+
email_filter: TopSecret::Filters::Regex.new(
|
|
239
|
+
label: "EMAIL_ADDRESS",
|
|
240
|
+
regex: /\w+\[at\]\w+\.\w+/
|
|
241
|
+
)
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
# Methods are derived from the custom label (EMAIL_ADDRESS)
|
|
245
|
+
result.email_addresses # => ["user[at]example.com"]
|
|
246
|
+
result.email_addresses? # => true
|
|
247
|
+
result.email_address_mapping # => {:EMAIL_ADDRESS_1=>"user[at]example.com"}
|
|
248
|
+
|
|
249
|
+
# Default email methods only match the default EMAIL label, not EMAIL_ADDRESS
|
|
250
|
+
result.emails # => []
|
|
251
|
+
result.email_mapping # => {}
|
|
252
|
+
|
|
253
|
+
# If the custom label matches the default, both refer to the same data
|
|
254
|
+
result = TopSecret::Text.filter(
|
|
255
|
+
"user[at]example.com",
|
|
256
|
+
email_filter: TopSecret::Filters::Regex.new(
|
|
257
|
+
label: "EMAIL",
|
|
258
|
+
regex: /\w+\[at\]\w+\.\w+/
|
|
259
|
+
)
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
result.emails # => ["user[at]example.com"]
|
|
263
|
+
result.email_mapping # => {:EMAIL_1=>"user[at]example.com"}
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
When a custom label ends in `_MAPPING` (e.g., `NETWORK_MAPPING`), the mapping method appends `_mapping` to the pluralized form to keep the naming pattern consistent:
|
|
267
|
+
|
|
268
|
+
```ruby
|
|
269
|
+
result.network_mappings # => ["10.0.1.0/24 -> 192.168.1.0/24"]
|
|
270
|
+
result.network_mappings? # => true
|
|
271
|
+
result.network_mappings_mapping # => {:NETWORK_MAPPING_1=>"10.0.1.0/24 -> 192.168.1.0/24"}
|
|
272
|
+
```
|
|
273
|
+
|
|
178
274
|
### Scanning for Sensitive Information
|
|
179
275
|
|
|
180
276
|
Use `TopSecret::Text.scan` to detect sensitive information without redacting the text. This is useful when you only need to check if sensitive data exists or get a mapping of what was found:
|
|
@@ -455,6 +551,20 @@ TopSecret::Text.filter("some text", invalid_filter: some_filter)
|
|
|
455
551
|
|
|
456
552
|
### Custom Filters
|
|
457
553
|
|
|
554
|
+
> [!IMPORTANT]
|
|
555
|
+
> Custom filter labels must follow these rules:
|
|
556
|
+
>
|
|
557
|
+
> - Start with a letter (a-z, A-Z)
|
|
558
|
+
> - End with a letter (a-z, A-Z)
|
|
559
|
+
> - Contain only letters and single underscores (no consecutive underscores)
|
|
560
|
+
> - Cannot contain digits or special characters
|
|
561
|
+
>
|
|
562
|
+
> Valid examples: `EMAIL`, `IP_ADDRESS`, `CREDIT_CARD`
|
|
563
|
+
>
|
|
564
|
+
> Invalid examples: `_EMAIL` (starts with underscore), `EMAIL_` (ends with underscore), `EMAIL1` (ends with digit), `EMAIL__ADDRESS` (consecutive underscores)
|
|
565
|
+
>
|
|
566
|
+
> The system automatically appends `_N` where N is the sequence number (e.g., `EMAIL_ADDRESS` becomes `EMAIL_ADDRESS_1`, `EMAIL_ADDRESS_2`, etc.)
|
|
567
|
+
|
|
458
568
|
#### Adding new [Regex filters][]
|
|
459
569
|
|
|
460
570
|
```ruby
|
|
@@ -568,6 +678,16 @@ TopSecret.configure do |config|
|
|
|
568
678
|
end
|
|
569
679
|
```
|
|
570
680
|
|
|
681
|
+
### Model caching
|
|
682
|
+
|
|
683
|
+
The MITIE NER model is automatically cached after the first initialization to avoid expensive reloading. All `TopSecret::Text` instances share the same cached model, significantly improving performance.
|
|
684
|
+
|
|
685
|
+
If you need to clear the cache (e.g., after changing the model path), use:
|
|
686
|
+
|
|
687
|
+
```ruby
|
|
688
|
+
TopSecret::Text.clear_model_cache!
|
|
689
|
+
```
|
|
690
|
+
|
|
571
691
|
### Disabling NER filtering
|
|
572
692
|
|
|
573
693
|
For improved performance or when the MITIE model file cannot be deployed, you can disable NER-based filtering entirely. This will disable people and location detection but retain all regex-based filters (credit cards, emails, phone numbers, SSNs):
|
|
@@ -624,6 +744,39 @@ TopSecret.configure do |config|
|
|
|
624
744
|
end
|
|
625
745
|
```
|
|
626
746
|
|
|
747
|
+
## Production
|
|
748
|
+
|
|
749
|
+
### Deploying the Model File
|
|
750
|
+
|
|
751
|
+
Due to the large size of `ner_model.dat`, you'll want to avoid committing it to version control. Use [Trove][] to handle deploying the model file in production:
|
|
752
|
+
|
|
753
|
+
```ruby
|
|
754
|
+
# Rakefile
|
|
755
|
+
Rake::Task["assets:precompile"].enhance do
|
|
756
|
+
Trove.pull
|
|
757
|
+
end
|
|
758
|
+
```
|
|
759
|
+
|
|
760
|
+
This ensures the model file is downloaded during deployment.
|
|
761
|
+
|
|
762
|
+
### Eager-Loading the Model
|
|
763
|
+
|
|
764
|
+
To avoid cold-start performance issues, consider eager-loading the model after initialization. This must happen after initialization because the `.dat` file needs to exist first:
|
|
765
|
+
|
|
766
|
+
```ruby
|
|
767
|
+
# config/application.rb
|
|
768
|
+
|
|
769
|
+
config.after_initialize do
|
|
770
|
+
if TopSecret.model_path && File.exist?(TopSecret.model_path)
|
|
771
|
+
TopSecret::Text.shared_model
|
|
772
|
+
end
|
|
773
|
+
end
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
This pre-loads the MITIE model into the shared cache, ensuring the first request doesn't incur the model loading overhead.
|
|
777
|
+
|
|
778
|
+
[Trove]: https://github.com/ankane/trove
|
|
779
|
+
|
|
627
780
|
## Development
|
|
628
781
|
|
|
629
782
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/core_ext/string/inflections"
|
|
4
|
+
|
|
5
|
+
module TopSecret
|
|
6
|
+
# Represents a category of sensitive information (e.g., email, person, credit_card).
|
|
7
|
+
#
|
|
8
|
+
# Each category derives a set of method names from its type and can resolve
|
|
9
|
+
# those methods against a mapping to return filtered results.
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# category = TopSecret::Category.new(:email)
|
|
13
|
+
# category.plural # => :emails
|
|
14
|
+
# category.predicate # => :emails?
|
|
15
|
+
# category.mapping_method # => :email_mapping
|
|
16
|
+
class Category
|
|
17
|
+
MAPPING_SUFFIX = "_mapping"
|
|
18
|
+
|
|
19
|
+
# @return [String] the category type (e.g., "email", "credit_card")
|
|
20
|
+
attr_reader :type
|
|
21
|
+
|
|
22
|
+
# Builds categories from a mapping's keys and a list of filters.
|
|
23
|
+
#
|
|
24
|
+
# @param mapping [Hash] the label-to-value mapping (e.g., +{EMAIL_1: "ralph@example.com"}+)
|
|
25
|
+
# @param filters [Array<TopSecret::Filters::Regex, TopSecret::Filters::NER>] active filters
|
|
26
|
+
# @return [Array<Category>] unique categories derived from the mapping and filters
|
|
27
|
+
def self.from(mapping:, filters:)
|
|
28
|
+
types_from_mapping = mapping.keys.map { |key| type_from_key(key).downcase }
|
|
29
|
+
|
|
30
|
+
types_from_filters = filters.map { |filter| filter.label.downcase }
|
|
31
|
+
|
|
32
|
+
(types_from_mapping + types_from_filters).uniq.map { |type| new(type) }
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Extracts the label type from a key symbol.
|
|
36
|
+
#
|
|
37
|
+
# @param key [Symbol] a label key (e.g., :EMAIL_1, :CREDIT_CARD_2)
|
|
38
|
+
# @return [String] the label type (e.g., "EMAIL", "CREDIT_CARD")
|
|
39
|
+
def self.type_from_key(key)
|
|
40
|
+
key.to_s.rpartition(TopSecret::LABEL_DELIMITER).first
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# @param type [String, Symbol] the category type
|
|
44
|
+
def initialize(type)
|
|
45
|
+
@type = type.to_s
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Whether this category recognizes the given method name.
|
|
49
|
+
#
|
|
50
|
+
# @param method_name [Symbol] the method name to check
|
|
51
|
+
# @return [Boolean]
|
|
52
|
+
def respond_to_method?(method_name)
|
|
53
|
+
method_names.include?(method_name)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# @return [Symbol] the pluralized type (e.g., +:emails+)
|
|
57
|
+
def plural
|
|
58
|
+
@type.pluralize.to_sym
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# @return [Symbol] the predicate method name (e.g., +:emails?+)
|
|
62
|
+
def predicate
|
|
63
|
+
:"#{plural}?"
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# @return [Symbol] the mapping method name (e.g., +:email_mapping+)
|
|
67
|
+
def mapping_method
|
|
68
|
+
if @type.end_with?(MAPPING_SUFFIX)
|
|
69
|
+
:"#{@type.pluralize}#{MAPPING_SUFFIX}"
|
|
70
|
+
else
|
|
71
|
+
:"#{@type}#{MAPPING_SUFFIX}"
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# @return [Symbol] the mapping predicate method name (e.g., +:email_mapping?+)
|
|
76
|
+
def mapping_predicate
|
|
77
|
+
:"#{mapping_method}?"
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Whether the mapping contains any keys belonging to this category.
|
|
81
|
+
#
|
|
82
|
+
# @param mapping [Hash] the label-to-value mapping
|
|
83
|
+
# @return [Boolean]
|
|
84
|
+
def matches?(mapping)
|
|
85
|
+
mapping.any? { |key, _| key.to_s.match?(key_pattern) }
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Resolves a method name against the mapping, returning the appropriate result.
|
|
89
|
+
#
|
|
90
|
+
# @param method_name [Symbol] one of {#plural}, {#predicate}, {#mapping_method}, or {#mapping_predicate}
|
|
91
|
+
# @param mapping [Hash] the label-to-value mapping
|
|
92
|
+
# @return [Hash, Array, Boolean] filtered mapping, values, or boolean depending on the method
|
|
93
|
+
# @raise [ArgumentError] if the method name is not recognized
|
|
94
|
+
def resolve(method_name, mapping)
|
|
95
|
+
filtered = filter_mapping(mapping)
|
|
96
|
+
|
|
97
|
+
case method_name
|
|
98
|
+
when mapping_method then filtered
|
|
99
|
+
when plural then filtered.values
|
|
100
|
+
when predicate, mapping_predicate then filtered.any?
|
|
101
|
+
else
|
|
102
|
+
raise ArgumentError, "#{method_name} is not a recognized method for category '#{@type}'"
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
private
|
|
107
|
+
|
|
108
|
+
def method_names
|
|
109
|
+
@method_names ||= Set[plural, predicate, mapping_method, mapping_predicate].freeze
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def key_pattern
|
|
113
|
+
@key_pattern ||= /\A#{Regexp.escape(@type.upcase)}#{Regexp.escape(TopSecret::LABEL_DELIMITER)}\d+\z/
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def filter_mapping(mapping)
|
|
117
|
+
mapping.select { |key, _| key.to_s.match?(key_pattern) }
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
end
|
data/lib/top_secret/constants.rb
CHANGED
data/lib/top_secret/error.rb
CHANGED
data/lib/top_secret/mapping.rb
CHANGED
|
@@ -1,6 +1,34 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module TopSecret
|
|
4
|
+
# Provides dynamic category methods for querying sensitive information by type.
|
|
5
|
+
#
|
|
6
|
+
# This module automatically generates methods for accessing sensitive information
|
|
7
|
+
# organized by category (emails, credit cards, people, etc.). Methods are available
|
|
8
|
+
# for all default filter types and any custom labels used in the mapping.
|
|
9
|
+
#
|
|
10
|
+
# @example Querying emails
|
|
11
|
+
# result = TopSecret::Text.filter("Contact ralph@example.com")
|
|
12
|
+
# result.emails? # => true
|
|
13
|
+
# result.emails # => ["ralph@example.com"]
|
|
14
|
+
# result.email_mapping # => {:EMAIL_1=>"ralph@example.com"}
|
|
15
|
+
#
|
|
16
|
+
# @example With no matches
|
|
17
|
+
# result = TopSecret::Text.filter("No sensitive data")
|
|
18
|
+
# result.emails? # => false
|
|
19
|
+
# result.emails # => []
|
|
20
|
+
# result.email_mapping # => {}
|
|
21
|
+
#
|
|
22
|
+
# @example Custom labels
|
|
23
|
+
# result = TopSecret::Text.filter(
|
|
24
|
+
# "user[at]example.com",
|
|
25
|
+
# email_filter: TopSecret::Filters::Regex.new(
|
|
26
|
+
# label: "EMAIL_ADDRESS",
|
|
27
|
+
# regex: /\w+\[at\]\w+\.\w+/
|
|
28
|
+
# )
|
|
29
|
+
# )
|
|
30
|
+
# result.email_addresses # => ["user[at]example.com"]
|
|
31
|
+
# result.email_address_mapping # => {:EMAIL_ADDRESS_1=>"user[at]example.com"}
|
|
4
32
|
module Mapping
|
|
5
33
|
# @return [Boolean] Whether sensitive information was found
|
|
6
34
|
def sensitive?
|
|
@@ -11,5 +39,49 @@ module TopSecret
|
|
|
11
39
|
def safe?
|
|
12
40
|
!sensitive?
|
|
13
41
|
end
|
|
42
|
+
|
|
43
|
+
def method_missing(method_name, *args, &block)
|
|
44
|
+
category = category_for(method_name)
|
|
45
|
+
|
|
46
|
+
if category
|
|
47
|
+
result = category.resolve(method_name, mapping)
|
|
48
|
+
define_singleton_method(method_name) { result }
|
|
49
|
+
result
|
|
50
|
+
else
|
|
51
|
+
super
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def respond_to_missing?(method_name, include_private = false)
|
|
56
|
+
category_for(method_name) || super
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Returns categories that have matches in the mapping.
|
|
60
|
+
#
|
|
61
|
+
# @return [Array<Symbol>] List of categories with matches
|
|
62
|
+
def categories
|
|
63
|
+
@categories ||= category_objects.select { |c| c.matches?(mapping) }.map { |c| c.type.to_sym }
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
def category_objects
|
|
69
|
+
@category_objects ||= Category.from(mapping:, filters: default_filters)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def category_for(method_name)
|
|
73
|
+
category_objects.find { |c| c.respond_to_method?(method_name) }
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def default_filters
|
|
77
|
+
@default_filters ||= [
|
|
78
|
+
TopSecret.credit_card_filter,
|
|
79
|
+
TopSecret.email_filter,
|
|
80
|
+
TopSecret.phone_number_filter,
|
|
81
|
+
TopSecret.ssn_filter,
|
|
82
|
+
TopSecret.people_filter,
|
|
83
|
+
TopSecret.location_filter
|
|
84
|
+
].compact
|
|
85
|
+
end
|
|
14
86
|
end
|
|
15
87
|
end
|
|
@@ -16,7 +16,7 @@ module TopSecret
|
|
|
16
16
|
# Creates a new GlobalMapping instance
|
|
17
17
|
def initialize
|
|
18
18
|
@mapping = {}
|
|
19
|
-
@label_counters = {}
|
|
19
|
+
@sequence = LabelSequence.new
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
# Builds the global mapping by processing all individual results
|
|
@@ -32,7 +32,7 @@ module TopSecret
|
|
|
32
32
|
private
|
|
33
33
|
|
|
34
34
|
attr_reader :mapping
|
|
35
|
-
attr_reader :label_counters
|
|
35
|
+
attr_reader :sequence
|
|
36
36
|
|
|
37
37
|
# Processes a single result, adding new values to the global mapping
|
|
38
38
|
#
|
|
@@ -41,23 +41,10 @@ module TopSecret
|
|
|
41
41
|
result.mapping.each do |individual_key, value|
|
|
42
42
|
next if mapping.key?(value)
|
|
43
43
|
|
|
44
|
-
|
|
44
|
+
label_type = Category.type_from_key(individual_key)
|
|
45
|
+
mapping[value] = sequence.next_label(label_type)
|
|
45
46
|
end
|
|
46
47
|
end
|
|
47
|
-
|
|
48
|
-
# Generates a consistent global key for a given individual key
|
|
49
|
-
#
|
|
50
|
-
# @param individual_key [Symbol] The individual key from a filter result
|
|
51
|
-
# @return [Symbol] The global key with consistent numbering
|
|
52
|
-
def generate_global_key(individual_key)
|
|
53
|
-
# TODO: This assumes labels are formatted consistently.
|
|
54
|
-
# We need to account for the following for the case where a label could begin with an "_"
|
|
55
|
-
label_type = individual_key.to_s.rpartition("_").first
|
|
56
|
-
|
|
57
|
-
label_counters[label_type] ||= 0
|
|
58
|
-
label_counters[label_type] += 1
|
|
59
|
-
:"#{label_type}_#{label_counters[label_type]}"
|
|
60
|
-
end
|
|
61
48
|
end
|
|
62
49
|
end
|
|
63
50
|
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TopSecret
|
|
4
|
+
class Text
|
|
5
|
+
# Generates unique, sequenced label symbols for each label type.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# sequence = TopSecret::Text::LabelSequence.new
|
|
9
|
+
# sequence.next_label("EMAIL") # => :EMAIL_1
|
|
10
|
+
# sequence.next_label("EMAIL") # => :EMAIL_2
|
|
11
|
+
# sequence.next_label("PERSON") # => :PERSON_1
|
|
12
|
+
class LabelSequence
|
|
13
|
+
# Creates a new LabelSequence instance with all counters at zero.
|
|
14
|
+
def initialize
|
|
15
|
+
@counters = Hash.new(0)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Returns the next sequenced label for the given label type.
|
|
19
|
+
#
|
|
20
|
+
# @param label_type [String] the label type (e.g., "EMAIL", "CREDIT_CARD")
|
|
21
|
+
# @return [Symbol] the sequenced label (e.g., :EMAIL_1, :EMAIL_2)
|
|
22
|
+
def next_label(label_type)
|
|
23
|
+
@counters[label_type] += 1
|
|
24
|
+
:"#{label_type}#{TopSecret::LABEL_DELIMITER}#{@counters[label_type]}"
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
data/lib/top_secret/text.rb
CHANGED
|
@@ -1,15 +1,47 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "active_support/core_ext/hash/keys"
|
|
4
|
+
require "active_support/core_ext/object/blank"
|
|
4
5
|
require_relative "null_model"
|
|
5
6
|
require_relative "text/result"
|
|
6
7
|
require_relative "text/batch_result"
|
|
7
8
|
require_relative "text/scan_result"
|
|
8
9
|
require_relative "text/global_mapping"
|
|
10
|
+
require_relative "text/label_sequence"
|
|
9
11
|
|
|
10
12
|
module TopSecret
|
|
11
13
|
# Processes text to identify and redact sensitive information using configured filters.
|
|
12
14
|
class Text
|
|
15
|
+
@mutex = Mutex.new
|
|
16
|
+
|
|
17
|
+
class << self
|
|
18
|
+
# Returns a cached MITIE model instance to avoid expensive reinitialization
|
|
19
|
+
#
|
|
20
|
+
# @return [Mitie::NER, NullModel] The cached model instance
|
|
21
|
+
def shared_model
|
|
22
|
+
return @shared_model if @shared_model
|
|
23
|
+
|
|
24
|
+
@mutex.synchronize do
|
|
25
|
+
return @shared_model if @shared_model
|
|
26
|
+
|
|
27
|
+
@shared_model = if TopSecret.model_path
|
|
28
|
+
Mitie::NER.new(TopSecret.model_path)
|
|
29
|
+
else
|
|
30
|
+
NullModel.new
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Clears the cached model, forcing reinitialization on next access
|
|
36
|
+
#
|
|
37
|
+
# @return [void]
|
|
38
|
+
def clear_model_cache!
|
|
39
|
+
@mutex.synchronize do
|
|
40
|
+
@shared_model = nil
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
13
45
|
# @param input [String] The original text to be filtered
|
|
14
46
|
# @param filters [Hash, nil] Optional set of filters to override the defaults
|
|
15
47
|
# @param custom_filters [Array] Additional custom filters to apply
|
|
@@ -170,6 +202,8 @@ module TopSecret
|
|
|
170
202
|
# @param label [String] Label identifying the filter type
|
|
171
203
|
# @return [void]
|
|
172
204
|
def build_mapping(values, label:)
|
|
205
|
+
validate_label! label
|
|
206
|
+
|
|
173
207
|
values.uniq.each.with_index(1) do |value, index|
|
|
174
208
|
filter = "#{label}_#{index}"
|
|
175
209
|
mapping.merge!({filter.to_sym => value})
|
|
@@ -208,6 +242,40 @@ module TopSecret
|
|
|
208
242
|
merged_filters.assert_valid_keys(*default_filters.keys)
|
|
209
243
|
end
|
|
210
244
|
|
|
245
|
+
# Validates that a label conforms to the required format for redaction placeholders.
|
|
246
|
+
#
|
|
247
|
+
# Labels must meet the following criteria:
|
|
248
|
+
# - Start with a letter (a-z, A-Z)
|
|
249
|
+
# - End with a letter (a-z, A-Z)
|
|
250
|
+
# - Contain only letters and single underscores (no consecutive underscores)
|
|
251
|
+
# - Not be blank or nil
|
|
252
|
+
#
|
|
253
|
+
# Valid labels will have `_N` appended automatically during mapping, where N is the sequence number.
|
|
254
|
+
#
|
|
255
|
+
# @param label [String] The label to validate
|
|
256
|
+
# @return [void]
|
|
257
|
+
# @raise [Error::MalformedLabel] If the label is blank or doesn't meet format requirements
|
|
258
|
+
#
|
|
259
|
+
# @example Valid labels
|
|
260
|
+
# validate_label!("EMAIL") # Valid
|
|
261
|
+
# validate_label!("IP_ADDRESS") # Valid
|
|
262
|
+
# validate_label!("CREDIT_CARD") # Valid
|
|
263
|
+
#
|
|
264
|
+
# @example Invalid labels
|
|
265
|
+
# validate_label!("_EMAIL") # Invalid - starts with underscore
|
|
266
|
+
# validate_label!("EMAIL_") # Invalid - ends with underscore
|
|
267
|
+
# validate_label!("EMAIL1") # Invalid - ends with digit
|
|
268
|
+
# validate_label!("EMAIL__ADDRESS") # Invalid - consecutive underscores
|
|
269
|
+
# validate_label!("EMAIL*ADDRESS") # Invalid - special characters
|
|
270
|
+
# validate_label!("") # Invalid - blank
|
|
271
|
+
def validate_label!(label)
|
|
272
|
+
raise Error::MalformedLabel, "You must provide a label." if label.blank?
|
|
273
|
+
|
|
274
|
+
unless label.match?(/\A[a-zA-Z]+(_[a-zA-Z]+)*\z/)
|
|
275
|
+
raise Error::MalformedLabel, "Unsupported label. Labels must contain only letters and underscores: '#{label}'"
|
|
276
|
+
end
|
|
277
|
+
end
|
|
278
|
+
|
|
211
279
|
# Returns the default filters configuration hash
|
|
212
280
|
#
|
|
213
281
|
# @return [Hash] Hash containing all configured default filters, keyed by filter name
|
|
@@ -224,15 +292,11 @@ module TopSecret
|
|
|
224
292
|
end
|
|
225
293
|
|
|
226
294
|
# Creates the default model based on configuration.
|
|
227
|
-
# Returns
|
|
295
|
+
# Returns the cached shared model to avoid expensive reinitialization.
|
|
228
296
|
#
|
|
229
297
|
# @return [Mitie::NER, NullModel] The model instance to use for NER processing
|
|
230
298
|
def default_model
|
|
231
|
-
|
|
232
|
-
Mitie::NER.new(TopSecret.model_path)
|
|
233
|
-
else
|
|
234
|
-
NullModel.new
|
|
235
|
-
end
|
|
299
|
+
Text.shared_model
|
|
236
300
|
end
|
|
237
301
|
end
|
|
238
302
|
end
|
data/lib/top_secret/version.rb
CHANGED
data/lib/top_secret.rb
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
# dependencies
|
|
4
|
-
require "active_support/configurable"
|
|
5
|
-
require "active_support/ordered_options"
|
|
4
|
+
require "active_support/core_ext/module/attribute_accessors"
|
|
6
5
|
require "mitie"
|
|
7
6
|
|
|
8
7
|
# modules
|
|
9
8
|
require_relative "top_secret/version"
|
|
10
9
|
require_relative "top_secret/constants"
|
|
10
|
+
require_relative "top_secret/category"
|
|
11
11
|
require_relative "top_secret/mapping"
|
|
12
12
|
require_relative "top_secret/filters/ner"
|
|
13
13
|
require_relative "top_secret/filters/regex"
|
|
@@ -44,17 +44,21 @@ require_relative "top_secret/filtered_text"
|
|
|
44
44
|
# @!attribute [rw] location_filter
|
|
45
45
|
# @return [TopSecret::Filters::NER] filter for location names
|
|
46
46
|
module TopSecret
|
|
47
|
-
|
|
47
|
+
mattr_accessor :model_path, default: MODEL_PATH
|
|
48
|
+
mattr_accessor :min_confidence_score, default: MIN_CONFIDENCE_SCORE
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
config_accessor :min_confidence_score, default: MIN_CONFIDENCE_SCORE
|
|
50
|
+
mattr_accessor :custom_filters, default: []
|
|
51
51
|
|
|
52
|
-
|
|
52
|
+
mattr_accessor :credit_card_filter, default: TopSecret::Filters::Regex.new(label: "CREDIT_CARD", regex: CREDIT_CARD_REGEX)
|
|
53
|
+
mattr_accessor :email_filter, default: TopSecret::Filters::Regex.new(label: "EMAIL", regex: EMAIL_REGEX)
|
|
54
|
+
mattr_accessor :phone_number_filter, default: TopSecret::Filters::Regex.new(label: "PHONE_NUMBER", regex: PHONE_REGEX)
|
|
55
|
+
mattr_accessor :ssn_filter, default: TopSecret::Filters::Regex.new(label: "SSN", regex: SSN_REGEX)
|
|
56
|
+
mattr_accessor :people_filter, default: TopSecret::Filters::NER.new(label: "PERSON", tag: :person)
|
|
57
|
+
mattr_accessor :location_filter, default: TopSecret::Filters::NER.new(label: "LOCATION", tag: :location)
|
|
53
58
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
config_accessor :location_filter, default: TopSecret::Filters::NER.new(label: "LOCATION", tag: :location)
|
|
59
|
+
class << self
|
|
60
|
+
def configure
|
|
61
|
+
yield self
|
|
62
|
+
end
|
|
63
|
+
end
|
|
60
64
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: top_secret
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.0
|
|
4
|
+
version: 1.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steve Polito
|
|
@@ -58,6 +58,7 @@ files:
|
|
|
58
58
|
- README.md
|
|
59
59
|
- Rakefile
|
|
60
60
|
- lib/top_secret.rb
|
|
61
|
+
- lib/top_secret/category.rb
|
|
61
62
|
- lib/top_secret/constants.rb
|
|
62
63
|
- lib/top_secret/error.rb
|
|
63
64
|
- lib/top_secret/filtered_text.rb
|
|
@@ -69,6 +70,7 @@ files:
|
|
|
69
70
|
- lib/top_secret/text.rb
|
|
70
71
|
- lib/top_secret/text/batch_result.rb
|
|
71
72
|
- lib/top_secret/text/global_mapping.rb
|
|
73
|
+
- lib/top_secret/text/label_sequence.rb
|
|
72
74
|
- lib/top_secret/text/result.rb
|
|
73
75
|
- lib/top_secret/text/scan_result.rb
|
|
74
76
|
- lib/top_secret/version.rb
|
|
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
94
96
|
- !ruby/object:Gem::Version
|
|
95
97
|
version: '0'
|
|
96
98
|
requirements: []
|
|
97
|
-
rubygems_version: 3.
|
|
99
|
+
rubygems_version: 3.7.2
|
|
98
100
|
specification_version: 4
|
|
99
101
|
summary: Filter sensitive information from free text.
|
|
100
102
|
test_files: []
|