data_redactor 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -1
- data/ext/data_redactor/custom_patterns.c +123 -0
- data/ext/data_redactor/custom_patterns.h +25 -0
- data/ext/data_redactor/data_redactor.c +26 -1001
- data/ext/data_redactor/extconf.rb +4 -0
- data/ext/data_redactor/patterns.c +455 -0
- data/ext/data_redactor/patterns.h +16 -0
- data/ext/data_redactor/placeholder.c +54 -0
- data/ext/data_redactor/placeholder.h +30 -0
- data/ext/data_redactor/redact.c +160 -0
- data/ext/data_redactor/redact.h +35 -0
- data/ext/data_redactor/scan.c +131 -0
- data/ext/data_redactor/scan.h +12 -0
- data/ext/data_redactor/tags.h +24 -0
- data/lib/data_redactor/integrations/logger.rb +42 -0
- data/lib/data_redactor/integrations/rack.rb +121 -0
- data/lib/data_redactor/integrations/rails.rb +38 -0
- data/lib/data_redactor/version.rb +2 -1
- data/lib/data_redactor.rb +247 -45
- data/readme.md +110 -24
- metadata +48 -5
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#include "redact.h"
|
|
2
|
+
#include "patterns.h"
|
|
3
|
+
#include "placeholder.h"
|
|
4
|
+
#include "custom_patterns.h"
|
|
5
|
+
#include <string.h>
|
|
6
|
+
#include <stdlib.h>
|
|
7
|
+
#include <stdio.h>
|
|
8
|
+
|
|
9
|
+
/*
 * Build the boundary-wrapped form of a regex fragment:
 *     (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$)
 *
 * core: NUL-terminated pattern text embedded as the middle group.
 *
 * Returns a newly malloc'd NUL-terminated string that the caller must free,
 * or NULL when `core` is NULL or the allocation fails.
 */
char *wrap_boundary(const char *core) {
    static const char prefix[] = "(^|[^0-9A-Za-z])(";
    static const char suffix[] = ")([^0-9A-Za-z]|$)";

    /* strlen(NULL) is undefined behaviour; report it like an allocation failure. */
    if (!core) return NULL;

    const size_t prefix_len = sizeof prefix - 1;
    const size_t suffix_len = sizeof suffix - 1;
    const size_t core_len = strlen(core);

    char *buf = malloc(prefix_len + core_len + suffix_len + 1);
    if (!buf) return NULL;

    memcpy(buf, prefix, prefix_len);
    memcpy(buf + prefix_len, core, core_len);
    /* suffix_len + 1 also copies the terminating NUL. */
    memcpy(buf + prefix_len + core_len, suffix, suffix_len + 1);
    return buf;
}
|
|
18
|
+
|
|
19
|
+
/*
|
|
20
|
+
* Replace all occurrences of a compiled pattern in `input` with PLACEHOLDER.
|
|
21
|
+
*
|
|
22
|
+
* If `use_boundary` is non-zero the pattern was compiled as:
|
|
23
|
+
* (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$)
|
|
24
|
+
* groups: [0]=full match [1]=left boundary [2]=CORE [3]=right boundary
|
|
25
|
+
* We pass nmatch=4 so the engine fills all four slots, then use matches[1].rm_eo
|
|
26
|
+
* and matches[3].rm_so to locate the exact CORE span. The boundary characters
|
|
27
|
+
* are copied back verbatim so they are not lost.
|
|
28
|
+
*/
|
|
29
|
+
char *replace_all_matches(regex_t *pattern, const char *input,
|
|
30
|
+
int use_boundary, const placeholder_t *ph) {
|
|
31
|
+
size_t ph_max = max_placeholder_len(ph);
|
|
32
|
+
size_t out_cap = strlen(input) * 2 + 512;
|
|
33
|
+
char *output = (char *)malloc(out_cap);
|
|
34
|
+
if (!output) return NULL;
|
|
35
|
+
|
|
36
|
+
char *ph_buf = (char *)malloc(ph_max + 1);
|
|
37
|
+
if (!ph_buf) { free(output); return NULL; }
|
|
38
|
+
|
|
39
|
+
size_t out_len = 0;
|
|
40
|
+
const char *cursor = input;
|
|
41
|
+
regmatch_t matches[4];
|
|
42
|
+
|
|
43
|
+
while (regexec(pattern, cursor, 4, matches, 0) == 0) {
|
|
44
|
+
regoff_t full_so = matches[0].rm_so;
|
|
45
|
+
regoff_t full_eo = matches[0].rm_eo;
|
|
46
|
+
|
|
47
|
+
if (full_so < 0 || full_eo < full_so) break;
|
|
48
|
+
|
|
49
|
+
regoff_t core_so = full_so;
|
|
50
|
+
regoff_t core_eo = full_eo;
|
|
51
|
+
|
|
52
|
+
if (use_boundary) {
|
|
53
|
+
if (matches[1].rm_so >= 0 && matches[1].rm_eo > matches[1].rm_so)
|
|
54
|
+
core_so = matches[1].rm_eo;
|
|
55
|
+
if (matches[3].rm_so >= 0 && matches[3].rm_eo > matches[3].rm_so)
|
|
56
|
+
core_eo = matches[3].rm_so;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
size_t prefix_len = (size_t)core_so;
|
|
60
|
+
size_t suffix_len = (size_t)(full_eo - core_eo);
|
|
61
|
+
size_t match_len = (size_t)(full_eo - full_so);
|
|
62
|
+
size_t core_len = (size_t)(core_eo - core_so);
|
|
63
|
+
|
|
64
|
+
size_t ph_len = write_placeholder(ph_buf, ph, cursor + core_so, core_len);
|
|
65
|
+
|
|
66
|
+
size_t needed = out_len + prefix_len + ph_len + suffix_len + strlen(cursor + full_eo) + 1;
|
|
67
|
+
if (needed > out_cap) {
|
|
68
|
+
out_cap = needed * 2;
|
|
69
|
+
char *tmp = (char *)realloc(output, out_cap);
|
|
70
|
+
if (!tmp) { free(output); free(ph_buf); return NULL; }
|
|
71
|
+
output = tmp;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
memcpy(output + out_len, cursor, prefix_len);
|
|
75
|
+
out_len += prefix_len;
|
|
76
|
+
|
|
77
|
+
memcpy(output + out_len, ph_buf, ph_len);
|
|
78
|
+
out_len += ph_len;
|
|
79
|
+
|
|
80
|
+
if (suffix_len > 0) {
|
|
81
|
+
memcpy(output + out_len, cursor + core_eo, suffix_len);
|
|
82
|
+
out_len += suffix_len;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
cursor += full_eo;
|
|
86
|
+
|
|
87
|
+
if (match_len == 0) {
|
|
88
|
+
if (*cursor) output[out_len++] = *cursor++;
|
|
89
|
+
else break;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
free(ph_buf);
|
|
93
|
+
|
|
94
|
+
size_t tail_len = strlen(cursor);
|
|
95
|
+
size_t needed = out_len + tail_len + 1;
|
|
96
|
+
if (needed > out_cap) {
|
|
97
|
+
out_cap = needed;
|
|
98
|
+
char *tmp = (char *)realloc(output, out_cap);
|
|
99
|
+
if (!tmp) { free(output); return NULL; }
|
|
100
|
+
output = tmp;
|
|
101
|
+
}
|
|
102
|
+
memcpy(output + out_len, cursor, tail_len);
|
|
103
|
+
out_len += tail_len;
|
|
104
|
+
output[out_len] = '\0';
|
|
105
|
+
|
|
106
|
+
return output;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/* Look up the i-th entry of the enable_bits Array. Out-of-bounds → 0 (skip). */
|
|
110
|
+
static inline int enable_bit(VALUE rb_enable_bits, long i) {
|
|
111
|
+
if (i < 0 || i >= RARRAY_LEN(rb_enable_bits)) return 0;
|
|
112
|
+
VALUE v = rb_ary_entry(rb_enable_bits, i);
|
|
113
|
+
return RTEST(v) && NUM2INT(v) != 0;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
117
|
+
VALUE rb_ph_mode, VALUE rb_ph_str,
|
|
118
|
+
VALUE rb_enable_bits) {
|
|
119
|
+
Check_Type(rb_text, T_STRING);
|
|
120
|
+
Check_Type(rb_ph_str, T_STRING);
|
|
121
|
+
Check_Type(rb_enable_bits, T_ARRAY);
|
|
122
|
+
|
|
123
|
+
int ph_mode = NUM2INT(rb_ph_mode);
|
|
124
|
+
const char *ph_str_plain = StringValueCStr(rb_ph_str);
|
|
125
|
+
|
|
126
|
+
const char *input = StringValueCStr(rb_text);
|
|
127
|
+
char *working = strdup(input);
|
|
128
|
+
if (!working) rb_raise(rb_eNoMemError, "strdup failed");
|
|
129
|
+
|
|
130
|
+
placeholder_t ph;
|
|
131
|
+
ph.mode = ph_mode;
|
|
132
|
+
|
|
133
|
+
for (int i = 0; i < NUM_PATTERNS; i++) {
|
|
134
|
+
if (!enable_bit(rb_enable_bits, i)) continue;
|
|
135
|
+
ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
|
|
136
|
+
? ph_str_plain
|
|
137
|
+
: tag_name_for_bit(pattern_tags[i]);
|
|
138
|
+
char *result = replace_all_matches(&compiled_patterns[i], working,
|
|
139
|
+
boundary_wrapped[i], &ph);
|
|
140
|
+
free(working);
|
|
141
|
+
if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed");
|
|
142
|
+
working = result;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
for (int i = 0; i < custom_count; i++) {
|
|
146
|
+
if (!enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
147
|
+
ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
|
|
148
|
+
? ph_str_plain
|
|
149
|
+
: tag_name_for_bit(custom_patterns[i].tag);
|
|
150
|
+
char *result = replace_all_matches(&custom_patterns[i].compiled, working,
|
|
151
|
+
custom_patterns[i].boundary, &ph);
|
|
152
|
+
free(working);
|
|
153
|
+
if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed (custom)");
|
|
154
|
+
working = result;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
VALUE rb_result = rb_str_new_cstr(working);
|
|
158
|
+
free(working);
|
|
159
|
+
return rb_result;
|
|
160
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#ifndef DATA_REDACTOR_REDACT_H
#define DATA_REDACTOR_REDACT_H

#include <ruby.h>
#include <regex.h>
#include "placeholder.h"

/*
 * Build a boundary-wrapped version of a pattern:
 *   (^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)
 * Returns a newly malloc'd string, or NULL if the allocation fails.
 * Caller must free the returned string.
 */
char *wrap_boundary(const char *core);

/*
 * Replace all occurrences of a compiled pattern in `input` with the placeholder.
 * `use_boundary` is non-zero when `pattern` was compiled from wrap_boundary()
 * output; the boundary characters around each match are then kept.
 * Returns a newly malloc'd string (caller must free), or NULL on allocation failure.
 */
char *replace_all_matches(regex_t *pattern, const char *input,
                          int use_boundary, const placeholder_t *ph);

/*
 * DataRedactor._redact(text, ph_mode, ph_str, enable_bits) -> String
 *
 * `enable_bits` is an Array of Integers, one per pattern index. The first
 * NUM_PATTERNS entries cover built-ins; any extra entries cover currently
 * registered custom patterns in registration order. A 0 entry skips that
 * pattern entirely; non-zero runs it. The Ruby layer compiles `only:` /
 * `except:` (mixed Symbol+String) into this array.
 */
VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
                              VALUE rb_ph_mode, VALUE rb_ph_str,
                              VALUE rb_enable_bits);

#endif
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
#include "scan.h"
|
|
2
|
+
#include "patterns.h"
|
|
3
|
+
#include "placeholder.h"
|
|
4
|
+
#include "custom_patterns.h"
|
|
5
|
+
#include "redact.h"
|
|
6
|
+
#include <regex.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
#include <stdlib.h>
|
|
9
|
+
|
|
10
|
+
/*
|
|
11
|
+
* To map working-buffer positions back to original-string positions we
|
|
12
|
+
* maintain a log of every replacement already applied. Each entry records
|
|
13
|
+
* where in the *working* buffer the replacement started (after all prior
|
|
14
|
+
* replacements) and how many bytes were removed (orig_len) vs. inserted
|
|
15
|
+
* (always 10, the length of "[REDACTED]").
|
|
16
|
+
*
|
|
17
|
+
* For a new match at working position W:
|
|
18
|
+
* cumulative_shift_before_W = sum of (10 - orig_len) for all prior
|
|
19
|
+
* replacements whose working_pos <= W
|
|
20
|
+
* original_pos = W - cumulative_shift_before_W
|
|
21
|
+
*/
|
|
22
|
+
/* Look up the i-th entry of the enable_bits Array. Out-of-bounds → 0 (skip). */
|
|
23
|
+
static inline int scan_enable_bit(VALUE rb_enable_bits, long i) {
|
|
24
|
+
if (i < 0 || i >= RARRAY_LEN(rb_enable_bits)) return 0;
|
|
25
|
+
VALUE v = rb_ary_entry(rb_enable_bits, i);
|
|
26
|
+
return RTEST(v) && NUM2INT(v) != 0;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
|
|
30
|
+
Check_Type(rb_text, T_STRING);
|
|
31
|
+
Check_Type(rb_enable_bits, T_ARRAY);
|
|
32
|
+
|
|
33
|
+
const char *input = StringValueCStr(rb_text);
|
|
34
|
+
|
|
35
|
+
static const placeholder_t ph_default = { PLACEHOLDER_MODE_PLAIN, "[REDACTED]" };
|
|
36
|
+
|
|
37
|
+
char *working = strdup(input);
|
|
38
|
+
if (!working) rb_raise(rb_eNoMemError, "strdup failed");
|
|
39
|
+
|
|
40
|
+
VALUE matches_arr = rb_ary_new();
|
|
41
|
+
|
|
42
|
+
typedef struct { long wpos; long orig_len; } repl_t;
|
|
43
|
+
repl_t *repl_log = NULL;
|
|
44
|
+
int repl_count = 0;
|
|
45
|
+
int repl_cap = 0;
|
|
46
|
+
|
|
47
|
+
#define REPL_LOG_PUSH(_wpos, _olen) do { \
|
|
48
|
+
if (repl_count >= repl_cap) { \
|
|
49
|
+
int _nc = repl_cap == 0 ? 16 : repl_cap * 2; \
|
|
50
|
+
repl_t *_t = (repl_t *)realloc(repl_log, sizeof(repl_t) * _nc); \
|
|
51
|
+
if (!_t) { free(repl_log); free(working); rb_raise(rb_eNoMemError, "repl_log"); } \
|
|
52
|
+
repl_log = _t; repl_cap = _nc; \
|
|
53
|
+
} \
|
|
54
|
+
repl_log[repl_count].wpos = (_wpos); \
|
|
55
|
+
repl_log[repl_count].orig_len = (_olen); \
|
|
56
|
+
repl_count++; \
|
|
57
|
+
} while (0)
|
|
58
|
+
|
|
59
|
+
#define WORKING_TO_ORIG(_wpos) ({ \
|
|
60
|
+
long _shift = 0; \
|
|
61
|
+
for (int _ri = 0; _ri < repl_count; _ri++) { \
|
|
62
|
+
if (repl_log[_ri].wpos <= (_wpos)) \
|
|
63
|
+
_shift += 10 - repl_log[_ri].orig_len; \
|
|
64
|
+
} \
|
|
65
|
+
(_wpos) - _shift; \
|
|
66
|
+
})
|
|
67
|
+
|
|
68
|
+
#define COLLECT_AND_REPLACE(pat, use_bnd, tag_bit, pat_name) do { \
|
|
69
|
+
const char *_cur = working; \
|
|
70
|
+
regmatch_t _m[4]; \
|
|
71
|
+
while (regexec((pat), _cur, 4, _m, 0) == 0) { \
|
|
72
|
+
regoff_t _fso = _m[0].rm_so, _feo = _m[0].rm_eo; \
|
|
73
|
+
if (_fso < 0 || _feo < _fso) break; \
|
|
74
|
+
regoff_t _cso = _fso, _ceo = _feo; \
|
|
75
|
+
if (use_bnd) { \
|
|
76
|
+
if (_m[1].rm_so >= 0 && _m[1].rm_eo > _m[1].rm_so) \
|
|
77
|
+
_cso = _m[1].rm_eo; \
|
|
78
|
+
if (_m[3].rm_so >= 0 && _m[3].rm_eo > _m[3].rm_so) \
|
|
79
|
+
_ceo = _m[3].rm_so; \
|
|
80
|
+
} \
|
|
81
|
+
size_t _vlen = (size_t)(_ceo - _cso); \
|
|
82
|
+
long _wpos = (long)(_cur - working) + (long)_cso; \
|
|
83
|
+
long _orig = WORKING_TO_ORIG(_wpos); \
|
|
84
|
+
VALUE _match = rb_hash_new(); \
|
|
85
|
+
rb_hash_aset(_match, ID2SYM(rb_intern("tag")), \
|
|
86
|
+
ID2SYM(rb_intern(tag_name_for_bit(tag_bit)))); \
|
|
87
|
+
rb_hash_aset(_match, ID2SYM(rb_intern("name")), \
|
|
88
|
+
rb_str_new_cstr(pat_name)); \
|
|
89
|
+
rb_hash_aset(_match, ID2SYM(rb_intern("value")), \
|
|
90
|
+
rb_str_new(_cur + _cso, _vlen)); \
|
|
91
|
+
rb_hash_aset(_match, ID2SYM(rb_intern("start")), \
|
|
92
|
+
LONG2NUM(_orig)); \
|
|
93
|
+
rb_hash_aset(_match, ID2SYM(rb_intern("length")), \
|
|
94
|
+
LONG2NUM((long)_vlen)); \
|
|
95
|
+
rb_ary_push(matches_arr, _match); \
|
|
96
|
+
REPL_LOG_PUSH(_wpos, (long)_vlen); \
|
|
97
|
+
if (_feo == _fso) { if (*_cur) _cur++; else break; } \
|
|
98
|
+
else _cur += _feo; \
|
|
99
|
+
} \
|
|
100
|
+
char *_next = replace_all_matches((pat), working, (use_bnd), &ph_default); \
|
|
101
|
+
free(working); \
|
|
102
|
+
if (!_next) { free(repl_log); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); } \
|
|
103
|
+
working = _next; \
|
|
104
|
+
} while (0)
|
|
105
|
+
|
|
106
|
+
for (int i = 0; i < NUM_PATTERNS; i++) {
|
|
107
|
+
if (!scan_enable_bit(rb_enable_bits, i)) continue;
|
|
108
|
+
COLLECT_AND_REPLACE(&compiled_patterns[i], boundary_wrapped[i],
|
|
109
|
+
pattern_tags[i], pattern_names[i]);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
for (int i = 0; i < custom_count; i++) {
|
|
113
|
+
if (!scan_enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
114
|
+
COLLECT_AND_REPLACE(&custom_patterns[i].compiled,
|
|
115
|
+
custom_patterns[i].boundary,
|
|
116
|
+
custom_patterns[i].tag, custom_patterns[i].name);
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
#undef COLLECT_AND_REPLACE
|
|
120
|
+
#undef WORKING_TO_ORIG
|
|
121
|
+
#undef REPL_LOG_PUSH
|
|
122
|
+
|
|
123
|
+
free(repl_log);
|
|
124
|
+
|
|
125
|
+
VALUE result = rb_hash_new();
|
|
126
|
+
VALUE rb_redacted = rb_str_new_cstr(working);
|
|
127
|
+
free(working);
|
|
128
|
+
rb_hash_aset(result, ID2SYM(rb_intern("redacted")), rb_redacted);
|
|
129
|
+
rb_hash_aset(result, ID2SYM(rb_intern("matches")), matches_arr);
|
|
130
|
+
return result;
|
|
131
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#ifndef DATA_REDACTOR_SCAN_H
#define DATA_REDACTOR_SCAN_H

#include <ruby.h>

/*
 * DataRedactor._scan(text, enable_bits) -> { redacted: String, matches: Array<Hash> }
 *   text:        the String to scan.
 *   enable_bits: same per-pattern 0/1 array as _redact.
 * Each Hash in `matches` has the keys :tag, :name, :value, :start and :length.
 */
VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits);

#endif
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#ifndef DATA_REDACTOR_TAGS_H
#define DATA_REDACTOR_TAGS_H

/*
 * Tag bits, one distinct bit per tag; every pattern belongs to exactly one
 * tag. A bitmask built from these restricts which patterns run
 * (only / except). TAG_ALL, the default mask, selects every pattern and
 * matches the historical behaviour of `redact(text)` with no second argument.
 */
#define TAG_CREDENTIALS 0x001
#define TAG_FINANCIAL   0x002
#define TAG_TAX_ID      0x004
#define TAG_NATIONAL_ID 0x008
#define TAG_CONTACT     0x010
#define TAG_NETWORK     0x020
#define TAG_TRAVEL      0x040
#define TAG_OTHER       0x080
#define TAG_CUSTOM      0x100

/* Union of the eight built-in tags (everything except TAG_CUSTOM). */
#define TAG_BUILTIN_ALL \
    (TAG_CREDENTIALS | TAG_FINANCIAL | TAG_TAX_ID | TAG_NATIONAL_ID | \
     TAG_CONTACT | TAG_NETWORK | TAG_TRAVEL | TAG_OTHER)

/* Built-in tags plus custom patterns. */
#define TAG_ALL (TAG_BUILTIN_ALL | TAG_CUSTOM)

#endif
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
require "logger"
|
|
2
|
+
require "data_redactor"
|
|
3
|
+
|
|
4
|
+
module DataRedactor
  module Integrations
    # Logger formatter that runs every log message through {DataRedactor.redact}
    # after an inner formatter has rendered it.
    #
    # @example Drop-in replacement for Ruby's default formatter
    #   logger = Logger.new($stdout)
    #   logger.formatter = DataRedactor::Integrations::Logger.new
    #   logger.info("Auth failed for user alice@example.com")
    #   # => "I, [...] -- : Auth failed for user [REDACTED]"
    #
    # @example Wrapping an existing formatter (e.g. Rails JSON logger)
    #   logger.formatter = DataRedactor::Integrations::Logger.new(
    #     inner: Rails.logger.formatter,
    #     only: [:credentials, :contact]
    #   )
    class Logger
      # @param inner [#call, nil] formatter to wrap. `nil` (which is what
      #   `::Logger#formatter` returns when no formatter was ever assigned)
      #   selects a fresh {::Logger::Formatter}.
      # @param only [Symbol, String, Array, nil] forwarded to {DataRedactor.redact}.
      # @param except [Symbol, String, Array, nil] forwarded to {DataRedactor.redact}.
      # @param placeholder forwarded to {DataRedactor.redact}.
      def initialize(inner: nil, only: nil, except: nil, placeholder: DataRedactor::PLACEHOLDER_DEFAULT)
        # Fall back explicitly: an explicit `inner: nil` would otherwise
        # surface as NoMethodError on the first log line.
        @inner = inner || ::Logger::Formatter.new
        @only = only
        @except = except
        @placeholder = placeholder
      end

      # Formatter contract - called by Logger for every emitted line.
      # The inner formatter renders the message (string, exception, arbitrary
      # object) first; the rendered line is then scrubbed in one pass.
      #
      # @param severity [String]
      # @param time [Time]
      # @param progname [String, nil]
      # @param msg [Object]
      # @return [String] the redacted line
      def call(severity, time, progname, msg)
        line = @inner.call(severity, time, progname, msg)
        DataRedactor.redact(line.to_s, only: @only, except: @except, placeholder: @placeholder)
      end
    end
  end
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
require "data_redactor"
|
|
2
|
+
|
|
3
|
+
module DataRedactor
  module Integrations
    # Rack middleware that scrubs sensitive data from selectable surfaces of
    # the response (and request headers, for downstream loggers to see scrubbed
    # values).
    #
    # @example Both surfaces (default)
    #   use DataRedactor::Integrations::Rack, scrub: [:body, :headers]
    #
    # @example Headers only - leave the response body untouched
    #   use DataRedactor::Integrations::Rack, scrub: [:headers]
    #
    # ### Surfaces
    #
    # - `:body` - buffers the whole enumerable response body, passes it through
    #   {DataRedactor.redact} and returns it as a single chunk. The
    #   `Content-Length` header is dropped because the redacted body may have a
    #   different byte length. Streaming bodies (those that respond to `call`
    #   but not `each`) cannot be buffered and are passed through unredacted.
    # - `:headers` - scrubs the listed sensitive response headers. Sensitive
    #   request headers (`Authorization`, `Cookie`, `X-Api-Key`, etc.) are
    #   redacted in the env hash so any downstream middleware that logs them
    #   sees scrubbed values.
    class Rack
      DEFAULT_SCRUB = [:body, :headers].freeze

      SENSITIVE_REQUEST_HEADERS = %w[
        HTTP_AUTHORIZATION
        HTTP_PROXY_AUTHORIZATION
        HTTP_COOKIE
        HTTP_X_API_KEY
        HTTP_X_AUTH_TOKEN
        HTTP_X_ACCESS_TOKEN
      ].freeze

      SENSITIVE_RESPONSE_HEADERS = %w[
        Set-Cookie
        Authorization
        X-Api-Key
        X-Auth-Token
        X-Access-Token
      ].freeze

      # Lower-cased once so response header names can be compared
      # case-insensitively without rebuilding the list on every request.
      SENSITIVE_RESPONSE_HEADERS_LC = SENSITIVE_RESPONSE_HEADERS.map(&:downcase).freeze

      # @param app [#call] the Rack app
      # @param scrub [Array<Symbol>] which surfaces to redact. Subset of
      #   `[:body, :headers]`. Defaults to `[:body, :headers]`.
      # @param only forwarded to {DataRedactor.redact}
      # @param except forwarded to {DataRedactor.redact}
      # @param placeholder forwarded to {DataRedactor.redact}
      # @raise [ArgumentError] when `scrub` names an unknown surface
      def initialize(app, scrub: DEFAULT_SCRUB, only: nil, except: nil, placeholder: DataRedactor::PLACEHOLDER_DEFAULT)
        @app = app
        @scrub = Array(scrub).map(&:to_sym)
        unknown = @scrub - [:body, :headers]
        unless unknown.empty?
          raise ArgumentError, "unknown scrub surface(s) #{unknown.inspect}; valid: [:body, :headers]"
        end
        @only = only
        @except = except
        @placeholder = placeholder
      end

      # Rack entry point. Request headers are scrubbed before the app runs;
      # response headers and body are scrubbed afterwards.
      #
      # @param env [Hash] the Rack environment (sensitive entries are overwritten)
      # @return [Array] the `[status, headers, body]` triple
      def call(env)
        scrub_request_headers(env) if @scrub.include?(:headers)
        status, headers, body = @app.call(env)
        headers = scrub_response_headers(headers) if @scrub.include?(:headers)
        if @scrub.include?(:body)
          body, headers = wrap_body(body, headers)
        end
        [status, headers, body]
      end

      private

      # Redact one string with the options given to the constructor.
      def redact(s)
        DataRedactor.redact(s, only: @only, except: @except, placeholder: @placeholder)
      end

      # Overwrite non-empty String values of the known sensitive request
      # headers in `env` with their redacted form.
      def scrub_request_headers(env)
        SENSITIVE_REQUEST_HEADERS.each do |key|
          value = env[key]
          env[key] = redact(value) if value.is_a?(String) && !value.empty?
        end
      end

      # Return a new Hash with the sensitive response headers redacted.
      # Rack 3 uses lower-case header names; Rack 2 uses Capitalized, so names
      # are matched case-insensitively against the known list.
      def scrub_response_headers(headers)
        headers.each_with_object({}) do |(key, value), out|
          if SENSITIVE_RESPONSE_HEADERS_LC.include?(key.to_s.downcase)
            out[key] = scrub_header_value(value)
          else
            out[key] = value
          end
        end
      end

      # Redact a header value that is a String or an Array of Strings;
      # anything else is returned unchanged.
      def scrub_header_value(value)
        case value
        when String then redact(value)
        when Array  then value.map { |v| v.is_a?(String) ? redact(v) : v }
        else value
        end
      end

      # Buffer the body, redact it, and return it as a single-element array
      # together with the headers minus Content-Length (the redacted body may
      # differ in byte length; downstream servers will recompute or
      # chunk-encode).
      #
      # @return [Array] `[body, headers]`
      def wrap_body(body, headers)
        # A streaming body only responds to #call and cannot be buffered;
        # hand it back untouched rather than raising NoMethodError.
        return [body, headers] unless body.respond_to?(:each)

        buffered = +""
        begin
          body.each { |chunk| buffered << chunk.to_s }
        ensure
          # Release the body's resources even when iteration raises.
          body.close if body.respond_to?(:close)
        end

        scrubbed = redact(buffered)
        new_headers = headers.reject { |k, _| k.to_s.downcase == "content-length" }
        [[scrubbed], new_headers]
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
require "data_redactor"
|
|
2
|
+
|
|
3
|
+
module DataRedactor
  module Integrations
    # Adapter for Rails' `config.filter_parameters`. {filter} builds a lambda
    # taking `(key, value)`; String values are redacted in place, all other
    # values are left alone.
    #
    # @example
    #   # config/initializers/filter_parameter_logging.rb
    #   require "data_redactor/integrations/rails"
    #   Rails.application.config.filter_parameters += [
    #     DataRedactor::Integrations::Rails.filter
    #   ]
    #
    # @example Restricting to specific tags
    #   Rails.application.config.filter_parameters += [
    #     DataRedactor::Integrations::Rails.filter(only: [:credentials, :financial])
    #   ]
    module Rails
      module_function

      # @param only forwarded to {DataRedactor.redact}
      # @param except forwarded to {DataRedactor.redact}
      # @param placeholder forwarded to {DataRedactor.redact}
      # @return [Proc] a `(key, value)` lambda compatible with `config.filter_parameters`
      def filter(only: nil, except: nil, placeholder: DataRedactor::PLACEHOLDER_DEFAULT)
        redact_options = { only: only, except: except, placeholder: placeholder }

        ->(_key, value) do
          if value.is_a?(String)
            scrubbed = DataRedactor.redact(value, **redact_options)
            # The caller holds the String object, so rebinding the block
            # parameter would be invisible to it; String#replace edits the
            # object itself.
            value.replace(scrubbed) unless scrubbed == value
          end
        end
      end
    end
  end
end
|