data_redactor 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
1
+ #include "redact.h"
2
+ #include "patterns.h"
3
+ #include "placeholder.h"
4
+ #include "custom_patterns.h"
5
+ #include <string.h>
6
+ #include <stdlib.h>
7
+ #include <stdio.h>
8
+
9
/*
 * Build the boundary-wrapped form of a regex source string:
 *
 *     (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$)
 *
 * so that CORE only matches when it is not glued to an alphanumeric
 * character on either side.
 *
 * core: NUL-terminated regex source. A NULL pointer is rejected.
 *
 * Returns a newly malloc'd NUL-terminated string that the caller must free,
 * or NULL if `core` is NULL or the allocation fails.
 */
char *wrap_boundary(const char *core) {
    static const char prefix[] = "(^|[^0-9A-Za-z])(";
    static const char suffix[] = ")([^0-9A-Za-z]|$)";

    if (!core) {
        return NULL; /* strlen(NULL) would be undefined behaviour */
    }

    const size_t prefix_len = sizeof prefix - 1;
    const size_t suffix_len = sizeof suffix - 1;
    const size_t core_len = strlen(core);

    char *buf = malloc(prefix_len + core_len + suffix_len + 1);
    if (!buf) {
        return NULL;
    }

    /* All three lengths are known, so plain copies are enough. */
    char *out = buf;
    memcpy(out, prefix, prefix_len);
    out += prefix_len;
    memcpy(out, core, core_len);
    out += core_len;
    memcpy(out, suffix, suffix_len);
    out += suffix_len;
    *out = '\0';

    return buf;
}
18
+
19
+ /*
20
+ * Replace all occurrences of a compiled pattern in `input` with PLACEHOLDER.
21
+ *
22
+ * If `use_boundary` is non-zero the pattern was compiled as:
23
+ * (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$)
24
+ * groups: [0]=full match [1]=left boundary [2]=CORE [3]=right boundary
25
+ * We pass nmatch=4 so the engine fills all four slots, then use matches[1].rm_eo
26
+ * and matches[3].rm_so to locate the exact CORE span. The boundary characters
27
+ * are copied back verbatim so they are not lost.
28
+ */
29
+ char *replace_all_matches(regex_t *pattern, const char *input,
30
+ int use_boundary, const placeholder_t *ph) {
31
+ size_t ph_max = max_placeholder_len(ph);
32
+ size_t out_cap = strlen(input) * 2 + 512;
33
+ char *output = (char *)malloc(out_cap);
34
+ if (!output) return NULL;
35
+
36
+ char *ph_buf = (char *)malloc(ph_max + 1);
37
+ if (!ph_buf) { free(output); return NULL; }
38
+
39
+ size_t out_len = 0;
40
+ const char *cursor = input;
41
+ regmatch_t matches[4];
42
+
43
+ while (regexec(pattern, cursor, 4, matches, 0) == 0) {
44
+ regoff_t full_so = matches[0].rm_so;
45
+ regoff_t full_eo = matches[0].rm_eo;
46
+
47
+ if (full_so < 0 || full_eo < full_so) break;
48
+
49
+ regoff_t core_so = full_so;
50
+ regoff_t core_eo = full_eo;
51
+
52
+ if (use_boundary) {
53
+ if (matches[1].rm_so >= 0 && matches[1].rm_eo > matches[1].rm_so)
54
+ core_so = matches[1].rm_eo;
55
+ if (matches[3].rm_so >= 0 && matches[3].rm_eo > matches[3].rm_so)
56
+ core_eo = matches[3].rm_so;
57
+ }
58
+
59
+ size_t prefix_len = (size_t)core_so;
60
+ size_t suffix_len = (size_t)(full_eo - core_eo);
61
+ size_t match_len = (size_t)(full_eo - full_so);
62
+ size_t core_len = (size_t)(core_eo - core_so);
63
+
64
+ size_t ph_len = write_placeholder(ph_buf, ph, cursor + core_so, core_len);
65
+
66
+ size_t needed = out_len + prefix_len + ph_len + suffix_len + strlen(cursor + full_eo) + 1;
67
+ if (needed > out_cap) {
68
+ out_cap = needed * 2;
69
+ char *tmp = (char *)realloc(output, out_cap);
70
+ if (!tmp) { free(output); free(ph_buf); return NULL; }
71
+ output = tmp;
72
+ }
73
+
74
+ memcpy(output + out_len, cursor, prefix_len);
75
+ out_len += prefix_len;
76
+
77
+ memcpy(output + out_len, ph_buf, ph_len);
78
+ out_len += ph_len;
79
+
80
+ if (suffix_len > 0) {
81
+ memcpy(output + out_len, cursor + core_eo, suffix_len);
82
+ out_len += suffix_len;
83
+ }
84
+
85
+ cursor += full_eo;
86
+
87
+ if (match_len == 0) {
88
+ if (*cursor) output[out_len++] = *cursor++;
89
+ else break;
90
+ }
91
+ }
92
+ free(ph_buf);
93
+
94
+ size_t tail_len = strlen(cursor);
95
+ size_t needed = out_len + tail_len + 1;
96
+ if (needed > out_cap) {
97
+ out_cap = needed;
98
+ char *tmp = (char *)realloc(output, out_cap);
99
+ if (!tmp) { free(output); return NULL; }
100
+ output = tmp;
101
+ }
102
+ memcpy(output + out_len, cursor, tail_len);
103
+ out_len += tail_len;
104
+ output[out_len] = '\0';
105
+
106
+ return output;
107
+ }
108
+
109
+ /* Look up the i-th entry of the enable_bits Array. Out-of-bounds → 0 (skip). */
110
+ static inline int enable_bit(VALUE rb_enable_bits, long i) {
111
+ if (i < 0 || i >= RARRAY_LEN(rb_enable_bits)) return 0;
112
+ VALUE v = rb_ary_entry(rb_enable_bits, i);
113
+ return RTEST(v) && NUM2INT(v) != 0;
114
+ }
115
+
116
+ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
117
+ VALUE rb_ph_mode, VALUE rb_ph_str,
118
+ VALUE rb_enable_bits) {
119
+ Check_Type(rb_text, T_STRING);
120
+ Check_Type(rb_ph_str, T_STRING);
121
+ Check_Type(rb_enable_bits, T_ARRAY);
122
+
123
+ int ph_mode = NUM2INT(rb_ph_mode);
124
+ const char *ph_str_plain = StringValueCStr(rb_ph_str);
125
+
126
+ const char *input = StringValueCStr(rb_text);
127
+ char *working = strdup(input);
128
+ if (!working) rb_raise(rb_eNoMemError, "strdup failed");
129
+
130
+ placeholder_t ph;
131
+ ph.mode = ph_mode;
132
+
133
+ for (int i = 0; i < NUM_PATTERNS; i++) {
134
+ if (!enable_bit(rb_enable_bits, i)) continue;
135
+ ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
136
+ ? ph_str_plain
137
+ : tag_name_for_bit(pattern_tags[i]);
138
+ char *result = replace_all_matches(&compiled_patterns[i], working,
139
+ boundary_wrapped[i], &ph);
140
+ free(working);
141
+ if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed");
142
+ working = result;
143
+ }
144
+
145
+ for (int i = 0; i < custom_count; i++) {
146
+ if (!enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
147
+ ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
148
+ ? ph_str_plain
149
+ : tag_name_for_bit(custom_patterns[i].tag);
150
+ char *result = replace_all_matches(&custom_patterns[i].compiled, working,
151
+ custom_patterns[i].boundary, &ph);
152
+ free(working);
153
+ if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed (custom)");
154
+ working = result;
155
+ }
156
+
157
+ VALUE rb_result = rb_str_new_cstr(working);
158
+ free(working);
159
+ return rb_result;
160
+ }
@@ -0,0 +1,35 @@
1
+ #ifndef DATA_REDACTOR_REDACT_H
2
+ #define DATA_REDACTOR_REDACT_H
3
+
4
+ #include <ruby.h>
5
+ #include <regex.h>
6
+ #include "placeholder.h"
7
+
8
+ /*
9
+ * Build a boundary-wrapped version of a pattern:
10
+ * (^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)
11
+ * Caller must free the returned string.
12
+ */
13
+ char *wrap_boundary(const char *core);
14
+
15
+ /*
16
+ * Replace all occurrences of a compiled pattern in `input` with the placeholder.
17
+ * Returns a newly malloc'd string (caller must free), or NULL on allocation failure.
18
+ */
19
+ char *replace_all_matches(regex_t *pattern, const char *input,
20
+ int use_boundary, const placeholder_t *ph);
21
+
22
+ /*
23
+ * DataRedactor._redact(text, ph_mode, ph_str, enable_bits) -> String
24
+ *
25
+ * `enable_bits` is an Array of Integers, one per pattern index. The first
26
+ * NUM_PATTERNS entries cover built-ins; any extra entries cover currently
27
+ * registered custom patterns in registration order. A 0 entry skips that
28
+ * pattern entirely; non-zero runs it. The Ruby layer compiles `only:` /
29
+ * `except:` (mixed Symbol+String) into this array.
30
+ */
31
+ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
32
+ VALUE rb_ph_mode, VALUE rb_ph_str,
33
+ VALUE rb_enable_bits);
34
+
35
+ #endif
@@ -0,0 +1,131 @@
1
+ #include "scan.h"
2
+ #include "patterns.h"
3
+ #include "placeholder.h"
4
+ #include "custom_patterns.h"
5
+ #include "redact.h"
6
+ #include <regex.h>
7
+ #include <string.h>
8
+ #include <stdlib.h>
9
+
10
+ /*
11
+ * To map working-buffer positions back to original-string positions we
12
+ * maintain a log of every replacement already applied. Each entry records
13
+ * where in the *working* buffer the replacement started (after all prior
14
+ * replacements) and how many bytes were removed (orig_len) vs. inserted
15
+ * (always 10, the length of "[REDACTED]").
16
+ *
17
+ * For a new match at working position W:
18
+ * cumulative_shift_before_W = sum of (10 - orig_len) for all prior
19
+ * replacements whose working_pos <= W
20
+ * original_pos = W - cumulative_shift_before_W
21
+ */
22
+ /* Look up the i-th entry of the enable_bits Array. Out-of-bounds → 0 (skip). */
23
+ static inline int scan_enable_bit(VALUE rb_enable_bits, long i) {
24
+ if (i < 0 || i >= RARRAY_LEN(rb_enable_bits)) return 0;
25
+ VALUE v = rb_ary_entry(rb_enable_bits, i);
26
+ return RTEST(v) && NUM2INT(v) != 0;
27
+ }
28
+
29
+ VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
30
+ Check_Type(rb_text, T_STRING);
31
+ Check_Type(rb_enable_bits, T_ARRAY);
32
+
33
+ const char *input = StringValueCStr(rb_text);
34
+
35
+ static const placeholder_t ph_default = { PLACEHOLDER_MODE_PLAIN, "[REDACTED]" };
36
+
37
+ char *working = strdup(input);
38
+ if (!working) rb_raise(rb_eNoMemError, "strdup failed");
39
+
40
+ VALUE matches_arr = rb_ary_new();
41
+
42
+ typedef struct { long wpos; long orig_len; } repl_t;
43
+ repl_t *repl_log = NULL;
44
+ int repl_count = 0;
45
+ int repl_cap = 0;
46
+
47
+ #define REPL_LOG_PUSH(_wpos, _olen) do { \
48
+ if (repl_count >= repl_cap) { \
49
+ int _nc = repl_cap == 0 ? 16 : repl_cap * 2; \
50
+ repl_t *_t = (repl_t *)realloc(repl_log, sizeof(repl_t) * _nc); \
51
+ if (!_t) { free(repl_log); free(working); rb_raise(rb_eNoMemError, "repl_log"); } \
52
+ repl_log = _t; repl_cap = _nc; \
53
+ } \
54
+ repl_log[repl_count].wpos = (_wpos); \
55
+ repl_log[repl_count].orig_len = (_olen); \
56
+ repl_count++; \
57
+ } while (0)
58
+
59
+ #define WORKING_TO_ORIG(_wpos) ({ \
60
+ long _shift = 0; \
61
+ for (int _ri = 0; _ri < repl_count; _ri++) { \
62
+ if (repl_log[_ri].wpos <= (_wpos)) \
63
+ _shift += 10 - repl_log[_ri].orig_len; \
64
+ } \
65
+ (_wpos) - _shift; \
66
+ })
67
+
68
+ #define COLLECT_AND_REPLACE(pat, use_bnd, tag_bit, pat_name) do { \
69
+ const char *_cur = working; \
70
+ regmatch_t _m[4]; \
71
+ while (regexec((pat), _cur, 4, _m, 0) == 0) { \
72
+ regoff_t _fso = _m[0].rm_so, _feo = _m[0].rm_eo; \
73
+ if (_fso < 0 || _feo < _fso) break; \
74
+ regoff_t _cso = _fso, _ceo = _feo; \
75
+ if (use_bnd) { \
76
+ if (_m[1].rm_so >= 0 && _m[1].rm_eo > _m[1].rm_so) \
77
+ _cso = _m[1].rm_eo; \
78
+ if (_m[3].rm_so >= 0 && _m[3].rm_eo > _m[3].rm_so) \
79
+ _ceo = _m[3].rm_so; \
80
+ } \
81
+ size_t _vlen = (size_t)(_ceo - _cso); \
82
+ long _wpos = (long)(_cur - working) + (long)_cso; \
83
+ long _orig = WORKING_TO_ORIG(_wpos); \
84
+ VALUE _match = rb_hash_new(); \
85
+ rb_hash_aset(_match, ID2SYM(rb_intern("tag")), \
86
+ ID2SYM(rb_intern(tag_name_for_bit(tag_bit)))); \
87
+ rb_hash_aset(_match, ID2SYM(rb_intern("name")), \
88
+ rb_str_new_cstr(pat_name)); \
89
+ rb_hash_aset(_match, ID2SYM(rb_intern("value")), \
90
+ rb_str_new(_cur + _cso, _vlen)); \
91
+ rb_hash_aset(_match, ID2SYM(rb_intern("start")), \
92
+ LONG2NUM(_orig)); \
93
+ rb_hash_aset(_match, ID2SYM(rb_intern("length")), \
94
+ LONG2NUM((long)_vlen)); \
95
+ rb_ary_push(matches_arr, _match); \
96
+ REPL_LOG_PUSH(_wpos, (long)_vlen); \
97
+ if (_feo == _fso) { if (*_cur) _cur++; else break; } \
98
+ else _cur += _feo; \
99
+ } \
100
+ char *_next = replace_all_matches((pat), working, (use_bnd), &ph_default); \
101
+ free(working); \
102
+ if (!_next) { free(repl_log); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); } \
103
+ working = _next; \
104
+ } while (0)
105
+
106
+ for (int i = 0; i < NUM_PATTERNS; i++) {
107
+ if (!scan_enable_bit(rb_enable_bits, i)) continue;
108
+ COLLECT_AND_REPLACE(&compiled_patterns[i], boundary_wrapped[i],
109
+ pattern_tags[i], pattern_names[i]);
110
+ }
111
+
112
+ for (int i = 0; i < custom_count; i++) {
113
+ if (!scan_enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
114
+ COLLECT_AND_REPLACE(&custom_patterns[i].compiled,
115
+ custom_patterns[i].boundary,
116
+ custom_patterns[i].tag, custom_patterns[i].name);
117
+ }
118
+
119
+ #undef COLLECT_AND_REPLACE
120
+ #undef WORKING_TO_ORIG
121
+ #undef REPL_LOG_PUSH
122
+
123
+ free(repl_log);
124
+
125
+ VALUE result = rb_hash_new();
126
+ VALUE rb_redacted = rb_str_new_cstr(working);
127
+ free(working);
128
+ rb_hash_aset(result, ID2SYM(rb_intern("redacted")), rb_redacted);
129
+ rb_hash_aset(result, ID2SYM(rb_intern("matches")), matches_arr);
130
+ return result;
131
+ }
@@ -0,0 +1,12 @@
1
+ #ifndef DATA_REDACTOR_SCAN_H
2
+ #define DATA_REDACTOR_SCAN_H
3
+
4
+ #include <ruby.h>
5
+
6
+ /*
7
+ * DataRedactor._scan(text, enable_bits) -> { redacted: String, matches: Array<Hash> }
8
+ * enable_bits: same per-pattern 0/1 array as _redact.
9
+ */
10
+ VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits);
11
+
12
+ #endif
@@ -0,0 +1,24 @@
1
+ #ifndef DATA_REDACTOR_TAGS_H
2
+ #define DATA_REDACTOR_TAGS_H
3
+
4
+ /*
5
+ * Tag bits. Each pattern belongs to exactly one tag. Callers can pass a
6
+ * bitmask to restrict which patterns run (only / except). The default mask
7
+ * (TAG_ALL) runs every pattern and matches the historical behaviour of
8
+ * `redact(text)` with no second argument.
9
+ */
10
+ #define TAG_CREDENTIALS (1 << 0)
11
+ #define TAG_FINANCIAL (1 << 1)
12
+ #define TAG_TAX_ID (1 << 2)
13
+ #define TAG_NATIONAL_ID (1 << 3)
14
+ #define TAG_CONTACT (1 << 4)
15
+ #define TAG_NETWORK (1 << 5)
16
+ #define TAG_TRAVEL (1 << 6)
17
+ #define TAG_OTHER (1 << 7)
18
+ #define TAG_CUSTOM (1 << 8)
19
+ #define TAG_BUILTIN_ALL (TAG_CREDENTIALS | TAG_FINANCIAL | TAG_TAX_ID | \
20
+ TAG_NATIONAL_ID | TAG_CONTACT | TAG_NETWORK | \
21
+ TAG_TRAVEL | TAG_OTHER)
22
+ #define TAG_ALL (TAG_BUILTIN_ALL | TAG_CUSTOM)
23
+
24
+ #endif
@@ -1,3 +1,4 @@
1
1
  module DataRedactor
2
- VERSION = "0.5.0"
2
+ # Current gem version. Follows {https://semver.org Semantic Versioning 2.0.0}.
3
+ VERSION = "0.6.0"
3
4
  end