RubyGems - data_redactor - Versions diffs - 0.5.0 - Mend

data_redactor 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +67 -0
data/LICENSE +21 -0
data/ext/data_redactor/data_redactor.c +1047 -0
data/ext/data_redactor/extconf.rb +8 -0
data/lib/data_redactor/version.rb +3 -0
data/lib/data_redactor.rb +145 -0
data/readme.md +269 -0
metadata +87 -0

data/ext/data_redactor/extconf.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require "mkmf"
+abort "Missing C compiler or stdio.h" unless have_header("stdio.h")
+abort "Missing regex.h"               unless have_header("regex.h")
+abort "Missing stdlib.h"              unless have_header("stdlib.h")
+abort "Missing string.h"              unless have_header("string.h")
+create_makefile("data_redactor/data_redactor")

data/lib/data_redactor/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Namespace of the gem; this file only contributes the version constant.
+module DataRedactor
+  # Release number of the gem (the gem metadata lists the same 0.5.0).
+  VERSION = "0.5.0"
+end

data/lib/data_redactor.rb ADDED Viewed

@@ -0,0 +1,145 @@
+require_relative "data_redactor/version"
+require_relative "data_redactor/data_redactor" # loads the compiled .so
+module DataRedactor
+  TAGS = {
+    credentials: TAG_CREDENTIALS,
+    financial:   TAG_FINANCIAL,
+    tax_id:      TAG_TAX_ID,
+    national_id: TAG_NATIONAL_ID,
+    contact:     TAG_CONTACT,
+    network:     TAG_NETWORK,
+    travel:      TAG_TRAVEL,
+    other:       TAG_OTHER,
+    custom:      TAG_CUSTOM
+  }.freeze
+  class UnknownTagError     < ArgumentError; end
+  class InvalidPatternError < ArgumentError; end
+  # Capture groups break boundary-wrapper group index assumptions ([1],[2],[3] shift).
+  CAPTURE_GROUP_RE = /(?<!\\)\((?!\?:)/.freeze
+  # Ruby regex syntax that has no POSIX ERE equivalent.
+  RUBY_ONLY_SYNTAX_RE = /\\[dDwWsShHbB]|\(\?[<!=]|\(\?<[a-zA-Z]|\(\?[imx]|[*+?]\?/.freeze
+  PLACEHOLDER_DEFAULT = "[REDACTED]"
+  module_function
+  def tags
+    TAGS.keys
+  end
+  def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
+    raise ArgumentError, "pass only: or except:, not both" if only && except
+    mask =
+      if only
+        bits_for(only)
+      elsif except
+        TAG_ALL & ~bits_for(except)
+      else
+        TAG_ALL
+      end
+    ph_mode, ph_str = resolve_placeholder(placeholder)
+    _redact(text, mask, ph_mode, ph_str)
+  end
+  # Scan text without necessarily redacting it.
+  #
+  # Returns { redacted: String, matches: [{tag:, name:, value:, start:, length:}, ...] }
+  # The :tag value is a Symbol matching one of DataRedactor.tags.
+  # :start and :length are byte offsets into the original string.
+  def scan(text, only: nil, except: nil)
+    raise ArgumentError, "pass only: or except:, not both" if only && except
+    mask =
+      if only
+        bits_for(only)
+      elsif except
+        TAG_ALL & ~bits_for(except)
+      else
+        TAG_ALL
+      end
+    result = _scan(text, mask)
+    # Normalise: convert tag string from C (uppercase) back to the Symbol used in TAGS
+    result[:matches].each do |m|
+      m[:tag] = m[:tag].to_s.downcase.to_sym
+    end
+    result
+  end
+  # Add (or replace) a custom redaction pattern.
+  #
+  # name:     unique identifier string
+  # regex:    String (POSIX ERE) or Regexp; Ruby-only syntax raises InvalidPatternError
+  # tag:      one of the TAGS keys (default :custom), or any built-in tag
+  # boundary: wrap with word-boundary guards; incompatible with capture groups
+  def add_pattern(name:, regex:, tag: :custom, boundary: false)
+    raise ArgumentError, "name must be a non-empty String" \
+      unless name.is_a?(String) && !name.empty?
+    source = case regex
+             when String then regex
+             when Regexp then regex.source
+             else raise ArgumentError, "regex must be a String or Regexp, got #{regex.class}"
+             end
+    if source =~ RUBY_ONLY_SYNTAX_RE
+      raise InvalidPatternError,
+        "pattern #{name.inspect} uses Ruby-only syntax (#{$&.inspect}); " \
+        "use POSIX ERE — no \\d, \\s, \\w, \\b, lookaround, non-greedy, or named groups"
+    end
+    if boundary && source =~ CAPTURE_GROUP_RE
+      raise InvalidPatternError,
+        "pattern #{name.inspect} has capture groups and cannot use boundary: true"
+    end
+    tag_bit = TAGS[tag] or raise UnknownTagError,
+      "unknown tag #{tag.inspect}; valid tags: #{TAGS.keys.inspect}"
+    _add_pattern(name, source, tag_bit, boundary ? 1 : 0)
+  end
+  def remove_pattern(name)
+    _remove_pattern(name.to_s)
+  end
+  def custom_patterns
+    _custom_patterns.map do |h|
+      { name: h[:name], source: h[:source], tag: TAGS.key(h[:tag_bit]) || :custom,
+        boundary: h[:boundary] }
+    end
+  end
+  def clear_custom_patterns!
+    _clear_custom_patterns
+  end
+  def bits_for(tag_list)
+    Array(tag_list).inject(0) do |acc, tag|
+      bit = TAGS[tag] or raise UnknownTagError,
+        "unknown tag #{tag.inspect}; valid tags: #{TAGS.keys.inspect}"
+      acc | bit
+    end
+  end
+  # Returns [ph_mode_int, ph_str] for the C layer.
+  #   placeholder: "***"      -> plain string
+  #   placeholder: :tagged    -> "[REDACTED:TAGNAME]"
+  #   placeholder: :hash      -> "[TAGNAME_xxxx]"
+  def resolve_placeholder(placeholder)
+    case placeholder
+    when :tagged then [PH_MODE_TAGGED, ""]
+    when :hash   then [PH_MODE_HASH,   ""]
+    when String  then [PH_MODE_PLAIN,  placeholder]
+    else
+      raise ArgumentError,
+        "placeholder must be a String, :tagged, or :hash — got #{placeholder.inspect}"
+    end
+  end
+end

data/readme.md ADDED Viewed

@@ -0,0 +1,269 @@
+# DataRedactor
+A Ruby gem with a C extension for high-performance regex-based redaction of sensitive data from strings.
+## What it does
+DataRedactor scans text for sensitive patterns and replaces matches with `[REDACTED]`. It uses a C extension backed by POSIX `regex.h` so the heavy lifting happens outside the Ruby VM, making it fast enough for large payloads.
+## Usage
+```ruby
+require "data_redactor"
+text = "User CF is RSSMRA85M01H501Z and key is AKIAIOSFODNN7EXAMPLE"
+DataRedactor.redact(text)
+# => "User CF is [REDACTED] and key is [REDACTED]"
+```
+### Filtering by tag
+Every pattern belongs to one tag. Use `only:` to redact just a subset of tags, or `except:` to skip one or more tags.
+```ruby
+DataRedactor.tags
+# => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other, :custom]
+# Only redact API keys / tokens / private keys
+DataRedactor.redact(text, only: [:credentials])
+# Redact everything except contact info (emails, phone numbers)
+DataRedactor.redact(text, except: [:contact])
+# Single symbol works too
+DataRedactor.redact(text, only: :financial)
+```
+Passing an unknown tag raises `DataRedactor::UnknownTagError`. Passing both `only:` and `except:` raises `ArgumentError`.
+### Configurable placeholder
+By default every match is replaced with `[REDACTED]`. Use the `placeholder:` keyword to change this:
+```ruby
+# Plain string — any replacement text
+DataRedactor.redact(text, placeholder: "***")
+DataRedactor.redact(text, placeholder: "")
+# Tagged — embeds the pattern's tag name so you know what was redacted
+DataRedactor.redact(text, placeholder: :tagged)
+# "user@example.com"  → "[REDACTED:CONTACT]"
+# "AKIAIOSFODNN7EXAMPLE" → "[REDACTED:CREDENTIALS]"
+# "DE89370400440532013000" → "[REDACTED:FINANCIAL]"
+# Hash — deterministic 4-hex suffix of the matched value
+# Same value always produces the same token — useful for correlating
+# redactions across log lines without leaking the original.
+DataRedactor.redact(text, placeholder: :hash)
+# "user@example.com"  → "[CONTACT_3d7a]"
+# "user@example.com"  → "[CONTACT_3d7a]"  (same every time)
+# "other@example.com" → "[CONTACT_91fc]"  (different value, different hash)
+```
+All three modes compose with `only:` and `except:`:
+```ruby
+DataRedactor.redact(text, only: :contact, placeholder: :tagged)
+```
+### Scan / dry-run mode
+`DataRedactor.scan` returns every match alongside the redacted string — useful for auditing, tuning false positives, and compliance pipelines:
+```ruby
+result = DataRedactor.scan("User AKIAIOSFODNN7EXAMPLE logged in from 192.168.1.1")
+# => {
+#   redacted: "User [REDACTED] logged in from [REDACTED]",
+#   matches: [
+#     { tag: :credentials, name: "aws_access_key_id", value: "AKIAIOSFODNN7EXAMPLE", start: 5,  length: 20 },
+#     { tag: :network,     name: "ipv4",              value: "192.168.1.1",          start: 41, length: 11 }
+#   ]
+# }
+# :start and :length are byte offsets into the original string
+m = result[:matches].first
+original_text.byteslice(m[:start], m[:length])  # => "AKIAIOSFODNN7EXAMPLE" (original_text is the String that was passed to scan)
+# Accepts the same tag filters as redact
+DataRedactor.scan(text, only: :credentials)
+DataRedactor.scan(text, except: :network)
+```
+### Custom patterns
+Teams often have internal IDs that the gem can't ship. Register them at boot:
+```ruby
+# String (POSIX ERE) or Regexp — both accepted
+DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
+DataRedactor.add_pattern(name: "ticket_ref",  regex: /TICKET-[A-Z]{2}[0-9]{4}/, boundary: true)
+# Custom patterns are tagged :custom by default; pass any built-in tag to group differently
+DataRedactor.add_pattern(name: "internal_key", regex: "INT-[A-Z]{3}", tag: :credentials)
+DataRedactor.redact(text)                         # runs all patterns including custom
+DataRedactor.redact(text, only: [:custom])         # only user patterns
+DataRedactor.redact(text, only: [:custom, :credentials]) # mix
+DataRedactor.custom_patterns   # => [{name:, source:, tag:, boundary:}, ...]
+DataRedactor.remove_pattern("employee_id")
+DataRedactor.clear_custom_patterns!               # mostly for test suites
+```
+**Regex rules** — patterns must be POSIX ERE (the same engine used for built-ins). Not supported: `\d`, `\s`, `\w`, `\b`, lookahead/lookbehind, non-greedy quantifiers, named groups. Violations raise `DataRedactor::InvalidPatternError` at registration time, never at redaction time. Use `[0-9]` instead of `\d`, `[[:space:]]` instead of `\s`, etc.
+**`boundary: true`** — wraps the pattern with `(^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)` so it only fires when the token is not embedded in a longer alphanumeric string. Incompatible with patterns that contain capture groups.
+## Detected patterns (49 total)
+### Cloud & API secrets
+| # | Pattern | Example |
+|---|---|---|
+| 0 | AWS Access Key ID | `AKIAIOSFODNN7EXAMPLE` |
+| 1 | AWS Secret Access Key | 40-character base64 string |
+| 5 | Google API Key | `AIzaSyXXXX...` |
+| 6 | GitHub Personal Access Token | `github_pat_XXXX...` |
+| 7 | Slack Webhook URL | `https://hooks.slack.com/services/T.../B.../...` |
+| 8 | Stripe Secret Key | `sk_live_XXXX...` |
+| 9 | PEM Private Key header | `-----BEGIN RSA PRIVATE KEY-----` |
+| 13 | Scaleway Access Key | `SCW12345ABCDE6789FGHIJ` |
+| 14 | UUID v4 / Scaleway Secret Key | `550e8400-e29b-41d4-a716-446655440000` |
+### Travel documents
+| # | Pattern | Example |
+|---|---|---|
+| 2 | Italian Codice Fiscale (basic) | `RSSMRA85M01H501Z` |
+| 3 | Passport — letter prefix + digits | `AB1234567` |
+| 4 | Passport — 9 consecutive digits ¹ | `123456789` |
+| 22 | Italian Codice Fiscale (omocodia) | `RSSMRALPMNLH5LMZ` |
+### Payment & network
+| # | Pattern | Example |
+|---|---|---|
+| 11 | Credit card — Visa, Mastercard, Amex, Discover, JCB | `4111111111111111` |
+| 12 | IPv4 address | `192.168.1.100` |
+### IBANs
+| # | Country | Example |
+|---|---|---|
+| 10 | Italy | `IT60X0542811101000000123456` |
+| 15 | France | `FR7630006000011234567890189` |
+| 16 | Germany | `DE89370400440532013000` |
+| 17 | Spain | `ES9121000418450200051332` |
+| 18 | Netherlands | `NL91ABNA0417164300` |
+| 19 | Belgium | `BE68539007547034` |
+| 20 | Portugal | `PT50000201231234567890154` |
+| 21 | Ireland | `IE29AIBK93115212345678` |
+| 28 | Sweden | `SE4550000000058398257466` |
+| 29 | Denmark | `DK5000400440116243` |
+| 30 | Norway | `NO9386011117947` |
+| 31 | Finland | `FI2112345600000785` |
+| 37 | Poland | `PL61109010140000071219812874` |
+| 38 | Austria | `AT611904300234573201` |
+| 39 | Switzerland | `CH9300762011623852957` |
+| 40 | Czechia | `CZ6508000000192000145399` |
+| 41 | Hungary | `HU42117730161111101800000000` |
+| 42 | Romania | `RO49AAAA1B31007593840000` |
+### National personal identifiers
+| # | Country | Type | Example |
+|---|---|---|---|
+| 23 | France | NIR / Social Security ¹ | `185126203450342` |
+| 24 | Spain | DNI ¹ | `12345678Z` |
+| 25 | Spain | NIE | `X1234567L` |
+| 26 | Netherlands | BSN ¹ | `123456789` |
+| 27 | Poland | PESEL ¹ | `85121612345` |
+| 32 | Belgium | National Number ¹ | `85121612345` |
+| 33 | Sweden | Personnummer ¹ | `850101-1234` |
+| 34 | Denmark | CPR Number ¹ | `010185-1234` |
+| 35 | Norway | Fødselsnummer ¹ | `01018512345` |
+| 36 | Finland | HETU ¹ | `010185-123A` |
+| 43 | Poland | PESEL (alt slot) ¹ | `90010112345` |
+| 44 | Austria | Abgabenkontonummer ¹ | `123456789` |
+| 45 | Switzerland | AHV Number ¹ | `756.1234.5678.90` |
+| 46 | Czechia | Rodné číslo ¹ | `856121/1234` |
+| 47 | Hungary | Tax ID ¹ | `8012345678` |
+| 48 | Romania | CNP ¹ | `1850101123456` |
+> ¹ **Word-boundary protected** — these patterns are wrapped with `(^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)` at compile time so they do not fire when the digit sequence appears inside a longer alphanumeric token.
+## Directory structure
+```
+redactor/
+├── data_redactor.gemspec
+├── Gemfile
+├── Rakefile
+├── lib/
+│   ├── data_redactor.rb          # Ruby entry point, loads the .so
+│   └── data_redactor/
+│       └── version.rb
+├── ext/
+│   └── data_redactor/
+│       ├── extconf.rb         # Checks for C headers, generates Makefile
+│       └── data_redactor.c       # C extension: regex compilation + redaction
+└── spec/
+    └── data_redactor_spec.rb     # RSpec tests (61 examples, one per pattern)
+```
+## Requirements
+- Ruby >= 2.7
+- A C compiler (`gcc` or `clang`)
+- POSIX `regex.h` (standard on Linux and macOS)
+## Setup
+```bash
+bundle install
+```
+## Compile the C extension
+```bash
+bundle exec rake compile
+```
+This runs `extconf.rb` via `rake-compiler`, which generates a `Makefile` and compiles `data_redactor.c` into a `.so` shared library placed under `lib/data_redactor/`.
+## Run the tests
+```bash
+bundle exec rake spec
+```
+Or compile and test in one step:
+```bash
+bundle exec rake
+```
+## How it works
+1. At load time, `Init_data_redactor` compiles all 49 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
+2. `DataRedactor.redact(text)` receives a Ruby `String`, converts it to a C `char*` via `StringValueCStr`, and runs each compiled pattern in sequence on a working buffer.
+3. For each pattern, `replace_all_matches` iterates using `regexec`, copies non-matching segments to a fresh output buffer, and inserts `[REDACTED]` in place of each match. For boundary-wrapped patterns, `regexec` is called with `nmatch=4` and sub-match groups `[1]`/`[3]` identify the boundary characters so they are preserved verbatim.
+4. The output buffer is grown with `realloc` as needed. After all patterns are applied the result is returned as a Ruby `String` via `rb_str_new_cstr`. All intermediate `malloc`/`strdup` allocations are explicitly `free`d.
+## Memory management
+All C-side buffers are heap-allocated with `malloc`/`strdup` and freed before the function returns. The only Ruby-managed allocation is the final return value from `rb_str_new_cstr`. No Ruby objects are created mid-processing, so GC cannot collect anything out from under the C code.
+## Versioning
+This project follows [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html). Until `1.0.0`, minor versions may introduce breaking changes; from `1.0.0` onward, breaking changes will only land in major versions. See [CHANGELOG.md](CHANGELOG.md) for the release history.
+## License
+Released under the [MIT License](LICENSE).
+## Known limitations
+- **Pattern ordering matters** — patterns run sequentially. An early broad pattern (e.g. the 9-digit passport) may consume digits that a later pattern (e.g. credit card) depends on. Boundary wrapping mitigates this for pure-digit patterns.
+- **AWS Secret Key (pattern 1)** — 40 consecutive base64 characters is a broad match. It can produce false positives in base64-encoded content such as embedded images or binary blobs.
+- **Duplicate digit patterns** — several national ID formats share the same digit-length (11 digits: PESEL, Norwegian Fødselsnummer, Belgian National Number). They are kept as separate slots for clarity but the practical effect is that any 11-digit boundary-delimited number will be redacted.

metadata ADDED Viewed

@@ -0,0 +1,87 @@
+--- !ruby/object:Gem::Specification
+name: data_redactor
+version: !ruby/object:Gem::Version
+  version: 0.5.0
+platform: ruby
+authors:
+- Daniele Frisanco
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2026-05-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.12'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.12'
+description: A Ruby gem with a C extension for high-performance scanning and redaction
+  of 79 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
+  phone numbers, and PII from 15+ countries. Designed to sanitize text before sending
+  to LLMs, logging systems, or any public/third-party API.
+email:
+- daniele.frisanco@gmail.com
+executables: []
+extensions:
+- ext/data_redactor/extconf.rb
+extra_rdoc_files: []
+files:
+- CHANGELOG.md
+- LICENSE
+- ext/data_redactor/data_redactor.c
+- ext/data_redactor/extconf.rb
+- lib/data_redactor.rb
+- lib/data_redactor/version.rb
+- readme.md
+homepage: https://github.com/danielefrisanco/data_redactor
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/danielefrisanco/data_redactor
+  source_code_uri: https://github.com/danielefrisanco/data_redactor
+  changelog_uri: https://github.com/danielefrisanco/data_redactor/blob/main/CHANGELOG.md
+  bug_tracker_uri: https://github.com/danielefrisanco/data_redactor/issues
+  rubygems_mfa_required: 'true'
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '2.7'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.22
+signing_key:
+specification_version: 4
+summary: Redact PII and secrets from strings before sending to AI or external services
+test_files: []