PyPI - scrufflehog - Versions diffs - 0.1.0__tar.gz - Mend

scrufflehog 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

scrufflehog-0.1.0/.github/workflows/release.yml +86 -0
scrufflehog-0.1.0/.gitignore +7 -0
scrufflehog-0.1.0/LICENSE +21 -0
scrufflehog-0.1.0/PKG-INFO +116 -0
scrufflehog-0.1.0/README.md +96 -0
scrufflehog-0.1.0/docs/AGENTIC.md +63 -0
scrufflehog-0.1.0/examples/scrufflehog.toml +53 -0
scrufflehog-0.1.0/pyproject.toml +34 -0
scrufflehog-0.1.0/src/scrufflehog/__init__.py +14 -0
scrufflehog-0.1.0/src/scrufflehog/advisor.py +39 -0
scrufflehog-0.1.0/src/scrufflehog/advisors/__init__.py +2 -0
scrufflehog-0.1.0/src/scrufflehog/advisors/llm.py +167 -0
scrufflehog-0.1.0/src/scrufflehog/cli.py +82 -0
scrufflehog-0.1.0/src/scrufflehog/config.py +55 -0
scrufflehog-0.1.0/src/scrufflehog/coverage/__init__.py +4 -0
scrufflehog-0.1.0/src/scrufflehog/coverage/extract.py +104 -0
scrufflehog-0.1.0/src/scrufflehog/coverage/semantics.py +19 -0
scrufflehog-0.1.0/src/scrufflehog/engine.py +134 -0
scrufflehog-0.1.0/src/scrufflehog/oracles.py +92 -0
scrufflehog-0.1.0/src/scrufflehog/output/__init__.py +7 -0
scrufflehog-0.1.0/src/scrufflehog/output/json_out.py +16 -0
scrufflehog-0.1.0/src/scrufflehog/output/sarif.py +77 -0
scrufflehog-0.1.0/src/scrufflehog/output/text.py +18 -0
scrufflehog-0.1.0/src/scrufflehog/probes.py +79 -0
scrufflehog-0.1.0/src/scrufflehog/runners/__init__.py +31 -0
scrufflehog-0.1.0/src/scrufflehog/runners/go_runner.py +71 -0
scrufflehog-0.1.0/src/scrufflehog/runners/node_runner.py +64 -0
scrufflehog-0.1.0/src/scrufflehog/runners/python_runner.py +58 -0
scrufflehog-0.1.0/src/scrufflehog/runners/rust_runner.py +67 -0
scrufflehog-0.1.0/tests/test_advisor.py +99 -0
scrufflehog-0.1.0/tests/test_live_runners.py +111 -0
scrufflehog-0.1.0/tests/test_scrufflehog.py +190 -0

scrufflehog-0.1.0/.github/workflows/release.yml ADDED Viewed

@@ -0,0 +1,86 @@
+name: Release to PyPI
+# Publishes scrufflehog to PyPI when a GitHub Release is published, using PyPI
+# Trusted Publishing (OIDC): the job mints a short-lived OpenID Connect token
+# that PyPI exchanges for upload rights. No API token is stored in the repo or
+# in Actions secrets.
+#
+# One-time PyPI setup (pending publisher, since the project doesn't exist yet):
+#   PyPI -> Your projects -> Publishing -> Add a pending publisher
+#     PyPI project name: scrufflehog
+#     Owner:             seanturner83
+#     Repository:        scrufflehog
+#     Workflow filename: release.yml
+#     Environment:       pypi
+# The first successful run creates the project and claims the name.
+on:
+  release:
+    types: [published]
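+  # Manual trigger for re-runs. This is NOT a dry run: the publish job still
+  # runs (gated on the pypi environment), and the tag/version check in the
+  # build job is skipped because it only applies to `release` events.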
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  build:
+    name: Build sdist + wheel
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Build
+        run: |
+          python -m pip install --upgrade build
+          python -m build
+      - name: Check metadata
+        run: |
+          python -m pip install --upgrade twine
+          twine check dist/*
+      - name: Verify tag matches package version
+        if: github.event_name == 'release'
+        run: |
+          # Release tag is v<version>; the built wheel must match, or we'd
+          # publish a version that disagrees with the tag people pinned.
+          tag="${GITHUB_REF_NAME#v}"
+          whl=$(ls dist/*.whl | head -1)
+          pkg_ver=$(basename "$whl" | sed -E 's/^scrufflehog-([^-]+)-.*/\1/')
+          echo "tag=$tag  wheel=$pkg_ver"
+          if [ "$tag" != "$pkg_ver" ]; then
+            echo "::error::release tag ($tag) != package version ($pkg_ver) — bump pyproject.toml or fix the tag"
+            exit 1
+          fi
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+  publish:
+    name: Publish to PyPI
+    needs: build
+    runs-on: ubuntu-latest
+    # The pypi environment is the second gate: Trusted Publishing on PyPI is
+    # scoped to this environment name, and it can carry a required-reviewer
+    # protection rule so a human approves each real publish.
+    environment:
+      name: pypi
+      url: https://pypi.org/p/scrufflehog
+    permissions:
+      id-token: write   # OIDC token for Trusted Publishing — the only privilege needed
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
+      - name: Publish
+        uses: pypa/gh-action-pypi-publish@release/v1
+        # No `with: password:` — auth is the OIDC identity of this job.

scrufflehog-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+.pytest_cache/
+dist/
+build/
+*.egg-info/
+.venv/

scrufflehog-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Sean Turner
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

scrufflehog-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,116 @@
+Metadata-Version: 2.4
+Name: scrufflehog
+Version: 0.1.0
+Summary: Deterministically verify that your redactors actually redact.
+Project-URL: Homepage, https://github.com/seanturner83/scrufflehog
+Author: Sean Turner
+License: MIT
+License-File: LICENSE
+Keywords: logging,pii,redaction,secrets,security,testing
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Security
+Requires-Python: >=3.11
+Provides-Extra: agentic
+Provides-Extra: dev
+Requires-Dist: pytest>=7; extra == 'dev'
+Description-Content-Type: text/markdown
+# scrufflehog
+**Unit-test your redaction.**
+Everyone scans for secrets that already leaked (trufflehog, gitleaks). Almost
+nobody tests whether the redaction they *rely on* actually works. scrufflehog is
+the inverse tool: it runs adversarial probes through your own redaction code and
+deterministically asserts the secret is gone — and checks that your field
+denylist/allow-list covers the sensitive names you think it does.
+No model, no guessing: every verdict is a hard assertion against a planted secret
+you control. Zero false positives by construction.
+## Two things it checks
+**1. Transform-strength** — *does the redactor's output still contain, or
+trivially reverse to, the secret?* It executes your redactor on planted probes
+and applies three oracles:
+- `literal_survival` — the secret appears verbatim in the output.
+- `noop_passthrough` — the "redactor" returned its input unchanged.
+- `reversible` — the output is a keyless, low-entropy transform (truncated or
+  unsalted hash, base64, static substitution) that a bounded candidate space
+  recovers. Catches "redaction" that only *looks* redacted.
+**2. Coverage** — *is every sensitive field name actually on your list?* A field
+denylist/allow-list is data, not behaviour, so scrufflehog extracts it straight
+from source and checks a sensitive-field corpus against it — **without executing
+your code**. This works across languages (Go maps, Python collections,
+pino/`fast-redact` path lists, Rust sets).
+## Languages
+| | transform-strength | coverage |
+|---|---|---|
+| Python | in-process import | ✓ |
+| Go | driver built in your module | ✓ (map literals) |
+| Rust | `cargo --example` driver | ✓ (set literals) |
+| Node/JS | node driver via stdin | ✓ (pino path lists) |
+## Install
+Latest from source (works today):
+```bash
+pip install git+https://github.com/seanturner83/scrufflehog
+```
+Once the first release is published, from PyPI:
+```bash
+pip install scrufflehog
+```
+## Use
+Write a `scrufflehog.toml` declaring your redactors (see `examples/`):
+```toml
+[[transform]]
+lang = "python"
+module = "app/redact.py"
+fn = "redact_value"
+kind = "value"
+[[coverage]]
+module = "app/redact.py"
+symbol = "SECRET_FIELDS"
+extract = "py_collection"
+match = "exact_ci"
+```
+Then:
+```bash
+scrufflehog verify --config scrufflehog.toml --target . --format text
+scrufflehog verify --config scrufflehog.toml --target . --format sarif   # for code-scanning
+scrufflehog verify --config scrufflehog.toml --target . --fail-on-defect # CI gate
+```
+## Deterministic by default; optional agentic assist
+The core is entirely deterministic and that's the point. An **optional** advisor
+(`--advisor llm`) can propose domain-matched probes, discover redactors, and
+confirm coverage hypotheses against real field usage — but it only ever supplies
+*inputs and hypotheses*; the deterministic oracle still renders every verdict.
+With no advisor, output is fully reproducible. See `docs/AGENTIC.md`.
+## Why "scrufflehog"
+trufflehog finds the secrets. scrufflehog scruffs through the code that's
+*supposed to hide them* and checks it actually does.
+## License
+MIT.

scrufflehog-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,96 @@
+# scrufflehog
+**Unit-test your redaction.**
+Everyone scans for secrets that already leaked (trufflehog, gitleaks). Almost
+nobody tests whether the redaction they *rely on* actually works. scrufflehog is
+the inverse tool: it runs adversarial probes through your own redaction code and
+deterministically asserts the secret is gone — and checks that your field
+denylist/allow-list covers the sensitive names you think it does.
+No model, no guessing: every verdict is a hard assertion against a planted secret
+you control. Zero false positives by construction.
+## Two things it checks
+**1. Transform-strength** — *does the redactor's output still contain, or
+trivially reverse to, the secret?* It executes your redactor on planted probes
+and applies three oracles:
+- `literal_survival` — the secret appears verbatim in the output.
+- `noop_passthrough` — the "redactor" returned its input unchanged.
+- `reversible` — the output is a keyless, low-entropy transform (truncated or
+  unsalted hash, base64, static substitution) that a bounded candidate space
+  recovers. Catches "redaction" that only *looks* redacted.
+**2. Coverage** — *is every sensitive field name actually on your list?* A field
+denylist/allow-list is data, not behaviour, so scrufflehog extracts it straight
+from source and checks a sensitive-field corpus against it — **without executing
+your code**. This works across languages (Go maps, Python collections,
+pino/`fast-redact` path lists, Rust sets).
+## Languages
+| | transform-strength | coverage |
+|---|---|---|
+| Python | in-process import | ✓ |
+| Go | driver built in your module | ✓ (map literals) |
+| Rust | `cargo --example` driver | ✓ (set literals) |
+| Node/JS | node driver via stdin | ✓ (pino path lists) |
+## Install
+Latest from source (works today):
+```bash
+pip install git+https://github.com/seanturner83/scrufflehog
+```
+Once the first release is published, from PyPI:
+```bash
+pip install scrufflehog
+```
+## Use
+Write a `scrufflehog.toml` declaring your redactors (see `examples/`):
+```toml
+[[transform]]
+lang = "python"
+module = "app/redact.py"
+fn = "redact_value"
+kind = "value"
+[[coverage]]
+module = "app/redact.py"
+symbol = "SECRET_FIELDS"
+extract = "py_collection"
+match = "exact_ci"
+```
+Then:
+```bash
+scrufflehog verify --config scrufflehog.toml --target . --format text
+scrufflehog verify --config scrufflehog.toml --target . --format sarif   # for code-scanning
+scrufflehog verify --config scrufflehog.toml --target . --fail-on-defect # CI gate
+```
+## Deterministic by default; optional agentic assist
+The core is entirely deterministic and that's the point. An **optional** advisor
+(`--advisor llm`) can propose domain-matched probes, discover redactors, and
+confirm coverage hypotheses against real field usage — but it only ever supplies
+*inputs and hypotheses*; the deterministic oracle still renders every verdict.
+With no advisor, output is fully reproducible. See `docs/AGENTIC.md`.
+## Why "scrufflehog"
+trufflehog finds the secrets. scrufflehog scruffs through the code that's
+*supposed to hide them* and checks it actually does.
+## License
+MIT.

scrufflehog-0.1.0/docs/AGENTIC.md ADDED Viewed

@@ -0,0 +1,63 @@
+# Optional agentic layer — design
+## Principle (non-negotiable)
+scrufflehog's core is **entirely deterministic** and stays that way. Every
+verdict — defect or clean — comes from a hard assertion against known ground
+truth (planted probe → run redactor → literal/hash-space/set-membership check).
+Same input, same answer, zero false positives, no model. That determinism is the
+product's whole thesis: the deterministic counterpart to probabilistic LLM
+review.
+The agentic layer is **optional, off by default, and never renders a verdict.**
+Rule: *the agent proposes INPUTS and HYPOTHESES; the deterministic oracle still
+decides.* This preserves the zero-FP guarantee while removing scrufflehog's two
+real blind spots (hand-authored probes, unconfirmed coverage hypotheses).
+## Where an advisor adds value (three seams)
+1. **Probe generation** — read the redactor's signature/body and generate
+   probes that match its INPUT DOMAIN. This solves a real footgun: feeding bare
+   values to a URL-path redactor yields misleading findings until
+   domain-matched probes are hand-written. Generated probes are still run
+   through the deterministic oracle; the agent supplies inputs, not verdicts.
+2. **Redactor + field discovery** — scan a repo and PROPOSE the registry
+   (redaction fns, denylist symbols, sensitive field names). Human/config
+   confirms. Discovery, not judgement.
+3. **Coverage-gap confirmation** — the honest-caveat killer. A coverage finding
+   is a hypothesis ("the list lacks `ssn`" — but does a field named `ssn` reach
+   this redactor?). An advisor greps the codebase for real field usage to
+   confirm or refute, turning a hypothesis into a confirmed finding or dropping
+   it.
+## Interface
+```python
+class Advisor(Protocol):
+    def propose_probes(self, redactor_src: str, entry: dict) -> list[Probe]: ...
+    def discover_redactors(self, target: Path) -> list[dict]: ...
+    def confirm_coverage_gap(self, target: Path, field: str, redactor: str) -> str: ...  # a CoverageVerdict value
+```
+- `NoopAdvisor` (default): `propose_probes` → `[]`, `discover_redactors` → `[]`,
+  `confirm_coverage_gap` → `UNCONFIRMED` (the finding stands as a hypothesis,
+  exactly as in a purely deterministic run).
+- `LLMAdvisor` (optional module, extra dependency): implements the three via a
+  model. Lives behind `pip install scrufflehog[agentic]`.
+CLI:
+```
+scrufflehog verify --config x.toml --target .                 # pure deterministic
+scrufflehog verify --config x.toml --target . --advisor llm   # + agentic assist
+```
+## Invariants
+- A defect is ALWAYS confirmed by the deterministic oracle. The advisor can add
+  probes that trigger one, or resolve a coverage hypothesis to
+  confirmed/refuted, but it cannot manufacture a defect the oracle didn't verify.
+- With `--advisor` absent, output is byte-identical to a purely deterministic
+  run. Reproducibility of the deterministic path is a test invariant.
+- Advisor failures (timeout, API error) degrade to the deterministic result,
+  never crash the run.

scrufflehog-0.1.0/examples/scrufflehog.toml ADDED Viewed

@@ -0,0 +1,53 @@
+# Example scrufflehog config. Point module paths at your redactors (relative to
+# the --target checkout). Delete the languages you don't use.
+# --- transform-strength: execute the redactor on probes ---------------------
+[[transform]]
+lang = "python"
+module = "app/redact.py"
+fn = "redact_value"        # def redact_value(s: str) -> str
+kind = "value"
+probe_set = "value"        # "value" (any sensitive value) | "url_apikey" (URL-scoped)
+[[transform]]
+lang = "go"
+module = "internal/logredact"
+import = "github.com/example/app/internal/logredact"
+fn = "Sanitize"            # func Sanitize(s string) string
+kind = "value"
+# wrap = "error"           # set if the fn takes an error, not a string
+[[transform]]
+lang = "node"
+module = "dist/redact.js"
+fn = "redactValue"         # module.exports.redactValue = (s) => ...
+kind = "value"
+# export = "default"       # set if module.exports IS the fn
+[[transform]]
+lang = "rust"
+module = "."               # crate root (Cargo.toml dir)
+call = "app::redact::mask(&line)"
+# --- coverage: static field-list check (no execution) -----------------------
+[[coverage]]
+module = "app/redact.py"
+symbol = "SECRET_FIELDS"   # a Python list/tuple/set of field names
+extract = "py_collection"
+match = "exact_ci"
+# corpus = ["ssn", "cvv", "iban"]   # optional; omit to use the built-in list
+[[coverage]]
+module = "internal/logredact/keys.go"
+symbol = "SecretKeys"      # var SecretKeys = map[string]struct{}{ "password": {}, ... }
+extract = "go_map_keys"
+match = "exact_ci"
+doc_claims_substring = false
+[[coverage]]
+module = "src/logger.ts"
+symbol = "redact"          # pino redact: { paths: [ ... ] } or redact: [ ... ]
+extract = "ts_redact_paths"
+match = "exact_ci"

scrufflehog-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,34 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "scrufflehog"
+version = "0.1.0"
+description = "Deterministically verify that your redactors actually redact."
+readme = "README.md"
+license = { text = "MIT" }
+requires-python = ">=3.11"
+authors = [{ name = "Sean Turner" }]
+keywords = ["security", "redaction", "pii", "secrets", "logging", "testing"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Security",
+]
+dependencies = []   # core is stdlib-only (tomllib on 3.11+)
+[project.optional-dependencies]
+agentic = []        # LLM advisor deps go here (opt-in)
+dev = ["pytest>=7"]
+[project.scripts]
+scrufflehog = "scrufflehog.cli:main"
+[project.urls]
+Homepage = "https://github.com/seanturner83/scrufflehog"
+[tool.hatch.build.targets.wheel]
+packages = ["src/scrufflehog"]

scrufflehog-0.1.0/src/scrufflehog/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""scrufflehog — deterministically verify that your redactors actually redact.
+Everyone scans for secrets that leaked. scrufflehog tests whether the redaction
+you rely on actually works: it runs adversarial probes through your own redactor
+and asserts the secret is gone and not trivially reversible, and checks your
+field denylist/allow-list covers the sensitive names you think it does.
+"""
+from .oracles import Defect, assert_output, reversible
+from .probes import Probe, get_probe_set
+from .engine import RunResult, run
+__version__ = "0.1.0"
+__all__ = ["Defect", "assert_output", "reversible", "Probe", "get_probe_set",
+           "RunResult", "run"]

scrufflehog-0.1.0/src/scrufflehog/advisor.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""Optional advisor interface — the ONLY place non-determinism may enter.
+An advisor proposes INPUTS and HYPOTHESES; it never renders a verdict. The
+deterministic oracle still decides every defect. The default NoopAdvisor makes
+the engine byte-identical to a pure deterministic run — the agentic layer is
+strictly additive and opt-in. See docs/AGENTIC.md.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Protocol, runtime_checkable
+from .probes import Probe
+class CoverageVerdict:
+    """String constants for the outcome of a coverage-gap confirmation.
+    These are plain class attributes (not an Enum), so each verdict is an
+    ordinary ``str``.
+    """
+    CONFIRMED = "confirmed"      # a real field with this name reaches the redactor
+    REFUTED = "refuted"          # no such field is logged here — drop the finding
+    UNCONFIRMED = "unconfirmed"  # can't tell — finding stands as a hypothesis
+@runtime_checkable
+class Advisor(Protocol):
+    """Structural interface for the optional advisor.
+    Any object providing these three methods satisfies the protocol.
+    ``@runtime_checkable`` also permits ``isinstance(obj, Advisor)``, which
+    checks only that the methods exist, not their signatures.
+    """
+    # Propose additional probes for a redactor given its source text. `entry`
+    # is presumably that redactor's config entry — TODO confirm against the caller.
+    def propose_probes(self, redactor_src: str, entry: dict) -> list[Probe]: ...
+    # Propose redactor entries found under `target`; each is returned as a dict.
+    def discover_redactors(self, target: Path) -> list[dict]: ...
+    # Assess a coverage gap for `field`. NoopAdvisor returns a CoverageVerdict
+    # constant here, so other implementations presumably should as well.
+    def confirm_coverage_gap(self, target: Path, field: str, redactor: str) -> str: ...
+class NoopAdvisor:
+    """Default. Adds nothing; the run stays purely deterministic.
+    Every method ignores its arguments and returns a fixed, empty or neutral
+    value.
+    """
+    def propose_probes(self, redactor_src: str, entry: dict) -> list[Probe]:
+        # Contribute no extra probes.
+        return []
+    def discover_redactors(self, target: Path) -> list[dict]:
+        # Discover nothing.
+        return []
+    def confirm_coverage_gap(self, target: Path, field: str, redactor: str) -> str:
+        # Never confirm or refute; the finding is left as a hypothesis.
+        return CoverageVerdict.UNCONFIRMED

scrufflehog-0.1.0/src/scrufflehog/advisors/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ """Optional advisors. Import the concrete one you want; the core never imports
2	+ these, so the base package stays dependency-free and deterministic."""