PyPI - tesserakit-api - Versions diffs - 0.3.1__py3-none-any.whl - Mend

tesserakit-api 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

tessera_api/__init__.py +3 -0
tessera_api/cli.py +45 -0
tessera_api/compiler.py +176 -0
tessera_api/curl.py +262 -0
tessera_api/loader.py +42 -0
tessera_api/pack.py +36 -0
tessera_api/redact.py +143 -0
tessera_api/schema.py +45 -0
tessera_api/validator.py +96 -0
tesserakit_api-0.3.1.dist-info/METADATA +69 -0
tesserakit_api-0.3.1.dist-info/RECORD +13 -0
tesserakit_api-0.3.1.dist-info/WHEEL +4 -0
tesserakit_api-0.3.1.dist-info/entry_points.txt +5 -0

tessera_api/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Tessera api pack."""
+__version__ = "0.3.1"

tessera_api/cli.py ADDED Viewed

@@ -0,0 +1,45 @@
+from __future__ import annotations
+from pathlib import Path
+import typer
+from rich.console import Console
+from rich.table import Table
+from tessera_core.models import RunContext
+from tessera_api.pack import ApiPack
+# Shared rich console used by compile_cmd to print its result tables.
+console = Console()
+# Typer command group for this pack; register() mounts it on a root app as "api".
+api_app = typer.Typer(help="Parse curl/HTTP traces into a validated, redacted API surface map.")
+@api_app.command("compile")
+def compile_cmd(
+    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="A .curl/.sh file or a directory of them."),
+    output: Path = typer.Option(Path("api_pack"), "--output", "-o", help="Output directory."),
+) -> None:
+    """Parse curl commands into canonical, secret-redacted API request records."""
+    ctx = RunContext(job_name="api", output_dir=output)
+    pack = ApiPack()
+    artifacts = pack.run(input_path=input, ctx=ctx, options={})
+    table = Table(title="API Pack Created")
+    table.add_column("Artifact")
+    table.add_column("Path")
+    table.add_column("Kind")
+    for art in artifacts:
+        table.add_row(art.name, str(art.path), art.kind)
+    console.print(table)
+    summary = Table(title="Run Summary")
+    summary.add_column("Metric")
+    summary.add_column("Value")
+    summary.add_row("run_id", ctx.run_id)
+    summary.add_row("records", str(ctx.metadata.get("record_count", 0)))
+    summary.add_row("findings", str(ctx.metadata.get("finding_count", 0)))
+    console.print(summary)
+def register(root_app: typer.Typer) -> None:
+    """Mount the API command group (``api_app``) on ``root_app`` under the name ``api``."""
+    root_app.add_typer(api_app, name="api")

tessera_api/compiler.py ADDED Viewed

@@ -0,0 +1,176 @@
+from __future__ import annotations
+from collections import Counter
+from pathlib import Path
+from typing import Any
+from tessera_core.artifacts import write_jsonl, write_markdown
+from tessera_core.models import Artifact, RunContext, ValidationFinding
+from tessera_api.loader import load_api_records
+from tessera_api.schema import ApiRequest
+from tessera_api.validator import validate_api_records
+def load_records(input_path: Path, options: dict[str, Any]) -> list[ApiRequest]:
+    """Load request records from ``input_path`` by delegating to ``load_api_records``."""
+    return load_api_records(input_path, options)
+def validate_records(records: list[ApiRequest], options: dict[str, Any]) -> list[ValidationFinding]:
+    findings: list[ValidationFinding] = []
+    for err in options.get("_parse_errors", []):
+        findings.append(
+            ValidationFinding(
+                severity="error",
+                code="parse_error",
+                message=f"failed to parse curl: {err['error']} (near: {err.get('preview', '')})",
+                field=None,
+                metadata={"source_file": err.get("source_file", "")},
+            )
+        )
+    findings.extend(validate_api_records(records))
+    return findings
+def write_artifacts(
+    records: list[ApiRequest],
+    ctx: RunContext,
+    options: dict[str, Any],
+) -> list[Artifact]:
+    ctx.output_dir.mkdir(parents=True, exist_ok=True)
+    findings: list[ValidationFinding] = (
+        ctx.metadata.get("findings") or validate_records(records, options)
+    )
+    index_jsonl = ctx.output_dir / "index.jsonl"
+    index_md = ctx.output_dir / "index.md"
+    validation_md = ctx.output_dir / "validation_report.md"
+    coverage_md = ctx.output_dir / "coverage_report.md"
+    redactions_md = ctx.output_dir / "redactions_report.md"
+    write_jsonl(index_jsonl, [r.model_dump() for r in records])
+    write_markdown(index_md, _render_index(records))
+    write_markdown(validation_md, _render_validation(records, findings, options))
+    write_markdown(coverage_md, _render_coverage(records))
+    write_markdown(redactions_md, _render_redactions(records))
+    return [
+        Artifact(name="index.jsonl", path=index_jsonl, kind="jsonl"),
+        Artifact(name="index.md", path=index_md, kind="markdown"),
+        Artifact(name="validation_report.md", path=validation_md, kind="markdown"),
+        Artifact(name="coverage_report.md", path=coverage_md, kind="markdown"),
+        Artifact(name="redactions_report.md", path=redactions_md, kind="markdown"),
+    ]
+def _render_index(records: list[ApiRequest]) -> str:
+    lines = ["# API Request Catalog", ""]
+    lines.append(f"- Total requests: {len(records)}")
+    lines.append("")
+    if not records:
+        lines.append("_No requests found._")
+        return "\n".join(lines) + "\n"
+    lines.append("| ID | Method | Host | Path | Auth | Body | Redactions |")
+    lines.append("|---|---|---|---|---|---|---:|")
+    for r in records:
+        lines.append(
+            f"| `{r.id}` | {r.method} | {r.host} | {r.path} "
+            f"| {r.auth.kind} | {r.body_kind} | {len(r.redactions)} |"
+        )
+    lines.append("")
+    return "\n".join(lines)
+def _render_validation(
+    records: list[ApiRequest],
+    findings: list[ValidationFinding],
+    options: dict[str, Any],
+) -> str:
+    lines = ["# Validation Report", ""]
+    lines.append(f"- Total requests: {len(records)}")
+    lines.append(f"- Findings: {len(findings)}")
+    lines.append(f"- Parse errors: {len(options.get('_parse_errors', []))}")
+    lines.append("")
+    by_severity = Counter(f.severity for f in findings)
+    lines.append("## Severity Breakdown")
+    lines.append("")
+    for sev in ("error", "warning", "info"):
+        lines.append(f"- {sev}: {by_severity.get(sev, 0)}")
+    lines.append("")
+    if findings:
+        lines.append("## Findings")
+        lines.append("")
+        for f in findings[:200]:
+            ident = f.metadata.get("id", "") if f.metadata else ""
+            who = f" `{ident}`" if ident else ""
+            field_part = f" [{f.field}]" if f.field else ""
+            lines.append(f"- **{f.severity.upper()}** `{f.code}`{who}{field_part}: {f.message}")
+        if len(findings) > 200:
+            lines.append(f"- ... {len(findings) - 200} more findings omitted")
+    return "\n".join(lines)
+def _render_coverage(records: list[ApiRequest]) -> str:
+    lines = ["# Coverage Report", ""]
+    lines.append(f"- Total requests: {len(records)}")
+    if not records:
+        return "\n".join(lines) + "\n"
+    method_dist = Counter(r.method for r in records)
+    host_dist = Counter(r.host for r in records)
+    auth_dist = Counter(r.auth.kind for r in records)
+    insecure = sum(1 for r in records if r.scheme == "http")
+    lines.append(f"- Insecure (http) requests: {insecure}")
+    lines.append("")
+    lines.append("## Methods")
+    lines.append("")
+    for method, count in method_dist.most_common():
+        lines.append(f"- `{method}`: {count}")
+    lines.append("")
+    lines.append("## Hosts")
+    lines.append("")
+    for host, count in host_dist.most_common():
+        lines.append(f"- `{host}`: {count}")
+    lines.append("")
+    lines.append("## Auth kinds")
+    lines.append("")
+    for kind, count in auth_dist.most_common():
+        lines.append(f"- `{kind}`: {count}")
+    return "\n".join(lines) + "\n"
+def _render_redactions(records: list[ApiRequest]) -> str:
+    lines = ["# Redactions Report", ""]
+    total = sum(len(r.redactions) for r in records)
+    lines.append(f"- Total redactions: {total}")
+    lines.append("")
+    lines.append("Every secret below was masked before any artifact was written. "
+                 "Previews reveal at most a couple of leading characters and the length.")
+    lines.append("")
+    if total == 0:
+        lines.append("_No secrets detected._")
+        return "\n".join(lines) + "\n"
+    kind_dist: Counter[str] = Counter()
+    for r in records:
+        for red in r.redactions:
+            kind_dist[red.kind] += 1
+    lines.append("## By kind")
+    lines.append("")
+    for kind, count in kind_dist.most_common():
+        lines.append(f"- `{kind}`: {count}")
+    lines.append("")
+    lines.append("## Detail")
+    lines.append("")
+    lines.append("| Request | Location | Kind | Preview |")
+    lines.append("|---|---|---|---|")
+    for r in records:
+        for red in r.redactions:
+            lines.append(f"| `{r.id}` | {red.location} | {red.kind} | `{red.preview}` |")
+    return "\n".join(lines) + "\n"

tessera_api/curl.py ADDED Viewed

@@ -0,0 +1,262 @@
+"""Parse curl commands into canonical, redacted ApiRequest records."""
+from __future__ import annotations
+import shlex
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+from tessera_api.redact import (
+    _TOKEN_PATTERNS,
+    auth_token_value,
+    classify_header_secret,
+    detect_secret_shape,
+    is_secret_header,
+    is_secret_query,
+    mask,
+)
+from tessera_api.schema import ApiAuth, ApiRequest, Redaction
+def split_curl_commands(text: str) -> list[str]:
+    """Split a file's text into individual curl command strings.
+    Line continuations (trailing backslash) are joined first; then each block
+    that begins with a ``curl`` token starts a new command.
+    """
+    joined = text.replace("\\\n", " ")
+    commands: list[str] = []
+    current: list[str] = []
+    for raw_line in joined.splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
+            continue
+        starts = line.split(None, 1)[0] == "curl" if line.split() else False
+        if starts and current:
+            commands.append(" ".join(current))
+            current = [line]
+        else:
+            current.append(line)
+    if current:
+        commands.append(" ".join(current))
+    return [c for c in commands if c.strip().startswith("curl")]
+def parse_curl(command: str, record_id: str) -> ApiRequest:
+    """Parse a single curl command string into a redacted ApiRequest.
+    Raises ValueError if the command cannot be tokenized or has no URL.
+    """
+    try:
+        tokens = shlex.split(command)
+    except ValueError as exc:
+        raise ValueError(f"cannot tokenize curl command: {exc}") from exc
+    if not tokens or tokens[0] != "curl":
+        raise ValueError("not a curl command")
+    method: str | None = None
+    url: str | None = None
+    headers: dict[str, str] = {}
+    redactions: list[Redaction] = []
+    auth = ApiAuth()
+    body: str | None = None
+    body_kind = "none"
+    basic_user_pass: str | None = None
+    i = 1
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok in ("-X", "--request"):
+            method = tokens[i + 1] if i + 1 < len(tokens) else method
+            i += 2
+            continue
+        if tok in ("-H", "--header"):
+            raw = tokens[i + 1] if i + 1 < len(tokens) else ""
+            _ingest_header(raw, headers, redactions, auth)
+            i += 2
+            continue
+        if tok in ("-u", "--user"):
+            basic_user_pass = tokens[i + 1] if i + 1 < len(tokens) else ""
+            i += 2
+            continue
+        if tok in ("-d", "--data", "--data-raw", "--data-binary", "--data-ascii"):
+            raw_body = tokens[i + 1] if i + 1 < len(tokens) else ""
+            body_kind = "json" if _looks_json(raw_body) else "form"
+            body, body_redactions = _redact_body(raw_body)
+            redactions.extend(body_redactions)
+            i += 2
+            continue
+        if tok in ("--url",):
+            url = tokens[i + 1] if i + 1 < len(tokens) else url
+            i += 2
+            continue
+        if tok in ("--compressed", "-s", "--silent", "-L", "--location", "-k", "--insecure", "-i", "--include", "-v", "--verbose", "-g", "--globoff"):
+            i += 1
+            continue
+        if tok.startswith("-"):
+            # Unknown flag; skip it and a value if the next token is not a URL.
+            if i + 1 < len(tokens) and not _is_url(tokens[i + 1]) and not tokens[i + 1].startswith("-"):
+                i += 2
+            else:
+                i += 1
+            continue
+        # positional: treat as URL
+        if url is None and _is_url(tok):
+            url = tok
+        i += 1
+    if url is None:
+        raise ValueError("no URL found in curl command")
+    # Basic auth via -u
+    if basic_user_pass is not None:
+        user = basic_user_pass.split(":", 1)[0]
+        auth = ApiAuth(kind="basic", location="flag:-u", present=True)
+        redactions.append(
+            Redaction(location="flag:-u", kind="basic_credentials", preview=f"{mask(user)} : (password redacted)")
+        )
+    scheme, host, path, redacted_query, query_map, url_redactions = _split_and_redact_url(url)
+    redactions.extend(url_redactions)
+    # If a secret query param looks like auth and no header auth was found.
+    if auth.kind == "none":
+        for qname in query_map:
+            if qname.lower() in ("api_key", "apikey", "key", "access_token", "token"):
+                auth = ApiAuth(kind="api_key_query", location=f"query:{qname}", present=True)
+                break
+    if method is None:
+        method = "POST" if body is not None else "GET"
+    redacted_url = urlunsplit((scheme, host, path, redacted_query, ""))
+    return ApiRequest(
+        id=record_id,
+        method=method.upper(),
+        url=redacted_url,
+        scheme=scheme,
+        host=host,
+        path=path,
+        query=query_map,
+        headers=headers,
+        body=body,  # already redacted in the -d handler
+        body_kind=body_kind,
+        auth=auth,
+        redactions=redactions,
+        # Note: the raw command is never stored; it contains the unredacted
+        # secrets we just stripped. Only a safe synthesized summary is kept.
+        metadata={"summary": f"{method.upper()} {host}{path}"},
+    )
+def _ingest_header(raw: str, headers: dict[str, str], redactions: list[Redaction], auth: ApiAuth) -> None:
+    if ":" not in raw:
+        headers[raw.strip()] = ""
+        return
+    name, value = raw.split(":", 1)
+    name = name.strip()
+    value = value.strip()
+    if is_secret_header(name):
+        kind = classify_header_secret(name, value)
+        cred = auth_token_value(value)
+        redactions.append(Redaction(location=f"header:{name.lower()}", kind=kind, preview=mask(cred)))
+        headers[name] = "(redacted)"
+        if kind == "bearer_token":
+            auth.kind = "bearer"
+            auth.location = f"header:{name}"
+            auth.present = True
+        elif kind == "basic_credentials":
+            auth.kind = "basic"
+            auth.location = f"header:{name}"
+            auth.present = True
+        elif kind == "api_key":
+            auth.kind = "api_key_header"
+            auth.location = f"header:{name}"
+            auth.present = True
+    else:
+        # not a known secret-named header: still screen the value by shape
+        shape = detect_secret_shape(value)
+        if shape:
+            redactions.append(Redaction(location=f"header:{name.lower()}", kind=shape, preview=mask(value)))
+            headers[name] = "(redacted)"
+        else:
+            headers[name] = value
+def _split_and_redact_url(url: str):
+    parts = urlsplit(url)
+    query_pairs = parse_qsl(parts.query, keep_blank_values=True)
+    redactions: list[Redaction] = []
+    redacted_pairs: list[tuple[str, str]] = []
+    query_map: dict[str, str] = {}
+    for k, v in query_pairs:
+        if is_secret_query(k):
+            redactions.append(Redaction(location=f"query:{k}", kind="api_key", preview=mask(v)))
+            redacted_pairs.append((k, "(redacted)"))
+            query_map[k] = "(redacted)"
+        else:
+            shape = detect_secret_shape(v)
+            if shape:
+                redactions.append(Redaction(location=f"query:{k}", kind=shape, preview=mask(v)))
+                redacted_pairs.append((k, "(redacted)"))
+                query_map[k] = "(redacted)"
+            else:
+                redacted_pairs.append((k, v))
+                query_map[k] = v
+    redacted_query = urlencode(redacted_pairs)
+    return parts.scheme, parts.netloc, parts.path, redacted_query, query_map, redactions
+def _redact_body(body: str) -> tuple[str, list[Redaction]]:
+    """Redact secret-keyed fields in a JSON-ish or form body, reporting each.
+    Returns the redacted body and a Redaction per field masked, so body
+    secrets appear in the audit trail like header and query secrets do.
+    """
+    import re
+    redactions: list[Redaction] = []
+    secret_keys = r"password|passwd|pwd|secret|client_secret|token|access_token|api_key|apikey"
+    def json_sub(m: "re.Match[str]") -> str:
+        key, value = m.group(1), m.group(3)
+        redactions.append(Redaction(location=f"body:{key.lower()}", kind="body_secret", preview=mask(value)))
+        return f'{m.group(2)}(redacted)"'
+    def form_sub(m: "re.Match[str]") -> str:
+        key, value = m.group(1), m.group(2)
+        redactions.append(Redaction(location=f"body:{key.lower()}", kind="body_secret", preview=mask(value)))
+        return f"{key}=(redacted)"
+    redacted = re.sub(
+        rf'"({secret_keys})"(\s*:\s*")([^"]*)"',
+        json_sub,
+        body,
+        flags=re.IGNORECASE,
+    )
+    redacted = re.sub(
+        rf"\b({secret_keys})=([^&\s]+)",
+        form_sub,
+        redacted,
+        flags=re.IGNORECASE,
+    )
+    # shape-based pass: provider tokens embedded anywhere in the body text,
+    # regardless of the surrounding key name (catches secrets in odd fields)
+    for kind, pat in _TOKEN_PATTERNS:
+        def tok_sub(m: "re.Match[str]", _kind: str = kind) -> str:
+            redactions.append(Redaction(location="body", kind=_kind, preview=mask(m.group(0))))
+            return "(redacted)"
+        redacted = pat.sub(tok_sub, redacted)
+    return redacted, redactions
+def _looks_json(body: str) -> bool:
+    s = body.strip()
+    return s.startswith("{") or s.startswith("[")
+def _is_url(token: str) -> bool:
+    return token.startswith("http://") or token.startswith("https://")

tessera_api/loader.py ADDED Viewed

@@ -0,0 +1,42 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from tessera_api.curl import parse_curl, split_curl_commands
+from tessera_api.schema import ApiRequest
+def discover_curl_files(root: Path) -> list[Path]:
+    """Find curl-bearing files: ``*.curl`` and ``*.sh`` (or a single file)."""
+    if root.is_file():
+        return [root]
+    found: list[Path] = []
+    for path in sorted(root.rglob("*")):
+        if path.is_file() and path.suffix in (".curl", ".sh"):
+            found.append(path)
+    return found
+def load_api_records(input_path: Path, options: dict[str, Any]) -> list[ApiRequest]:
+    """Parse every curl command in the input into redacted ApiRequest records."""
+    files = discover_curl_files(input_path)
+    records: list[ApiRequest] = []
+    parse_errors: list[dict[str, str]] = []
+    seq = 0
+    for path in files:
+        text = path.read_text(encoding="utf-8")
+        for cmd in split_curl_commands(text):
+            seq += 1
+            rid = f"{path.stem}_{seq}"
+            try:
+                rec = parse_curl(cmd, rid)
+                rec.metadata["source_file"] = str(path)
+                records.append(rec)
+            except ValueError as exc:
+                parse_errors.append({"source_file": str(path), "error": str(exc), "preview": cmd[:80]})
+    options["_parse_errors"] = parse_errors
+    options["_input_path"] = str(input_path)
+    return records

tessera_api/pack.py ADDED Viewed

@@ -0,0 +1,36 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from tessera_core.jobpack import JobPack
+from tessera_core.models import Artifact, RunContext, ValidationFinding
+from tessera_api.compiler import load_records, validate_records, write_artifacts
+class ApiPack(JobPack):
+    """Job pack for API request catalogs.
+    Each hook delegates to the function of the matching role in
+    ``tessera_api.compiler``: ``load_records``, ``validate_records`` and
+    ``write_artifacts``.
+    """
+    # Pack identifier and pack version.
+    name = "api"
+    version = "0.3.1"
+    def normalize(self, input_path: Path, options: dict[str, Any]) -> list[Any]:
+        """Load records from ``input_path`` via ``load_records``."""
+        return load_records(input_path, options)
+    def validate(
+        self,
+        records: list[Any],
+        options: dict[str, Any],
+    ) -> list[ValidationFinding]:
+        """Return the findings produced by ``validate_records``."""
+        return validate_records(records, options)
+    def generate(
+        self,
+        records: list[Any],
+        ctx: RunContext,
+        options: dict[str, Any],
+    ) -> list[Artifact]:
+        """Write the output files via ``write_artifacts`` and return their descriptors."""
+        return write_artifacts(records, ctx, options)
+def create_pack() -> ApiPack:
+    """Return a new ``ApiPack`` instance."""
+    return ApiPack()

tessera_api/redact.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""Secret detection and masking.
+The contract for this module: given a raw value that may be a secret, return a
+masked preview that reveals at most a few leading characters and never the tail.
+All redaction happens before a value is written into an ``ApiRequest``; the
+canonical record and every artifact hold only masked previews.
+"""
+from __future__ import annotations
+import math
+import re
+# Header names whose values are always treated as secret.
+# Entries are lowercase; is_secret_header() lowercases the name before lookup.
+SECRET_HEADER_NAMES = {
+    "authorization",
+    "proxy-authorization",
+    "x-api-key",
+    "api-key",
+    "apikey",
+    "x-auth-token",
+    "x-auth",
+    "x-access-token",
+    "x-secret",
+    "x-amz-security-token",
+    "cookie",
+    "set-cookie",
+}
+# Query parameter names whose values are always treated as secret.
+# Entries are lowercase; is_secret_query() lowercases the name before lookup.
+SECRET_QUERY_NAMES = {
+    "api_key",
+    "apikey",
+    "key",
+    "token",
+    "access_token",
+    "auth",
+    "auth_token",
+    "secret",
+    "client_secret",
+    "password",
+    "passwd",
+    "pwd",
+    "sig",
+    "signature",
+    "sas",
+}
+# NOTE(review): not referenced anywhere in the visible source, and the literal
+# lacks a closing parenthesis - looks like a leftover; confirm before relying on it.
+_MASK = "(redacted"
+def mask(value: str, lead: int = 2) -> str:
+    """Return a masked preview: a few leading chars plus length, never the tail."""
+    value = value or ""
+    n = len(value)
+    if n == 0:
+        return "(redacted, empty)"
+    if n <= lead:
+        return f"…(redacted, len={n})"
+    return f"{value[:lead]}…(redacted, len={n})"
+def is_secret_header(name: str) -> bool:
+    """Return True when ``name``, stripped and lowercased, is in ``SECRET_HEADER_NAMES``."""
+    return name.strip().lower() in SECRET_HEADER_NAMES
+def is_secret_query(name: str) -> bool:
+    """Return True when ``name``, stripped and lowercased, is in ``SECRET_QUERY_NAMES``."""
+    return name.strip().lower() in SECRET_QUERY_NAMES
+def classify_header_secret(name: str, value: str) -> str:
+    """Return a redaction kind label for a secret header value."""
+    lname = name.strip().lower()
+    if lname in ("authorization", "proxy-authorization"):
+        low = value.strip().lower()
+        if low.startswith("bearer "):
+            return "bearer_token"
+        if low.startswith("basic "):
+            return "basic_credentials"
+        return "authorization_value"
+    if lname in ("cookie", "set-cookie"):
+        return "cookie"
+    return "api_key"
+def auth_token_value(value: str) -> str:
+    """Strip the scheme prefix (Bearer/Basic) so we mask only the credential."""
+    m = re.match(r"^\s*(bearer|basic)\s+(.*)$", value, re.IGNORECASE)
+    if m:
+        return m.group(2)
+    return value
+# --- shape-based secret detection -------------------------------------------
+# High-confidence provider token patterns: (kind, pattern). These catch secrets
+# regardless of the field name they appear in.
+_TOKEN_PATTERNS: list[tuple[str, re.Pattern]] = [
+    ("aws_access_key_id", re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b")),
+    ("github_token", re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,}\b")),
+    ("github_pat", re.compile(r"\bgithub_pat_[A-Za-z0-9_]{40,}\b")),
+    ("slack_token", re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b")),
+    ("stripe_key", re.compile(r"\b(?:sk|rk|pk)_(?:live|test)_[A-Za-z0-9]{16,}\b")),
+    ("google_api_key", re.compile(r"\bAIza[0-9A-Za-z_\-]{35}\b")),
+    ("openai_key", re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")),
+    ("jwt", re.compile(r"\beyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b")),
+    ("private_key_block", re.compile(r"-----BEGIN (?:RSA |EC |OPENSSH |PGP )?PRIVATE KEY-----")),
+]
+def _shannon_entropy(s: str) -> float:
+    if not s:
+        return 0.0
+    counts: dict[str, int] = {}
+    for ch in s:
+        counts[ch] = counts.get(ch, 0) + 1
+    n = len(s)
+    return -sum((c / n) * math.log2(c / n) for c in counts.values())
+def detect_secret_shape(value: str) -> str | None:
+    """Return a secret-kind label if the value *looks* like a secret, else None.
+    First tries precise provider patterns, then a conservative high-entropy
+    heuristic for long, space-free, mixed-charset tokens.
+    """
+    v = (value or "").strip()
+    if not v:
+        return None
+    for kind, pat in _TOKEN_PATTERNS:
+        if pat.search(v):
+            return kind
+    # Common non-secret identifiers that would otherwise look high-entropy.
+    if _UUID_RE.fullmatch(v):
+        return None
+    # entropy fallback: long, no spaces, looks token-ish (not a sentence/URL/path)
+    if len(v) >= 24 and " " not in v and "/" not in v and not v.startswith(("http://", "https://")):
+        token_chars = re.fullmatch(r"[A-Za-z0-9+/=_\-\.]+", v)
+        if token_chars and _shannon_entropy(v) >= 3.5:
+            return "high_entropy_value"
+    return None
+# Hyphenated 8-4-4-4-12 hex UUID; used with fullmatch() in detect_secret_shape
+# to exempt UUIDs from the entropy fallback.
+_UUID_RE = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")

tessera_api/schema.py ADDED Viewed

@@ -0,0 +1,45 @@
+from __future__ import annotations
+from typing import Any, Literal
+from pydantic import BaseModel, Field
+# Allowed values for ApiAuth.kind.
+AuthKind = Literal["bearer", "basic", "api_key_header", "api_key_query", "none"]
+# Allowed values for ApiRequest.body_kind.
+BodyKind = Literal["json", "form", "text", "none"]
+class Redaction(BaseModel):
+    """A record of one secret that was removed before canonicalization.
+    ``preview`` holds a masked hint only (never the full secret), so the
+    redactions report is safe to commit and review.
+    """
+    location: str  # e.g. "header:authorization", "query:api_key", "body"
+    kind: str  # e.g. "bearer_token", "basic_credentials", "api_key"
+    preview: str  # e.g. "sk…(redacted, len=51)"
+class ApiAuth(BaseModel):
+    """How a request authenticates; the defaults describe a request with no auth."""
+    kind: AuthKind = "none"
+    location: str = ""  # e.g. "header:Authorization", "query:api_key"
+    present: bool = False  # set to True wherever the parser assigns a kind
+class ApiRequest(BaseModel):
+    """Canonical, secret-free API request record. Serialized to ``index.jsonl``."""
+    id: str  # record id assigned by the loader
+    method: str = "GET"
+    url: str = ""  # redacted form (query secrets masked)
+    scheme: str = ""
+    host: str = ""
+    path: str = ""
+    query: dict[str, str] = Field(default_factory=dict)  # redacted values
+    headers: dict[str, str] = Field(default_factory=dict)  # redacted values
+    body: str | None = None  # redacted
+    body_kind: BodyKind = "none"
+    auth: ApiAuth = Field(default_factory=ApiAuth)
+    redactions: list[Redaction] = Field(default_factory=list)  # one entry per masked secret
+    tags: list[str] = Field(default_factory=list)
+    metadata: dict[str, Any] = Field(default_factory=dict)  # e.g. "summary", "source_file"

tessera_api/validator.py ADDED Viewed

@@ -0,0 +1,96 @@
+from __future__ import annotations
+from collections import Counter
+from tessera_core.models import ValidationFinding
+from tessera_api.redact import SECRET_HEADER_NAMES, SECRET_QUERY_NAMES
+from tessera_api.schema import ApiRequest
+def validate_api_records(records: list[ApiRequest]) -> list[ValidationFinding]:
+    findings: list[ValidationFinding] = []
+    for r in records:
+        findings.extend(_validate_one(r))
+    # Cross-record: duplicate method+url+body
+    seen: dict[tuple[str, str, str | None], int] = Counter()
+    for r in records:
+        seen[(r.method, r.url, r.body)] += 1
+    for (method, url, _body), count in seen.items():
+        if count > 1:
+            findings.append(
+                ValidationFinding(
+                    severity="info",
+                    code="duplicate_request",
+                    message=f"{count} identical requests: {method} {url}",
+                    field=None,
+                    metadata={"method": method, "url": url, "count": count},
+                )
+            )
+    # Cross-record: surface multiple hosts (not an error, just visibility)
+    hosts = sorted({r.host for r in records if r.host})
+    if len(hosts) > 1:
+        findings.append(
+            ValidationFinding(
+                severity="info",
+                code="multiple_hosts",
+                message=f"requests span {len(hosts)} hosts: {', '.join(hosts)}",
+                field="host",
+                metadata={"hosts": hosts},
+            )
+        )
+    return findings
+def _validate_one(r: ApiRequest) -> list[ValidationFinding]:
+    findings: list[ValidationFinding] = []
+    src = r.metadata.get("source_file", "")
+    def f(severity: str, code: str, message: str, field: str | None = None) -> ValidationFinding:
+        return ValidationFinding(
+            severity=severity, code=code, message=message, field=field,
+            metadata={"id": r.id, "source_file": src},
+        )
+    if r.scheme == "http":
+        findings.append(f("warning", "insecure_scheme",
+                          f"{r.method} {r.host}{r.path} uses http; credentials and data are sent in cleartext",
+                          "scheme"))
+    if not r.host:
+        findings.append(f("error", "missing_host", "request has no host", "host"))
+    # A secret in the query string is worse than in a header: URLs get logged.
+    query_redactions = [red for red in r.redactions if red.location.startswith("query:")]
+    if query_redactions:
+        names = ", ".join(red.location.split(":", 1)[1] for red in query_redactions)
+        findings.append(f("warning", "secret_in_url_query",
+                          f"secret(s) in URL query ({names}); URLs are commonly logged, prefer a header",
+                          "query"))
+    if not r.auth.present:
+        findings.append(f("info", "no_auth_detected",
+                          f"{r.method} {r.host}{r.path} has no detectable auth", "auth"))
+    # A secret found by shape in a field whose NAME is not a known secret name
+    # is high-signal: a custom auth header or a token hiding in an odd field.
+    for red in r.redactions:
+        loc = red.location
+        if loc.startswith("header:"):
+            name = loc.split(":", 1)[1]
+            if name not in SECRET_HEADER_NAMES:
+                findings.append(f("warning", "secret_in_nonstandard_location",
+                                  f"a {red.kind} was detected in header '{name}', which is not a conventional secret header",
+                                  "headers"))
+        elif loc.startswith("query:"):
+            name = loc.split(":", 1)[1]
+            if name not in SECRET_QUERY_NAMES:
+                findings.append(f("warning", "secret_in_nonstandard_location",
+                                  f"a {red.kind} was detected in query param '{name}', which is not a conventional secret name",
+                                  "query"))
+    return findings

tesserakit_api-0.3.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,69 @@
+Metadata-Version: 2.4
+Name: tesserakit-api
+Version: 0.3.1
+Summary: API job pack for Tessera: parse curl/HTTP traces into a validated, secret-redacted API surface map.
+Author: Tessera
+Classifier: Development Status :: 3 - Alpha
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10
+Requires-Dist: pydantic>=2.7
+Requires-Dist: rich>=13.7
+Requires-Dist: tesserakit-core>=0.1.0
+Requires-Dist: typer>=0.12
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Description-Content-Type: text/markdown
+# tessera-api
+Turn messy curl commands and HTTP traces into a validated, secret-redacted API surface map.
+`tessera-api` reads `.curl` / `.sh` files containing curl commands, parses each into a canonical `ApiRequest`, **redacts every secret at parse time**, profiles the API surface, and emits a catalog plus reports — including a redactions audit.
+## Scope (v0.1)
+This pack parses and canonicalizes. It does **not** execute HTTP requests. Live calling, batch execution, and streaming response capture are runtime concerns with network side effects and are intentionally deferred to a later version. v0.1 is the offline, side-effect-free "what does this API surface look like, and does it leak secrets" pass.
+## Secret safety
+Redaction happens before a value is ever written into an `ApiRequest`. The canonical records and every artifact hold only masked previews (a couple of leading characters plus a length, never the tail). Secrets are detected by:
+- known secret header names (`Authorization`, `X-Api-Key`, `Cookie`, ...)
+- known secret query parameter names (`api_key`, `token`, `access_token`, `signature`, ...)
+- `-u user:pass` basic-auth flags
+- secret-ish keys inside request bodies (`password`, `client_secret`, `token`, ...)
+- **secret *shape* (v0.2)** — values that look like secrets regardless of field name: AWS keys (`AKIA…`), GitHub tokens (`ghp_…`), Slack/Stripe/Google/OpenAI keys, JWTs, private-key blocks, and high-entropy token strings. This catches secrets hiding in custom auth headers, odd query params, or body fields. When such a secret sits in a header or query param whose name is not a known secret name, validation also raises `secret_in_nonstandard_location` so you know a credential is somewhere unexpected. UUIDs and other common identifiers are excluded to avoid false positives.
+## Compile an API pack
+```bash
+tessera api compile --input examples/api/ --output ./out/api_pack
+```
+Artifacts written:
+```text
+index.jsonl              canonical, redacted ApiRequest rows
+index.md                 human-readable catalog (method, host, path, auth, redactions)
+validation_report.md     hygiene findings
+coverage_report.md       method / host / auth-kind distribution
+redactions_report.md     every redaction made, with masked previews (audit trail)
+```
+## Validation rules
+Per-request:
+- `insecure_scheme` — uses `http://` (cleartext)
+- `missing_host` — no host could be parsed
+- `secret_in_url_query` — a secret was found in the URL query (URLs get logged; prefer a header)
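+- `no_auth_detected` — no auth credential was found
+- `secret_in_nonstandard_location` — a secret was detected in a header or query param whose name is not a conventional secret name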
+Cross-request:
+- `duplicate_request` — identical method + url + body seen more than once
+- `multiple_hosts` — requests span more than one host (visibility, not an error)
+Plus `parse_error` for any curl command that cannot be tokenized or has no URL.

tesserakit_api-0.3.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+tessera_api/__init__.py,sha256=ioJRPCmtTsAbgOjBEcedthn8s5zvBurDgPDJHu_UsOA,47
+tessera_api/cli.py,sha256=dMWYZzU38cSpU_48Me8f97lmTpa7gjdW0EanXUTsHr0,1504
+tessera_api/compiler.py,sha256=0bsWAg8fld-xS2S8-56FlvnJ2onwTLbwzgyIUqe2vXU,6407
+tessera_api/curl.py,sha256=6hI21FAuychnJ7qk-0PB0Co_1zQe19Cfio1q-mMUO44,9459
+tessera_api/loader.py,sha256=gZ15mIv2ynRXsFM80co9f9lmo5dwp6jOTQwPoSAXjC8,1429
+tessera_api/pack.py,sha256=hcZR-qCIhlnCDoPEifmQ6U7yooqL8bF2gsuawWVICD0,905
+tessera_api/redact.py,sha256=3EAKS_TGyK76stOjp8_lTyFbldZ7J1oIvN6knJGqLW4,4542
+tessera_api/schema.py,sha256=pIj5mOT_oVIg3uTcZsIaHyYDTWAWBmRL1vbfjVp6AQE,1543
+tessera_api/validator.py,sha256=Q-vqUBLguCFmwlTyrAMXmN-6TDTewHepflJbZBnpiA0,3918
+tesserakit_api-0.3.1.dist-info/METADATA,sha256=p78eBa17flFKm41MLYZDOm8vZwoxW-EKI9hZPkBrAw8,3343
+tesserakit_api-0.3.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+tesserakit_api-0.3.1.dist-info/entry_points.txt,sha256=9jtB5v6G5Uz9lxvaGpBPLFqSv6vH2aR1_Jq_U-L2PKw,105
+tesserakit_api-0.3.1.dist-info/RECORD,,

tesserakit_api-0.3.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

tesserakit_api-0.3.1.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,5 @@
+[tessera.commands]
+api = tessera_api.cli:register
+[tessera.jobpacks]
+api = tessera_api.pack:create_pack