tesserakit-api 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tesserakit_api-0.3.1/.gitignore +8 -0
- tesserakit_api-0.3.1/PKG-INFO +69 -0
- tesserakit_api-0.3.1/README.md +51 -0
- tesserakit_api-0.3.1/pyproject.toml +35 -0
- tesserakit_api-0.3.1/src/tessera_api/__init__.py +3 -0
- tesserakit_api-0.3.1/src/tessera_api/cli.py +45 -0
- tesserakit_api-0.3.1/src/tessera_api/compiler.py +176 -0
- tesserakit_api-0.3.1/src/tessera_api/curl.py +262 -0
- tesserakit_api-0.3.1/src/tessera_api/loader.py +42 -0
- tesserakit_api-0.3.1/src/tessera_api/pack.py +36 -0
- tesserakit_api-0.3.1/src/tessera_api/redact.py +143 -0
- tesserakit_api-0.3.1/src/tessera_api/schema.py +45 -0
- tesserakit_api-0.3.1/src/tessera_api/validator.py +96 -0
- tesserakit_api-0.3.1/tests/fixtures/unparseable.curl +5 -0
- tesserakit_api-0.3.1/tests/test_api_pack.py +244 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tesserakit-api
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: API job pack for Tessera: parse curl/HTTP traces into a validated, secret-redacted API surface map.
|
|
5
|
+
Author: Tessera
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Environment :: Console
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: pydantic>=2.7
|
|
12
|
+
Requires-Dist: rich>=13.7
|
|
13
|
+
Requires-Dist: tesserakit-core>=0.1.0
|
|
14
|
+
Requires-Dist: typer>=0.12
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# tessera-api
|
|
20
|
+
|
|
21
|
+
Turn messy curl commands and HTTP traces into a validated, secret-redacted API surface map.
|
|
22
|
+
|
|
23
|
+
`tessera-api` reads `.curl` / `.sh` files containing curl commands, parses each into a canonical `ApiRequest`, **redacts every secret at parse time**, profiles the API surface, and emits a catalog plus reports — including a redactions audit.
|
|
24
|
+
|
|
25
|
+
## Scope (v0.1)
|
|
26
|
+
|
|
27
|
+
This pack parses and canonicalizes. It does **not** execute HTTP requests. Live calling, batch execution, and streaming response capture are runtime concerns with network side effects and are intentionally deferred to a later version. v0.1 is the offline, side-effect-free "what does this API surface look like, and does it leak secrets" pass.
|
|
28
|
+
|
|
29
|
+
## Secret safety
|
|
30
|
+
|
|
31
|
+
Redaction happens before a value is ever written into an `ApiRequest`. The canonical records and every artifact hold only masked previews (a couple of leading characters plus a length, never the tail). Secrets are detected by:
|
|
32
|
+
|
|
33
|
+
- known secret header names (`Authorization`, `X-Api-Key`, `Cookie`, ...)
|
|
34
|
+
- known secret query parameter names (`api_key`, `token`, `access_token`, `signature`, ...)
|
|
35
|
+
- `-u user:pass` basic-auth flags
|
|
36
|
+
- secret-ish keys inside request bodies (`password`, `client_secret`, `token`, ...)
|
|
37
|
+
- **secret *shape* (v0.2)** — values that look like secrets regardless of field name: AWS keys (`AKIA…`), GitHub tokens (`ghp_…`), Slack/Stripe/Google/OpenAI keys, JWTs, private-key blocks, and high-entropy token strings. This catches secrets hiding in custom auth headers, odd query params, or body fields, and raises `secret_in_nonstandard_location` so you know a credential is somewhere unexpected. UUIDs and other common identifiers are excluded to avoid false positives.
|
|
38
|
+
|
|
39
|
+
## Compile an API pack
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
tessera api compile --input examples/api/ --output ./out/api_pack
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Artifacts written:
|
|
46
|
+
|
|
47
|
+
```text
|
|
48
|
+
index.jsonl canonical, redacted ApiRequest rows
|
|
49
|
+
index.md human-readable catalog (method, host, path, auth, redactions)
|
|
50
|
+
validation_report.md hygiene findings
|
|
51
|
+
coverage_report.md method / host / auth-kind distribution
|
|
52
|
+
redactions_report.md every redaction made, with masked previews (audit trail)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Validation rules
|
|
56
|
+
|
|
57
|
+
Per-request:
|
|
58
|
+
|
|
59
|
+
- `insecure_scheme` — uses `http://` (cleartext)
|
|
60
|
+
- `missing_host` — no host could be parsed
|
|
61
|
+
- `secret_in_url_query` — a secret was found in the URL query (URLs get logged; prefer a header)
|
|
62
|
+
- `no_auth_detected` — no auth credential was found
|
|
63
|
+
|
|
64
|
+
Cross-request:
|
|
65
|
+
|
|
66
|
+
- `duplicate_request` — identical method + url + body seen more than once
|
|
67
|
+
- `multiple_hosts` — requests span more than one host (visibility, not an error)
|
|
68
|
+
|
|
69
|
+
Plus `parse_error` for any curl command that cannot be tokenized or has no URL.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# tessera-api
|
|
2
|
+
|
|
3
|
+
Turn messy curl commands and HTTP traces into a validated, secret-redacted API surface map.
|
|
4
|
+
|
|
5
|
+
`tessera-api` reads `.curl` / `.sh` files containing curl commands, parses each into a canonical `ApiRequest`, **redacts every secret at parse time**, profiles the API surface, and emits a catalog plus reports — including a redactions audit.
|
|
6
|
+
|
|
7
|
+
## Scope (v0.1)
|
|
8
|
+
|
|
9
|
+
This pack parses and canonicalizes. It does **not** execute HTTP requests. Live calling, batch execution, and streaming response capture are runtime concerns with network side effects and are intentionally deferred to a later version. v0.1 is the offline, side-effect-free "what does this API surface look like, and does it leak secrets" pass.
|
|
10
|
+
|
|
11
|
+
## Secret safety
|
|
12
|
+
|
|
13
|
+
Redaction happens before a value is ever written into an `ApiRequest`. The canonical records and every artifact hold only masked previews (a couple of leading characters plus a length, never the tail). Secrets are detected by:
|
|
14
|
+
|
|
15
|
+
- known secret header names (`Authorization`, `X-Api-Key`, `Cookie`, ...)
|
|
16
|
+
- known secret query parameter names (`api_key`, `token`, `access_token`, `signature`, ...)
|
|
17
|
+
- `-u user:pass` basic-auth flags
|
|
18
|
+
- secret-ish keys inside request bodies (`password`, `client_secret`, `token`, ...)
|
|
19
|
+
- **secret *shape* (v0.2)** — values that look like secrets regardless of field name: AWS keys (`AKIA…`), GitHub tokens (`ghp_…`), Slack/Stripe/Google/OpenAI keys, JWTs, private-key blocks, and high-entropy token strings. This catches secrets hiding in custom auth headers, odd query params, or body fields, and raises `secret_in_nonstandard_location` so you know a credential is somewhere unexpected. UUIDs and other common identifiers are excluded to avoid false positives.
|
|
20
|
+
|
|
21
|
+
## Compile an API pack
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
tessera api compile --input examples/api/ --output ./out/api_pack
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Artifacts written:
|
|
28
|
+
|
|
29
|
+
```text
|
|
30
|
+
index.jsonl canonical, redacted ApiRequest rows
|
|
31
|
+
index.md human-readable catalog (method, host, path, auth, redactions)
|
|
32
|
+
validation_report.md hygiene findings
|
|
33
|
+
coverage_report.md method / host / auth-kind distribution
|
|
34
|
+
redactions_report.md every redaction made, with masked previews (audit trail)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Validation rules
|
|
38
|
+
|
|
39
|
+
Per-request:
|
|
40
|
+
|
|
41
|
+
- `insecure_scheme` — uses `http://` (cleartext)
|
|
42
|
+
- `missing_host` — no host could be parsed
|
|
43
|
+
- `secret_in_url_query` — a secret was found in the URL query (URLs get logged; prefer a header)
|
|
44
|
+
- `no_auth_detected` — no auth credential was found
|
|
45
|
+
|
|
46
|
+
Cross-request:
|
|
47
|
+
|
|
48
|
+
- `duplicate_request` — identical method + url + body seen more than once
|
|
49
|
+
- `multiple_hosts` — requests span more than one host (visibility, not an error)
|
|
50
|
+
|
|
51
|
+
Plus `parse_error` for any curl command that cannot be tokenized or has no URL.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.25"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tesserakit-api"
|
|
7
|
+
version = "0.3.1"
|
|
8
|
+
description = "API job pack for Tessera: parse curl/HTTP traces into a validated, secret-redacted API surface map."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "Tessera" }]
|
|
12
|
+
dependencies = [
|
|
13
|
+
"tesserakit-core>=0.1.0",
|
|
14
|
+
"typer>=0.12",
|
|
15
|
+
"rich>=13.7",
|
|
16
|
+
"pydantic>=2.7",
|
|
17
|
+
]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Environment :: Console",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
dev = ["pytest>=8.0"]
|
|
27
|
+
|
|
28
|
+
[project.entry-points."tessera.commands"]
|
|
29
|
+
api = "tessera_api.cli:register"
|
|
30
|
+
|
|
31
|
+
[project.entry-points."tessera.jobpacks"]
|
|
32
|
+
api = "tessera_api.pack:create_pack"
|
|
33
|
+
|
|
34
|
+
[tool.hatch.build.targets.wheel]
|
|
35
|
+
packages = ["src/tessera_api"]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from tessera_core.models import RunContext
|
|
10
|
+
|
|
11
|
+
from tessera_api.pack import ApiPack
|
|
12
|
+
|
|
13
|
+
# Rich console used by the commands in this module to print their result tables.
console = Console()
# Typer sub-application that owns the ``compile`` command; ``register`` mounts
# it on the host CLI under the name ``api``.
api_app = typer.Typer(help="Parse curl/HTTP traces into a validated, redacted API surface map.")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@api_app.command("compile")
def compile_cmd(
    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="A .curl/.sh file or a directory of them."),
    output: Path = typer.Option(Path("api_pack"), "--output", "-o", help="Output directory."),
) -> None:
    """Parse curl commands into canonical, secret-redacted API request records."""
    # Run the pack end to end; the context collects run metadata as a side effect.
    run_ctx = RunContext(job_name="api", output_dir=output)
    produced = ApiPack().run(input_path=input, ctx=run_ctx, options={})

    # First table: one row per artifact the pack reported.
    artifact_table = Table(title="API Pack Created")
    for heading in ("Artifact", "Path", "Kind"):
        artifact_table.add_column(heading)
    for item in produced:
        artifact_table.add_row(item.name, str(item.path), item.kind)
    console.print(artifact_table)

    # Second table: run id plus the counters found in the context metadata
    # (0 is shown when a counter is absent).
    metrics = (
        ("run_id", run_ctx.run_id),
        ("records", str(run_ctx.metadata.get("record_count", 0))),
        ("findings", str(run_ctx.metadata.get("finding_count", 0))),
    )
    run_table = Table(title="Run Summary")
    for heading in ("Metric", "Value"):
        run_table.add_column(heading)
    for label, value in metrics:
        run_table.add_row(label, value)
    console.print(run_table)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def register(root_app: typer.Typer) -> None:
    """Mount the API sub-application on *root_app* under the name ``api``.

    Referenced by the ``tessera.commands`` entry point in pyproject.toml.
    """
    root_app.add_typer(api_app, name="api")
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from tessera_core.artifacts import write_jsonl, write_markdown
|
|
8
|
+
from tessera_core.models import Artifact, RunContext, ValidationFinding
|
|
9
|
+
|
|
10
|
+
from tessera_api.loader import load_api_records
|
|
11
|
+
from tessera_api.schema import ApiRequest
|
|
12
|
+
from tessera_api.validator import validate_api_records
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def load_records(input_path: Path, options: dict[str, Any]) -> list[ApiRequest]:
    """Load records from *input_path* by delegating to ``load_api_records``.

    *options* is passed through unchanged; the loader stores its parse
    errors in it under ``"_parse_errors"``.
    """
    return load_api_records(input_path, options)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def validate_records(records: list[ApiRequest], options: dict[str, Any]) -> list[ValidationFinding]:
    """Collect every finding for a run.

    Parse errors stashed in ``options["_parse_errors"]`` come first, each
    reported as an ``error``-severity ``parse_error`` finding; the findings
    produced by ``validate_api_records`` for the parsed records follow.
    """
    parse_failures = options.get("_parse_errors", [])
    results: list[ValidationFinding] = [
        ValidationFinding(
            severity="error",
            code="parse_error",
            message=f"failed to parse curl: {failure['error']} (near: {failure.get('preview', '')})",
            field=None,
            metadata={"source_file": failure.get("source_file", "")},
        )
        for failure in parse_failures
    ]
    results += validate_api_records(records)
    return results
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def write_artifacts(
    records: list[ApiRequest],
    ctx: RunContext,
    options: dict[str, Any],
) -> list[Artifact]:
    """Write the JSONL index and the four markdown reports into ``ctx.output_dir``.

    Findings are taken from ``ctx.metadata["findings"]`` when that entry is
    present and non-empty; otherwise they are computed with
    ``validate_records``. Returns one ``Artifact`` per file, in write order.
    """
    out_dir = ctx.output_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    findings: list[ValidationFinding] = ctx.metadata.get("findings")
    if not findings:
        findings = validate_records(records, options)

    jsonl_name = "index.jsonl"
    # Renderers are wrapped in callables so each report is rendered right
    # before it is written, exactly one file at a time.
    markdown_reports = (
        ("index.md", lambda: _render_index(records)),
        ("validation_report.md", lambda: _render_validation(records, findings, options)),
        ("coverage_report.md", lambda: _render_coverage(records)),
        ("redactions_report.md", lambda: _render_redactions(records)),
    )

    write_jsonl(out_dir / jsonl_name, [row.model_dump() for row in records])
    for filename, render in markdown_reports:
        write_markdown(out_dir / filename, render())

    artifacts = [Artifact(name=jsonl_name, path=out_dir / jsonl_name, kind="jsonl")]
    artifacts.extend(
        Artifact(name=filename, path=out_dir / filename, kind="markdown")
        for filename, _ in markdown_reports
    )
    return artifacts
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _render_index(records: list[ApiRequest]) -> str:
    """Render the markdown request catalog.

    The output is a title and a total count, followed by either a
    placeholder line (no records) or a table with one row per request.
    """
    preamble = ["# API Request Catalog", "", f"- Total requests: {len(records)}", ""]
    if not records:
        return "\n".join([*preamble, "_No requests found._"]) + "\n"

    table = [
        "| ID | Method | Host | Path | Auth | Body | Redactions |",
        "|---|---|---|---|---|---|---:|",
    ]
    table.extend(
        f"| `{rec.id}` | {rec.method} | {rec.host} | {rec.path} "
        f"| {rec.auth.kind} | {rec.body_kind} | {len(rec.redactions)} |"
        for rec in records
    )
    # The trailing empty element yields the final newline after the last row.
    return "\n".join([*preamble, *table, ""])
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _render_validation(
    records: list[ApiRequest],
    findings: list[ValidationFinding],
    options: dict[str, Any],
) -> str:
    """Render the markdown validation report.

    The report has summary counters, a per-severity breakdown (always
    listing error, warning and info) and, when there are findings, a list
    of at most 200 of them followed by a note counting the rest.
    """
    severity_counts = Counter(item.severity for item in findings)
    out = [
        "# Validation Report",
        "",
        f"- Total requests: {len(records)}",
        f"- Findings: {len(findings)}",
        f"- Parse errors: {len(options.get('_parse_errors', []))}",
        "",
        "## Severity Breakdown",
        "",
        *(f"- {level}: {severity_counts.get(level, 0)}" for level in ("error", "warning", "info")),
        "",
    ]
    if not findings:
        return "\n".join(out)

    shown_limit = 200
    out += ["## Findings", ""]
    for item in findings[:shown_limit]:
        ident = item.metadata.get("id", "") if item.metadata else ""
        subject = f" `{ident}`" if ident else ""
        where = f" [{item.field}]" if item.field else ""
        out.append(f"- **{item.severity.upper()}** `{item.code}`{subject}{where}: {item.message}")

    hidden = len(findings) - shown_limit
    if hidden > 0:
        out.append(f"- ... {hidden} more findings omitted")
    return "\n".join(out)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _render_coverage(records: list[ApiRequest]) -> str:
    """Render the markdown coverage report.

    With no records only the title and total are emitted. Otherwise the
    report adds the count of ``http`` requests and three distributions
    (methods, hosts, auth kinds), each listed most-common first.
    """
    out = ["# Coverage Report", "", f"- Total requests: {len(records)}"]
    if records:
        insecure_count = len([rec for rec in records if rec.scheme == "http"])
        out += [f"- Insecure (http) requests: {insecure_count}", ""]

        sections = (
            ("## Methods", Counter(rec.method for rec in records)),
            ("## Hosts", Counter(rec.host for rec in records)),
            ("## Auth kinds", Counter(rec.auth.kind for rec in records)),
        )
        for position, (heading, tally) in enumerate(sections):
            if position:
                # Blank line between consecutive sections.
                out.append("")
            out += [heading, ""]
            out.extend(f"- `{label}`: {count}" for label, count in tally.most_common())
    return "\n".join(out) + "\n"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _render_redactions(records: list[ApiRequest]) -> str:
    """Render the markdown redactions audit.

    The report lists the total number of redactions, a per-kind tally
    (most common first) and a detail table with one row per redaction. A
    placeholder line replaces the tally and table when nothing was redacted.
    """
    # Flatten to (request id, redaction) pairs, keeping record order.
    entries = [(rec.id, red) for rec in records for red in rec.redactions]

    out = [
        "# Redactions Report",
        "",
        f"- Total redactions: {len(entries)}",
        "",
        "Every secret below was masked before any artifact was written. "
        "Previews reveal at most a couple of leading characters and the length.",
        "",
    ]
    if not entries:
        out.append("_No secrets detected._")
        return "\n".join(out) + "\n"

    per_kind = Counter(red.kind for _, red in entries)
    out += ["## By kind", ""]
    out += [f"- `{kind}`: {count}" for kind, count in per_kind.most_common()]
    out += [
        "",
        "## Detail",
        "",
        "| Request | Location | Kind | Preview |",
        "|---|---|---|---|",
    ]
    out += [
        f"| `{request_id}` | {red.location} | {red.kind} | `{red.preview}` |"
        for request_id, red in entries
    ]
    return "\n".join(out) + "\n"
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""Parse curl commands into canonical, redacted ApiRequest records."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shlex
|
|
6
|
+
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
|
7
|
+
|
|
8
|
+
from tessera_api.redact import (
|
|
9
|
+
_TOKEN_PATTERNS,
|
|
10
|
+
auth_token_value,
|
|
11
|
+
classify_header_secret,
|
|
12
|
+
detect_secret_shape,
|
|
13
|
+
is_secret_header,
|
|
14
|
+
is_secret_query,
|
|
15
|
+
mask,
|
|
16
|
+
)
|
|
17
|
+
from tessera_api.schema import ApiAuth, ApiRequest, Redaction
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def split_curl_commands(text: str) -> list[str]:
    """Split a file's text into individual curl command strings.

    Line endings are normalised to ``\\n`` first, so a trailing backslash
    followed by CRLF is recognised as a continuation. Continuations are then
    joined. Blank lines and lines starting with ``#`` are dropped. A line
    whose first token is ``curl`` starts a new command; any other line is
    appended to the command in progress. Blocks that do not start with
    ``curl`` are discarded.
    """
    # Without this step "\\\r\n" would not match the continuation pattern and
    # the stray backslash would later escape the joining space in shlex.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    joined = normalized.replace("\\\n", " ")

    commands: list[str] = []
    current: list[str] = []
    for raw_line in joined.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # `line` is non-empty after strip(), so it has a first token.
        if line.split(None, 1)[0] == "curl" and current:
            commands.append(" ".join(current))
            current = [line]
        else:
            current.append(line)
    if current:
        commands.append(" ".join(current))
    return [c for c in commands if c.strip().startswith("curl")]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def parse_curl(command: str, record_id: str) -> ApiRequest:
    """Parse a single curl command string into a redacted ApiRequest.

    The command is tokenized with ``shlex`` and scanned left to right.
    Recognised flags set the method, headers, basic-auth credentials, body
    and URL; a fixed list of value-less flags is skipped; any other flag is
    skipped together with its following token unless that token looks like a
    URL or another flag. The first positional ``http(s)://`` token becomes
    the URL when none has been set yet.

    ``record_id`` is stored as the record's ``id``.

    Raises ValueError if the command cannot be tokenized or has no URL.
    """
    try:
        tokens = shlex.split(command)
    except ValueError as exc:
        raise ValueError(f"cannot tokenize curl command: {exc}") from exc

    if not tokens or tokens[0] != "curl":
        raise ValueError("not a curl command")

    method: str | None = None
    url: str | None = None
    headers: dict[str, str] = {}
    redactions: list[Redaction] = []
    auth = ApiAuth()
    body: str | None = None
    body_kind = "none"
    basic_user_pass: str | None = None

    # Token 0 is "curl" itself; start scanning at the first argument.
    i = 1
    while i < len(tokens):
        tok = tokens[i]
        if tok in ("-X", "--request"):
            # A trailing -X with no value leaves the method unchanged.
            method = tokens[i + 1] if i + 1 < len(tokens) else method
            i += 2
            continue
        if tok in ("-H", "--header"):
            raw = tokens[i + 1] if i + 1 < len(tokens) else ""
            # Updates headers, redactions and auth in place.
            _ingest_header(raw, headers, redactions, auth)
            i += 2
            continue
        if tok in ("-u", "--user"):
            # Held back until after the loop; never stored on the record.
            basic_user_pass = tokens[i + 1] if i + 1 < len(tokens) else ""
            i += 2
            continue
        if tok in ("-d", "--data", "--data-raw", "--data-binary", "--data-ascii"):
            raw_body = tokens[i + 1] if i + 1 < len(tokens) else ""
            body_kind = "json" if _looks_json(raw_body) else "form"
            # NOTE(review): a later data flag replaces `body`, while the
            # redactions of every data flag are kept.
            body, body_redactions = _redact_body(raw_body)
            redactions.extend(body_redactions)
            i += 2
            continue
        if tok in ("--url",):
            url = tokens[i + 1] if i + 1 < len(tokens) else url
            i += 2
            continue
        # Flags known to take no value: skip just the flag.
        if tok in ("--compressed", "-s", "--silent", "-L", "--location", "-k", "--insecure", "-i", "--include", "-v", "--verbose", "-g", "--globoff"):
            i += 1
            continue
        if tok.startswith("-"):
            # Unknown flag; skip it and a value if the next token is not a URL.
            if i + 1 < len(tokens) and not _is_url(tokens[i + 1]) and not tokens[i + 1].startswith("-"):
                i += 2
            else:
                i += 1
            continue
        # positional: treat as URL
        if url is None and _is_url(tok):
            url = tok
        i += 1

    if url is None:
        raise ValueError("no URL found in curl command")

    # Basic auth via -u
    # This replaces any auth state derived from headers; only a masked user
    # name goes into the audit trail.
    if basic_user_pass is not None:
        user = basic_user_pass.split(":", 1)[0]
        auth = ApiAuth(kind="basic", location="flag:-u", present=True)
        redactions.append(
            Redaction(location="flag:-u", kind="basic_credentials", preview=f"{mask(user)} : (password redacted)")
        )

    scheme, host, path, redacted_query, query_map, url_redactions = _split_and_redact_url(url)
    redactions.extend(url_redactions)

    # If a secret query param looks like auth and no header auth was found.
    if auth.kind == "none":
        for qname in query_map:
            if qname.lower() in ("api_key", "apikey", "key", "access_token", "token"):
                auth = ApiAuth(kind="api_key_query", location=f"query:{qname}", present=True)
                break

    # No explicit -X: a body implies POST, otherwise GET.
    if method is None:
        method = "POST" if body is not None else "GET"

    # Rebuilt from the redacted query; the fragment component is left empty.
    redacted_url = urlunsplit((scheme, host, path, redacted_query, ""))

    return ApiRequest(
        id=record_id,
        method=method.upper(),
        url=redacted_url,
        scheme=scheme,
        host=host,
        path=path,
        query=query_map,
        headers=headers,
        body=body,  # already redacted in the -d handler
        body_kind=body_kind,
        auth=auth,
        redactions=redactions,
        # Note: the raw command is never stored; it contains the unredacted
        # secrets we just stripped. Only a safe synthesized summary is kept.
        metadata={"summary": f"{method.upper()} {host}{path}"},
    )
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _ingest_header(raw: str, headers: dict[str, str], redactions: list[Redaction], auth: ApiAuth) -> None:
    """Record one ``-H`` value, redacting it when it carries a secret.

    ``headers``, ``redactions`` and ``auth`` are updated in place. A value
    without a colon is stored as a header with an empty value. Headers with
    a known secret name are always redacted and may set the auth kind;
    other headers are redacted only when their value has a secret shape.
    """
    name, colon, value = raw.partition(":")
    if not colon:
        headers[raw.strip()] = ""
        return

    name = name.strip()
    value = value.strip()
    audit_location = f"header:{name.lower()}"

    if not is_secret_header(name):
        # not a known secret-named header: still screen the value by shape
        shape = detect_secret_shape(value)
        if shape:
            redactions.append(Redaction(location=audit_location, kind=shape, preview=mask(value)))
            headers[name] = "(redacted)"
        else:
            headers[name] = value
        return

    kind = classify_header_secret(name, value)
    credential = auth_token_value(value)
    redactions.append(Redaction(location=audit_location, kind=kind, preview=mask(credential)))
    headers[name] = "(redacted)"

    # Only these three secret kinds describe how the request authenticates.
    auth_kind = {
        "bearer_token": "bearer",
        "basic_credentials": "basic",
        "api_key": "api_key_header",
    }.get(kind)
    if auth_kind is not None:
        auth.kind = auth_kind
        auth.location = f"header:{name}"
        auth.present = True
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _split_and_redact_url(url: str) -> tuple[str, str, str, str, dict[str, str], list[Redaction]]:
    """Split *url* and redact the secrets it carries.

    Returns ``(scheme, netloc, path, redacted_query, query_map, redactions)``.

    * Credentials embedded in the authority (``user:password@host``) are
      removed from the returned netloc and reported as one
      ``basic_credentials`` redaction, so they cannot reach the record's
      ``host`` / ``url`` fields or any report.
    * Query values are replaced with ``(redacted)`` when the parameter name
      is a known secret name or the value has a secret shape; every
      replacement is reported as a redaction.

    ``query_map`` keeps one (possibly redacted) value per parameter name; a
    repeated name keeps its last value.
    """
    parts = urlsplit(url)
    redactions: list[Redaction] = []

    # urlsplit leaves "user:password@" inside netloc; strip it here.
    userinfo, at_sign, host = parts.netloc.rpartition("@")
    if at_sign and userinfo:
        user = userinfo.split(":", 1)[0]
        # Same preview format as the -u flag: masked user, password never shown.
        redactions.append(
            Redaction(
                location="url:userinfo",
                kind="basic_credentials",
                preview=f"{mask(user)} : (password redacted)",
            )
        )

    redacted_pairs: list[tuple[str, str]] = []
    query_map: dict[str, str] = {}
    for k, v in parse_qsl(parts.query, keep_blank_values=True):
        # Name-based detection wins; shape detection covers oddly named params.
        kind = "api_key" if is_secret_query(k) else detect_secret_shape(v)
        if kind:
            redactions.append(Redaction(location=f"query:{k}", kind=kind, preview=mask(v)))
            v = "(redacted)"
        redacted_pairs.append((k, v))
        query_map[k] = v

    redacted_query = urlencode(redacted_pairs)
    return parts.scheme, host, parts.path, redacted_query, query_map, redactions
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _redact_body(body: str) -> tuple[str, list[Redaction]]:
    """Redact secret-keyed fields in a JSON-ish or form body, reporting each.

    Three passes run in order: quoted JSON string fields whose key is a
    secret-ish name, ``key=value`` form fields with such a key, and finally
    every provider-token pattern in ``_TOKEN_PATTERNS`` anywhere in the text.

    Returns the redacted body and a Redaction per field masked, so body
    secrets appear in the audit trail like header and query secrets do.
    """
    import re

    redactions: list[Redaction] = []
    secret_keys = r"password|passwd|pwd|secret|client_secret|token|access_token|api_key|apikey"

    def json_sub(m: "re.Match[str]") -> str:
        key, value = m.group(1), m.group(3)
        redactions.append(Redaction(location=f"body:{key.lower()}", kind="body_secret", preview=mask(value)))
        # Re-emit the quoted key: the match starts at the key's opening
        # quote, so omitting it would drop the field name from the body.
        return f'"{key}"{m.group(2)}(redacted)"'

    def form_sub(m: "re.Match[str]") -> str:
        key, value = m.group(1), m.group(2)
        redactions.append(Redaction(location=f"body:{key.lower()}", kind="body_secret", preview=mask(value)))
        return f"{key}=(redacted)"

    # The value first tries an escape-aware match so a secret containing \"
    # is consumed to its real closing quote and no tail is left behind. The
    # plain [^"]* alternative is the fallback for malformed text.
    redacted = re.sub(
        rf'"({secret_keys})"(\s*:\s*")((?:\\.|[^"\\])*|[^"]*)"',
        json_sub,
        body,
        flags=re.IGNORECASE | re.DOTALL,
    )
    redacted = re.sub(
        rf"\b({secret_keys})=([^&\s]+)",
        form_sub,
        redacted,
        flags=re.IGNORECASE,
    )

    # shape-based pass: provider tokens embedded anywhere in the body text,
    # regardless of the surrounding key name (catches secrets in odd fields)
    for kind, pat in _TOKEN_PATTERNS:
        # `_kind` is bound as a default so each callback keeps its own kind.
        def tok_sub(m: "re.Match[str]", _kind: str = kind) -> str:
            redactions.append(Redaction(location="body", kind=_kind, preview=mask(m.group(0))))
            return "(redacted)"

        redacted = pat.sub(tok_sub, redacted)

    return redacted, redactions
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _looks_json(body: str) -> bool:
    """Return True when *body*, ignoring surrounding whitespace, opens with ``{`` or ``[``."""
    return body.strip().startswith(("{", "["))
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _is_url(token: str) -> bool:
    """Return True when *token* begins with ``http://`` or ``https://`` (case-sensitive)."""
    return token.startswith(("http://", "https://"))
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from tessera_api.curl import parse_curl, split_curl_commands
|
|
7
|
+
from tessera_api.schema import ApiRequest
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def discover_curl_files(root: Path) -> list[Path]:
    """Find curl-bearing files: ``*.curl`` and ``*.sh`` (or a single file).

    A file path is returned as-is, whatever its suffix. A directory is
    searched recursively and the matches are returned in sorted order.
    """
    if root.is_file():
        return [root]
    wanted_suffixes = (".curl", ".sh")
    return [
        candidate
        for candidate in sorted(root.rglob("*"))
        if candidate.is_file() and candidate.suffix in wanted_suffixes
    ]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _safe_preview(command: str, limit: int = 80) -> str:
    """Return a secret-free sketch of *command* for parse-error reports.

    A command that failed to parse has not been through redaction, so its
    raw text must not reach an artifact. Only ``curl`` itself and bare flag
    names are echoed; every other whitespace-separated fragment (header
    values, credentials, URLs, bodies) collapses into ``...``.
    """
    import re

    # Short flags are limited to two letters so an attached value such as
    # "-uadmin" is masked rather than echoed.
    flag_name = re.compile(r"(?:-[A-Za-z]{1,2}|--[A-Za-z][A-Za-z-]*)\Z")
    sketch: list[str] = []
    for fragment in command.split():
        if fragment == "curl" or flag_name.match(fragment):
            sketch.append(fragment)
        elif not sketch or sketch[-1] != "...":
            # Collapse a run of masked fragments into one marker.
            sketch.append("...")
    return " ".join(sketch)[:limit]


def load_api_records(input_path: Path, options: dict[str, Any]) -> list[ApiRequest]:
    """Parse every curl command in the input into redacted ApiRequest records.

    Record ids are ``<file stem>_<n>``, where ``n`` counts commands across
    all files. Each parsed record gets ``metadata["source_file"]``.

    Failures never abort the run. A command that raises ``ValueError`` and a
    file that is not valid UTF-8 are both recorded as parse errors (dicts
    with ``source_file``, ``error`` and ``preview``). The preview is a
    secret-free sketch of the command, never its raw text.

    Side effects on *options*: ``"_parse_errors"`` receives the error list
    and ``"_input_path"`` the input path as a string.
    """
    records: list[ApiRequest] = []
    parse_errors: list[dict[str, str]] = []

    seq = 0
    for path in discover_curl_files(input_path):
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError as exc:
            parse_errors.append(
                {
                    "source_file": str(path),
                    "error": f"cannot decode file as UTF-8: {exc.reason}",
                    "preview": "",
                }
            )
            continue

        for cmd in split_curl_commands(text):
            seq += 1
            rid = f"{path.stem}_{seq}"
            try:
                rec = parse_curl(cmd, rid)
                rec.metadata["source_file"] = str(path)
                records.append(rec)
            except ValueError as exc:
                parse_errors.append(
                    {"source_file": str(path), "error": str(exc), "preview": _safe_preview(cmd)}
                )

    options["_parse_errors"] = parse_errors
    options["_input_path"] = str(input_path)
    return records
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from tessera_core.jobpack import JobPack
|
|
7
|
+
from tessera_core.models import Artifact, RunContext, ValidationFinding
|
|
8
|
+
|
|
9
|
+
from tessera_api.compiler import load_records, validate_records, write_artifacts
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ApiPack(JobPack):
    """JobPack implementation for the ``api`` job.

    Each stage simply forwards to the function of the matching purpose
    imported from ``tessera_api.compiler``.
    """

    name = "api"
    version = "0.3.1"

    def normalize(self, input_path: Path, options: dict[str, Any]) -> list[Any]:
        """Forward to ``load_records`` and return whatever it produces."""
        return load_records(input_path, options)

    def validate(self, records: list[Any], options: dict[str, Any]) -> list[ValidationFinding]:
        """Forward to ``validate_records`` and return its findings."""
        return validate_records(records, options)

    def generate(self, records: list[Any], ctx: RunContext, options: dict[str, Any]) -> list[Artifact]:
        """Forward to ``write_artifacts`` and return the artifacts it reports."""
        return write_artifacts(records, ctx, options)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def create_pack() -> ApiPack:
    """Build and return a new ``ApiPack`` instance."""
    pack = ApiPack()
    return pack
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Secret detection and masking.
|
|
2
|
+
|
|
3
|
+
The contract for this module: given a raw value that may be a secret, return a
|
|
4
|
+
masked preview that reveals at most a few leading characters and never the tail.
|
|
5
|
+
All redaction happens before a value is written into an ``ApiRequest``; the
|
|
6
|
+
canonical record and every artifact hold only masked previews.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
# Header names whose values are always treated as secret.
|
|
15
|
+
SECRET_HEADER_NAMES = {
    # authorization headers
    "authorization", "proxy-authorization",
    # API-key style headers
    "x-api-key", "api-key", "apikey",
    # token / secret style headers
    "x-auth-token", "x-auth", "x-access-token", "x-secret", "x-amz-security-token",
    # cookies
    "cookie", "set-cookie",
}
|
|
29
|
+
|
|
30
|
+
# Query parameter names whose values are always treated as secret.
|
|
31
|
+
SECRET_QUERY_NAMES = {
    # keys and tokens
    "api_key", "apikey", "key", "token", "access_token", "auth", "auth_token",
    # shared secrets and passwords
    "secret", "client_secret", "password", "passwd", "pwd",
    # signatures
    "sig", "signature", "sas",
}
|
|
48
|
+
|
|
49
|
+
_MASK = "(redacted"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def mask(value: str, lead: int = 2) -> str:
    """Return a masked preview: a few leading chars plus length, never the tail.

    Args:
        value: The raw value to mask. A falsy value (``""`` or ``None``) is
            treated as empty.
        lead: How many leading characters may be shown. Negative values are
            treated as ``0``.

    Returns:
        ``"(redacted, empty)"`` for an empty value; otherwise the length of
        the value, preceded by its first ``lead`` characters only when the
        value is longer than ``lead``.
    """
    value = value or ""
    n = len(value)
    if n == 0:
        return "(redacted, empty)"
    # A negative ``lead`` would make ``value[:lead]`` mean "all but the last
    # few characters" and leak almost the entire secret, so clamp it.
    lead = max(lead, 0)
    if n <= lead:
        # Showing ``lead`` characters would reveal the whole value.
        return f"…(redacted, len={n})"
    return f"{value[:lead]}…(redacted, len={n})"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def is_secret_header(name: str) -> bool:
    """Tell whether *name* (trimmed, case-insensitive) is in ``SECRET_HEADER_NAMES``."""
    normalized = name.strip().lower()
    return normalized in SECRET_HEADER_NAMES
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_secret_query(name: str) -> bool:
    """Tell whether *name* (trimmed, case-insensitive) is in ``SECRET_QUERY_NAMES``."""
    normalized = name.strip().lower()
    return normalized in SECRET_QUERY_NAMES
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def classify_header_secret(name: str, value: str) -> str:
    """Pick the redaction kind label for the value of a secret header.

    Cookie headers map to ``"cookie"``. ``Authorization`` and
    ``Proxy-Authorization`` map to ``"bearer_token"`` or
    ``"basic_credentials"`` according to the scheme prefix of *value*, and to
    ``"authorization_value"`` for anything else. Every other header name maps
    to ``"api_key"``. Name and scheme comparisons ignore case and surrounding
    whitespace.
    """
    header = name.strip().lower()
    if header in ("cookie", "set-cookie"):
        return "cookie"
    if header not in ("authorization", "proxy-authorization"):
        return "api_key"

    credential = value.strip().lower()
    scheme_labels = (("bearer ", "bearer_token"), ("basic ", "basic_credentials"))
    for scheme_prefix, label in scheme_labels:
        if credential.startswith(scheme_prefix):
            return label
    return "authorization_value"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def auth_token_value(value: str) -> str:
    """Drop a leading ``Bearer``/``Basic`` scheme so only the credential is masked.

    The scheme is matched case-insensitively and must be followed by
    whitespace; a value without such a prefix is returned unchanged.
    """
    scheme_match = re.match(r"^\s*(bearer|basic)\s+(.*)$", value, re.IGNORECASE)
    return scheme_match.group(2) if scheme_match else value
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# --- shape-based secret detection -------------------------------------------
|
|
95
|
+
# High-confidence provider token patterns: (kind, pattern). These catch secrets
|
|
96
|
+
# regardless of the field name they appear in.
|
|
97
|
+
_TOKEN_PATTERNS: list[tuple[str, re.Pattern]] = [
    ("aws_access_key_id", re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b")),
    ("github_token", re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,}\b")),
    ("github_pat", re.compile(r"\bgithub_pat_[A-Za-z0-9_]{40,}\b")),
    ("slack_token", re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b")),
    ("stripe_key", re.compile(r"\b(?:sk|rk|pk)_(?:live|test)_[A-Za-z0-9]{16,}\b")),
    ("google_api_key", re.compile(r"\bAIza[0-9A-Za-z_\-]{35}\b")),
    ("openai_key", re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")),
    ("jwt", re.compile(r"\beyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b")),
    # PEM private-key headers. PGP armor ends in "PRIVATE KEY BLOCK", so the
    # optional " BLOCK" suffix is needed for the PGP label to ever match;
    # "ENCRYPTED" and "DSA" labels are covered as well.
    (
        "private_key_block",
        re.compile(
            r"-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED |PGP )?PRIVATE KEY(?: BLOCK)?-----"
        ),
    ),
]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character (0.0 when empty)."""
    if not s:
        return 0.0
    total = len(s)
    # dict.fromkeys visits each distinct character once, in first-occurrence
    # order, so the terms are summed in a fixed order.
    frequencies = [s.count(symbol) / total for symbol in dict.fromkeys(s)]
    return -sum(p * math.log2(p) for p in frequencies)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def detect_secret_shape(value: str) -> str | None:
    """Classify *value* by shape, returning a secret-kind label or ``None``.

    The trimmed value is first searched with each entry of
    ``_TOKEN_PATTERNS``; the first hit's label wins. A value that is exactly
    a UUID is never reported. Otherwise ``"high_entropy_value"`` is returned
    for values of 24+ characters that contain no space or slash, do not start
    with an http(s) scheme, consist only of token characters, and have a
    Shannon entropy of at least 3.5 bits per character.
    """
    candidate = (value or "").strip()
    if not candidate:
        return None

    provider_kind = next(
        (kind for kind, pattern in _TOKEN_PATTERNS if pattern.search(candidate)),
        None,
    )
    if provider_kind is not None:
        return provider_kind

    # Common non-secret identifiers that would otherwise look high-entropy.
    if _UUID_RE.fullmatch(candidate):
        return None

    # Entropy fallback: bail out early on anything that is short or reads
    # like a sentence, a path or a URL.
    if len(candidate) < 24:
        return None
    if " " in candidate or "/" in candidate:
        return None
    if candidate.startswith(("http://", "https://")):
        return None
    if re.fullmatch(r"[A-Za-z0-9+/=_\-\.]+", candidate) is None:
        return None
    if _shannon_entropy(candidate) < 3.5:
        return None
    return "high_entropy_value"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
_UUID_RE = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
AuthKind = Literal["bearer", "basic", "api_key_header", "api_key_query", "none"]
|
|
8
|
+
BodyKind = Literal["json", "form", "text", "none"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Redaction(BaseModel):
    """Describes a single secret that was stripped out ahead of canonicalization.

    Only a masked hint is kept in ``preview`` (never the complete secret),
    which keeps the redactions report safe to commit and review.
    """

    # Where the secret was found, e.g. "header:authorization", "query:api_key", "body".
    location: str
    # What sort of secret it was, e.g. "bearer_token", "basic_credentials", "api_key".
    kind: str
    # Masked hint, e.g. "sk-ab…(redacted, len=51)".
    preview: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ApiAuth(BaseModel):
    """Summary of the authentication detected on a request."""

    # One of the ``AuthKind`` literals; "none" by default.
    kind: AuthKind = "none"
    # Where the credential sits, e.g. "header:Authorization", "query:api_key".
    location: str = ""
    # False by default.
    present: bool = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ApiRequest(BaseModel):
    """Canonical, secret-free API request record. Serialized to ``index.jsonl``."""

    id: str
    method: str = "GET"
    # Redacted form of the URL (query secrets masked).
    url: str = ""
    scheme: str = ""
    host: str = ""
    path: str = ""
    # Query parameters with redacted values.
    query: dict[str, str] = Field(default_factory=dict)
    # Headers with redacted values.
    headers: dict[str, str] = Field(default_factory=dict)
    # Redacted request body, if any.
    body: str | None = None
    body_kind: BodyKind = "none"
    auth: ApiAuth = Field(default_factory=ApiAuth)
    redactions: list[Redaction] = Field(default_factory=list)
    tags: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
|
|
5
|
+
from tessera_core.models import ValidationFinding
|
|
6
|
+
|
|
7
|
+
from tessera_api.redact import SECRET_HEADER_NAMES, SECRET_QUERY_NAMES
|
|
8
|
+
from tessera_api.schema import ApiRequest
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def validate_api_records(records: list[ApiRequest]) -> list[ValidationFinding]:
    """Validate *records* individually and as a set.

    Per-record findings from ``_validate_one`` come first, followed by one
    ``duplicate_request`` info finding per repeated (method, url, body)
    combination and, when more than one distinct host is present, a single
    ``multiple_hosts`` info finding.
    """
    findings: list[ValidationFinding] = [
        finding for record in records for finding in _validate_one(record)
    ]

    # Cross-record: duplicate method+url+body
    occurrences = Counter((record.method, record.url, record.body) for record in records)
    findings.extend(
        ValidationFinding(
            severity="info",
            code="duplicate_request",
            message=f"{count} identical requests: {method} {url}",
            field=None,
            metadata={"method": method, "url": url, "count": count},
        )
        for (method, url, _body), count in occurrences.items()
        if count > 1
    )

    # Cross-record: surface multiple hosts (not an error, just visibility)
    hosts = sorted({record.host for record in records if record.host})
    if len(hosts) > 1:
        host_finding = ValidationFinding(
            severity="info",
            code="multiple_hosts",
            message=f"requests span {len(hosts)} hosts: {', '.join(hosts)}",
            field="host",
            metadata={"hosts": hosts},
        )
        findings.append(host_finding)

    return findings
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _validate_one(r: ApiRequest) -> list[ValidationFinding]:
    """Run the single-record checks on *r* and return their findings.

    Checks, in order: plain-http scheme (warning), missing host (error),
    redacted secrets located in the URL query (warning), no detected auth
    (info), and redacted secrets found in a header or query parameter whose
    name is not one of the known secret names (warning). Every finding's
    metadata carries the record id and its ``source_file``.
    """
    findings: list[ValidationFinding] = []
    src = r.metadata.get("source_file", "")

    def f(severity: str, code: str, message: str, field: str | None = None) -> ValidationFinding:
        # Shared constructor so every per-record finding is tagged the same way.
        return ValidationFinding(
            severity=severity, code=code, message=message, field=field,
            metadata={"id": r.id, "source_file": src},
        )

    if r.scheme == "http":
        findings.append(f("warning", "insecure_scheme",
                          f"{r.method} {r.host}{r.path} uses http; credentials and data are sent in cleartext",
                          "scheme"))

    if not r.host:
        findings.append(f("error", "missing_host", "request has no host", "host"))

    # A secret in the query string is worse than in a header: URLs get logged.
    query_redactions = [red for red in r.redactions if red.location.startswith("query:")]
    if query_redactions:
        names = ", ".join(red.location.split(":", 1)[1] for red in query_redactions)
        findings.append(f("warning", "secret_in_url_query",
                          f"secret(s) in URL query ({names}); URLs are commonly logged, prefer a header",
                          "query"))

    if not r.auth.present:
        findings.append(f("info", "no_auth_detected",
                          f"{r.method} {r.host}{r.path} has no detectable auth", "auth"))

    # A secret found by shape in a field whose NAME is not a known secret name
    # is high-signal: a custom auth header or a token hiding in an odd field.
    for red in r.redactions:
        loc = red.location
        if loc.startswith("header:"):
            name = loc.split(":", 1)[1]
            # The name sets are lowercase; normalise before the lookup so a
            # location such as "header:X-Api-Key" is not misreported.
            if name.strip().lower() not in SECRET_HEADER_NAMES:
                findings.append(f("warning", "secret_in_nonstandard_location",
                                  f"a {red.kind} was detected in header '{name}', which is not a conventional secret header",
                                  "headers"))
        elif loc.startswith("query:"):
            name = loc.split(":", 1)[1]
            if name.strip().lower() not in SECRET_QUERY_NAMES:
                findings.append(f("warning", "secret_in_nonstandard_location",
                                  f"a {red.kind} was detected in query param '{name}', which is not a conventional secret name",
                                  "query"))

    return findings
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from tessera_core.models import RunContext
|
|
7
|
+
|
|
8
|
+
from tessera_api.curl import parse_curl, split_curl_commands
|
|
9
|
+
from tessera_api.pack import ApiPack
|
|
10
|
+
from tessera_api.redact import mask
|
|
11
|
+
from tessera_api.schema import ApiRequest
|
|
12
|
+
from tessera_api.validator import validate_api_records
|
|
13
|
+
|
|
14
|
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
|
15
|
+
EXAMPLES_DIR = REPO_ROOT / "examples" / "api"
|
|
16
|
+
FIXTURES = Path(__file__).parent / "fixtures"
|
|
17
|
+
|
|
18
|
+
# Raw secret strings that must NEVER appear in any artifact. These are fake,
|
|
19
|
+
# non-functional demo values chosen to match no real provider key format.
|
|
20
|
+
LIVE_SECRET = "DEMOBEARERtokenABCDEFGHIJKLMNOPQRSTUVWXYZ01"
|
|
21
|
+
API_KEY = "DEMOAPIKEYabcdef0123456789"
|
|
22
|
+
QUERY_SECRET = "legacy_secret_key_998877"
|
|
23
|
+
BASIC_PASS = "hunter2"
|
|
24
|
+
BODY_PASS = "s3cr3t-p@ss"
|
|
25
|
+
ALL_SECRETS = [LIVE_SECRET, API_KEY, QUERY_SECRET, BASIC_PASS, BODY_PASS]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ---------- mask primitive ----------
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_mask_never_reveals_tail():
    """mask() shows the two leading chars and the length, never the tail."""
    masked = mask("supersecrettoken", lead=2)
    assert masked.startswith("su")
    assert "token" not in masked
    assert "len=16" in masked
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_mask_short_value():
    """A value no longer than the lead is still reported as redacted."""
    masked = mask("ab")
    assert "redacted" in masked
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------- splitting ----------
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_split_multiple_commands():
    """The payments example splits into three commands, each starting with ``curl``."""
    # Read as UTF-8 explicitly, matching how the loader reads input files,
    # instead of depending on the platform's default encoding.
    text = (EXAMPLES_DIR / "payments.curl").read_text(encoding="utf-8")
    cmds = split_curl_commands(text)
    assert len(cmds) == 3
    assert all(c.startswith("curl") for c in cmds)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_split_joins_line_continuations():
    """A backslash-continued command is returned as one command."""
    commands = split_curl_commands("curl https://x.test/a \\\n -H 'Accept: application/json'")
    assert len(commands) == 1
    (only_command,) = commands
    assert "Accept" in only_command
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------- parsing + redaction ----------
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_bearer_token_redacted_and_auth_detected():
    """A bearer Authorization header is detected as auth and fully redacted."""
    record = parse_curl(
        f'curl -X GET https://api.test/v1/x -H "Authorization: Bearer {LIVE_SECRET}"',
        "r1",
    )
    assert record.method == "GET"
    assert record.auth.kind == "bearer"
    assert record.auth.present
    assert record.headers["Authorization"] == "(redacted)"
    serialized = json.dumps(record.model_dump())
    assert LIVE_SECRET not in serialized
    assert any(red.kind == "bearer_token" for red in record.redactions)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_api_key_header_redacted():
    """An X-Api-Key header counts as header API-key auth and never leaks."""
    record = parse_curl(f'curl https://api.test/v1/x -H "X-Api-Key: {API_KEY}"', "r1")
    assert record.auth.kind == "api_key_header"
    serialized = json.dumps(record.model_dump())
    assert API_KEY not in serialized
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_secret_in_query_redacted_and_auth_inferred():
    """A secret query parameter is masked everywhere and implies query API-key auth."""
    record = parse_curl(f'curl "https://api.test/report?api_key={QUERY_SECRET}&format=csv"', "r1")
    assert record.query["api_key"] == "(redacted)"
    assert record.query["format"] == "csv"
    assert QUERY_SECRET not in record.url
    serialized = json.dumps(record.model_dump())
    assert QUERY_SECRET not in serialized
    assert record.auth.kind == "api_key_query"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_basic_auth_flag_redacted():
    """Credentials passed with ``-u`` are detected as basic auth and never leak."""
    record = parse_curl(f"curl -u admin:{BASIC_PASS} https://api.test/admin", "r1")
    assert record.auth.kind == "basic"
    serialized = json.dumps(record.model_dump())
    assert BASIC_PASS not in serialized
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_body_secret_redacted():
    """A password inside a JSON body is removed while other fields survive."""
    record = parse_curl(
        'curl -X POST https://api.test/login -d \'{"username": "ops", "password": "s3cr3t-p@ss"}\'',
        "r1",
    )
    body_text = record.body or ""
    assert record.method == "POST"
    assert BODY_PASS not in body_text
    assert "ops" in body_text  # non-secret field preserved
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_method_defaults_to_post_when_body_present():
    """With ``-d`` and no explicit ``-X``, the parsed method is POST."""
    record = parse_curl('curl https://api.test/x -d \'{"a":1}\'', "r1")
    assert record.method == "POST"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_no_url_raises():
    """A command without any URL makes parse_curl raise ValueError."""
    import pytest

    url_less_command = 'curl -X POST -H "Accept: application/json"'
    with pytest.raises(ValueError):
        parse_curl(url_less_command, "r1")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------- end-to-end ----------
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_pack_creates_expected_artifacts(tmp_path: Path):
    """Running the pack over the examples yields exactly the five expected artifacts."""
    ctx = RunContext(job_name="api", output_dir=tmp_path / "api_pack")
    produced = ApiPack().run(input_path=EXAMPLES_DIR, ctx=ctx, options={})

    expected_names = {
        "index.jsonl",
        "index.md",
        "validation_report.md",
        "coverage_report.md",
        "redactions_report.md",
    }
    assert {artifact.name for artifact in produced} == expected_names
    for artifact in produced:
        assert artifact.path.exists()
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_no_secret_leaks_into_any_artifact(tmp_path: Path):
    """The headline safety guarantee: no raw secret appears in any output file."""
    output_dir = tmp_path / "api_pack"
    ApiPack().run(
        input_path=EXAMPLES_DIR,
        ctx=RunContext(job_name="api", output_dir=output_dir),
        options={},
    )

    for artifact_file in output_dir.iterdir():
        content = artifact_file.read_text(encoding="utf-8")
        for secret in ALL_SECRETS:
            assert secret not in content, f"{secret!r} leaked into {artifact_file.name}"
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_redactions_report_lists_every_secret(tmp_path: Path):
    """The redactions report names each kind of secret present in the examples."""
    out = tmp_path / "api_pack"
    ctx = RunContext(job_name="api", output_dir=out)
    ApiPack().run(input_path=EXAMPLES_DIR, ctx=ctx, options={})
    # Read as UTF-8 explicitly, like the leak test does, rather than relying
    # on the platform's default encoding.
    report = (out / "redactions_report.md").read_text(encoding="utf-8")
    assert "bearer_token" in report
    assert "api_key" in report
    assert "basic_credentials" in report
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_validation_flags_http_and_query_secret(tmp_path: Path):
    """The legacy example triggers both the http and the query-secret findings."""
    ctx = RunContext(job_name="api", output_dir=tmp_path / "api_pack")
    ApiPack().run(input_path=EXAMPLES_DIR / "legacy.curl", ctx=ctx, options={})

    reported_codes = {finding.code for finding in ctx.metadata["findings"]}
    assert "insecure_scheme" in reported_codes
    assert "secret_in_url_query" in reported_codes
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def test_parse_error_surfaced(tmp_path: Path):
    """An unparseable fixture shows up as a ``parse_error`` finding."""
    ctx = RunContext(job_name="api", output_dir=tmp_path / "api_pack")
    ApiPack().run(input_path=FIXTURES / "unparseable.curl", ctx=ctx, options={})

    reported_codes = {finding.code for finding in ctx.metadata["findings"]}
    assert "parse_error" in reported_codes
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def test_index_jsonl_pydantic_round_trip(tmp_path: Path):
    """Every line of ``index.jsonl`` validates back into an ApiRequest."""
    out = tmp_path / "api_pack"
    ctx = RunContext(job_name="api", output_dir=out)
    ApiPack().run(input_path=EXAMPLES_DIR, ctx=ctx, options={})
    # Read as UTF-8 explicitly, like the leak test does, rather than relying
    # on the platform's default encoding.
    for line in (out / "index.jsonl").read_text(encoding="utf-8").splitlines():
        restored = ApiRequest.model_validate_json(line)
        assert restored.id
        assert restored.method
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def test_duplicate_request_detected():
    """Two records parsed from the same command produce a duplicate finding."""
    same_command = "curl https://api.test/same"
    pair = [parse_curl(same_command, "a"), parse_curl(same_command, "b")]
    reported_codes = [finding.code for finding in validate_api_records(pair)]
    assert "duplicate_request" in reported_codes
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ---------- v0.2: shape-based secret detection ----------
|
|
195
|
+
# Tokens are constructed at runtime (concatenation) so no real-provider-shaped
|
|
196
|
+
# literal is committed to the repo (which would trip secret-scanning).
|
|
197
|
+
|
|
198
|
+
from tessera_api.redact import detect_secret_shape # noqa: E402
|
|
199
|
+
|
|
200
|
+
_GH = "ghp_" + "A1b2C3d4" * 5 # ghp_ + 40 chars -> github_token shape
|
|
201
|
+
_AWS = "AKIA" + "B2C3D4E5F6G7H8I9" # AKIA + 16 -> aws_access_key_id
|
|
202
|
+
_JWT = "eyJ" + "abcDEF123" + "." + "ghiJKL456" + "." + "mnoPQR789" # jwt shape
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def test_detect_secret_shape_patterns():
    """Provider-shaped tokens are labelled with their provider kind."""
    expectations = (
        (_GH, "github_token"),
        (_AWS, "aws_access_key_id"),
        (_JWT, "jwt"),
    )
    for token, expected_kind in expectations:
        assert detect_secret_shape(token) == expected_kind
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def test_detect_secret_shape_negatives():
    """Ordinary values (MIME type, user agent, UUID, short word) are not flagged."""
    benign_values = (
        "application/json",
        "Mozilla/5.0",
        "123e4567-e89b-12d3-a456-426614174000",  # UUID
        "short",
    )
    for benign in benign_values:
        assert detect_secret_shape(benign) is None
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def test_secret_in_custom_header_redacted(tmp_path):
    """A provider-shaped token in a custom header is redacted and recorded."""
    record = parse_curl(
        f'curl https://api.test/v1/data -H "X-Trace-Token: {_GH}" -H "Accept: application/json"',
        "r1",
    )
    # the custom header value is redacted and the raw token is gone
    assert record.headers["X-Trace-Token"] == "(redacted)"
    serialized = json.dumps(record.model_dump())
    assert _GH not in serialized
    assert any(red.kind == "github_token" for red in record.redactions)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def test_secret_in_nonstandard_location_finding(tmp_path):
    """A JWT in an unconventional query parameter is flagged and never leaks."""
    curl_file = tmp_path / "custom.curl"
    curl_file.write_text(f'curl "https://api.test/items?trace={_JWT}"\n', encoding="utf-8")
    output_dir = tmp_path / "api_pack"
    ctx = RunContext(job_name="api", output_dir=output_dir)
    ApiPack().run(input_path=curl_file, ctx=ctx, options={})

    reported_codes = {finding.code for finding in ctx.metadata["findings"]}
    assert "secret_in_nonstandard_location" in reported_codes
    # and the raw token leaked nowhere
    for artifact_file in output_dir.iterdir():
        assert _JWT not in artifact_file.read_text(encoding="utf-8")
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def test_shape_secret_in_body(tmp_path):
    """An AWS-key-shaped value in a JSON body is redacted and recorded."""
    record = parse_curl(f"curl -X POST https://api.test/x -d '{{\"note\": \"{_AWS}\"}}'", "r1")
    body_text = record.body or ""
    assert _AWS not in body_text
    assert any(red.kind == "aws_access_key_id" for red in record.redactions)
|