policystrata 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. policystrata-0.1.0/CHANGELOG.md +21 -0
  2. policystrata-0.1.0/CITATION.cff +10 -0
  3. policystrata-0.1.0/CONTRIBUTING.md +31 -0
  4. policystrata-0.1.0/EVAL_CARD.md +98 -0
  5. policystrata-0.1.0/LICENSE +21 -0
  6. policystrata-0.1.0/MANIFEST.in +15 -0
  7. policystrata-0.1.0/PKG-INFO +324 -0
  8. policystrata-0.1.0/README.md +286 -0
  9. policystrata-0.1.0/SECURITY.md +24 -0
  10. policystrata-0.1.0/action.yml +70 -0
  11. policystrata-0.1.0/docker-compose.yml +14 -0
  12. policystrata-0.1.0/docs/benchmark-reference.md +187 -0
  13. policystrata-0.1.0/docs/distribution-roadmap.md +50 -0
  14. policystrata-0.1.0/docs/evidence.md +164 -0
  15. policystrata-0.1.0/docs/external-suite-protocol.md +75 -0
  16. policystrata-0.1.0/docs/failure-taxonomy.md +43 -0
  17. policystrata-0.1.0/docs/github-action.md +64 -0
  18. policystrata-0.1.0/docs/incident-reconstruction-template.md +59 -0
  19. policystrata-0.1.0/docs/methodology.md +222 -0
  20. policystrata-0.1.0/docs/open-source-commercial-strategy.md +88 -0
  21. policystrata-0.1.0/docs/scanner.md +158 -0
  22. policystrata-0.1.0/docs/trace-interop.md +44 -0
  23. policystrata-0.1.0/examples/integrations/dbt_semantic/finance_saas/semantic_models.yml +57 -0
  24. policystrata-0.1.0/examples/postgres_dbt/policystrata.yaml +28 -0
  25. policystrata-0.1.0/examples/postgres_dbt/policystrata_clean.yaml +10 -0
  26. policystrata-0.1.0/examples/postgres_dbt/policystrata_real_db_clean.yaml +32 -0
  27. policystrata-0.1.0/examples/postgres_dbt/semantic_models.yml +65 -0
  28. policystrata-0.1.0/examples/postgres_dbt/traces.jsonl +2 -0
  29. policystrata-0.1.0/examples/postgres_dbt/traces_clean.jsonl +1 -0
  30. policystrata-0.1.0/examples/postgres_dbt/traces_real_db_clean.jsonl +1 -0
  31. policystrata-0.1.0/pyproject.toml +91 -0
  32. policystrata-0.1.0/scripts/postgres-rls-evidence.py +51 -0
  33. policystrata-0.1.0/scripts/reproduce-evidence.sh +28 -0
  34. policystrata-0.1.0/setup.cfg +4 -0
  35. policystrata-0.1.0/src/policystrata/__init__.py +3 -0
  36. policystrata-0.1.0/src/policystrata/__main__.py +4 -0
  37. policystrata-0.1.0/src/policystrata/artifact_report.py +150 -0
  38. policystrata-0.1.0/src/policystrata/baselines.py +76 -0
  39. policystrata-0.1.0/src/policystrata/cli.py +229 -0
  40. policystrata-0.1.0/src/policystrata/compiler.py +179 -0
  41. policystrata-0.1.0/src/policystrata/database.py +84 -0
  42. policystrata-0.1.0/src/policystrata/demo.py +66 -0
  43. policystrata-0.1.0/src/policystrata/detection.py +115 -0
  44. policystrata-0.1.0/src/policystrata/domain.py +214 -0
  45. policystrata-0.1.0/src/policystrata/domains/finance_saas/policy.yaml +122 -0
  46. policystrata-0.1.0/src/policystrata/domains/finance_saas/schema.sql +83 -0
  47. policystrata-0.1.0/src/policystrata/domains/finance_saas/seed.sql +31 -0
  48. policystrata-0.1.0/src/policystrata/domains/finance_saas/surfaces.yaml +80 -0
  49. policystrata-0.1.0/src/policystrata/domains/finance_saas/tasks/seeded.yaml +130 -0
  50. policystrata-0.1.0/src/policystrata/domains/support_saas/policy.yaml +125 -0
  51. policystrata-0.1.0/src/policystrata/domains/support_saas/schema.sql +122 -0
  52. policystrata-0.1.0/src/policystrata/domains/support_saas/seed.sql +46 -0
  53. policystrata-0.1.0/src/policystrata/domains/support_saas/surfaces.yaml +80 -0
  54. policystrata-0.1.0/src/policystrata/domains/support_saas/tasks/seeded.yaml +142 -0
  55. policystrata-0.1.0/src/policystrata/evidence.py +149 -0
  56. policystrata-0.1.0/src/policystrata/exports.py +101 -0
  57. policystrata-0.1.0/src/policystrata/generator.py +222 -0
  58. policystrata-0.1.0/src/policystrata/integrations/__init__.py +1 -0
  59. policystrata-0.1.0/src/policystrata/integrations/dbt_semantic.py +169 -0
  60. policystrata-0.1.0/src/policystrata/minimize.py +125 -0
  61. policystrata-0.1.0/src/policystrata/models.py +227 -0
  62. policystrata-0.1.0/src/policystrata/mutations.py +117 -0
  63. policystrata-0.1.0/src/policystrata/policy.py +85 -0
  64. policystrata-0.1.0/src/policystrata/py.typed +1 -0
  65. policystrata-0.1.0/src/policystrata/runner.py +403 -0
  66. policystrata-0.1.0/src/policystrata/scan_models.py +203 -0
  67. policystrata-0.1.0/src/policystrata/scanner.py +1274 -0
  68. policystrata-0.1.0/src/policystrata/summary.py +81 -0
  69. policystrata-0.1.0/src/policystrata/trace_import.py +284 -0
  70. policystrata-0.1.0/src/policystrata.egg-info/PKG-INFO +324 -0
  71. policystrata-0.1.0/src/policystrata.egg-info/SOURCES.txt +87 -0
  72. policystrata-0.1.0/src/policystrata.egg-info/dependency_links.txt +1 -0
  73. policystrata-0.1.0/src/policystrata.egg-info/entry_points.txt +2 -0
  74. policystrata-0.1.0/src/policystrata.egg-info/requires.txt +10 -0
  75. policystrata-0.1.0/src/policystrata.egg-info/top_level.txt +1 -0
  76. policystrata-0.1.0/tests/test_cli.py +261 -0
  77. policystrata-0.1.0/tests/test_compiler.py +72 -0
  78. policystrata-0.1.0/tests/test_database_assets.py +25 -0
  79. policystrata-0.1.0/tests/test_detection.py +69 -0
  80. policystrata-0.1.0/tests/test_domain.py +142 -0
  81. policystrata-0.1.0/tests/test_evidence.py +55 -0
  82. policystrata-0.1.0/tests/test_integrations.py +15 -0
  83. policystrata-0.1.0/tests/test_intentional_asymmetry.py +45 -0
  84. policystrata-0.1.0/tests/test_minimize.py +45 -0
  85. policystrata-0.1.0/tests/test_policy.py +83 -0
  86. policystrata-0.1.0/tests/test_postgres_integration.py +98 -0
  87. policystrata-0.1.0/tests/test_runner.py +239 -0
  88. policystrata-0.1.0/tests/test_scanner.py +295 -0
  89. policystrata-0.1.0/uv.lock +740 -0
@@ -0,0 +1,21 @@
1
+ # Changelog
2
+
3
+ ## [Unreleased]
4
+
5
+ - No changes yet.
6
+
7
+ ## [0.1.0] - 2026-06-25
8
+
9
+ - Initial public research artifact.
10
+ - Deterministic `support_saas` and `finance_saas` benchmark domains.
11
+ - Seeded and generated mutation suites for cross-layer policy drift.
12
+ - Traces, summaries, baselines, evidence tables, minimized witnesses, scanner fixtures, and Docker
13
+ PostgreSQL evidence support.
14
+ - Public release files, CI, GitHub Action wrapper, and source distribution manifest coverage.
15
+ - Eval-card governance, scanner regression-case labels, database state assertions, and
16
+ Inspect/BenchFlow export adapters.
17
+ - Suite provenance, evidence-level, and detector-freeze metadata for future blinded or externally
18
+ authored suites.
19
+ - `defense_in_depth_stack` baseline and scanner `evidence_exercised` reporting for clean scans.
20
+ - Artifact usability report command for reviewer-facing run, witness, latency, and fixture metrics.
21
+ - arXiv-ready paper source and same-day submission notes under `paper/arxiv`.
@@ -0,0 +1,10 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use PolicyStrata in research, please cite it."
3
+ title: "PolicyStrata"
4
+ type: software
5
+ version: "0.1.0"
6
+ date-released: "2026-06-25"
7
+ license: MIT
8
+ repository-code: "https://github.com/raintree-technology/policystrata"
9
+ authors:
10
+ - name: "Raintree Technology"
@@ -0,0 +1,31 @@
1
+ # Contributing
2
+
3
+ PolicyStrata is a deterministic research artifact. Keep changes reproducible, scoped, and explicit
4
+ about what the evidence does and does not prove.
5
+
6
+ ```bash
7
+ uv sync --extra dev
8
+ uv run pytest
9
+ uv run ruff check .
10
+ uv run mypy src
11
+ ```
12
+
13
+ Optional PostgreSQL tests:
14
+
15
+ ```bash
16
+ docker compose up -d postgres
17
+ POLICYSTRATA_RUN_DB_TESTS=1 uv run pytest tests/test_postgres_integration.py
18
+ ```
19
+
20
+ - Keep the policy oracle independent from the SQL compiler path.
21
+ - Treat constrained generation as a reliability layer, not an authorization boundary.
22
+ - Preserve JSON/YAML trace stability. Add fields compatibly.
23
+ - Keep the built-in `support_saas` domain deterministic and seed-driven.
24
+ - Use adapters for external frameworks. Do not couple core execution to them.
25
+ - Do not require an LLM API key, hosted service, or host `psql` for deterministic tests.
26
+
27
+ When evidence behavior changes, regenerate the tables:
28
+
29
+ ```bash
30
+ scripts/reproduce-evidence.sh
31
+ ```
@@ -0,0 +1,98 @@
1
+ # PolicyStrata Eval Card
2
+
3
+ PolicyStrata is a deterministic policy-regression environment for governed LLM data-agent stacks.
4
+ It is not an authorization boundary, a generic LLM leaderboard, or a claim of production incident
5
+ recall.
6
+
7
+ ## Scope
8
+
9
+ PolicyStrata evaluates whether authorization, semantic, database-containment, and release
10
+ obligations survive translation across policy-bearing surfaces:
11
+
12
+ - model-visible manifests;
13
+ - grammars and semantic IR;
14
+ - validators;
15
+ - SQL compilers;
16
+ - database controls;
17
+ - output-release checks.
18
+
19
+ The core artifact uses deterministic semantic plans and traces. It does not require an LLM API key.
20
+
21
+ ## Current Suites
22
+
23
+ | Suite | Provenance | Boundary |
24
+ | --- | --- | --- |
25
+ | `support_saas` seeded | public hand-authored fixture | regression coverage, not recall |
26
+ | `support_saas` generated | deterministic operator-generated cases | generated from the same public taxonomy |
27
+ | `support_saas` generated_alt_seed | secondary deterministic generated suite | reproducibility evidence, not blinded held-out evidence |
28
+ | `finance_saas` seeded | second synthetic built-in domain | reduces single-domain risk, still synthetic |
29
+
30
+ The current canonical evidence reports 620/620 killed non-equivalent mutants across these suites.
31
+ That figure measures coverage of the implemented deterministic operators and fixtures. It does
32
+ not mean PolicyStrata detects all real-world policy drift.
33
+
34
+ Run metadata records each suite's evidence level, provenance, and detector-freeze status. Future
35
+ externally authored, detector-frozen, or incident-reconstruction suites should be reported
36
+ separately from this 620-mutant public deterministic score.
37
+
38
+ ## Scanner Evidence Levels
39
+
40
+ Scanner findings carry evidence levels:
41
+
42
+ - `deterministic_fixture`: built-in or explicitly configured fixtures.
43
+ - `property_generated`: generated SQL/IR mutants over configured inputs.
44
+ - `imported_trace`: imported production or representative traces.
45
+ - `real_db`: PostgreSQL fixture or RLS observations through Python adapters.
46
+ - `blinded_suite`: externally authored or detector-frozen suites when provided.
47
+
48
+ These levels describe what was exercised. They are not confidence intervals for unknown production
49
+ faults.
50
+
51
+ ## Regression Gate Semantics
52
+
53
+ PolicyStrata scanner traces and state assertions may be labeled:
54
+
55
+ - `fail_to_pass`: known drift evidence should now be caught or contained.
56
+ - `pass_to_pass`: legitimate behavior should stay clean.
57
+ - `contain_to_contain`: a risky request should remain contained by a later layer.
58
+ - `deny_to_deny`: a forbidden request should remain denied.
59
+ - `allow_to_allow`: an authorized request should remain usable.
60
+ - `unclassified`: legacy or unlabeled imported evidence.
61
+
62
+ Release gates should not rely only on failing examples. A useful gate includes both
63
+ `fail_to_pass` evidence and `pass_to_pass`/`allow_to_allow` maintenance evidence so fixes do not
64
+ create over-restriction regressions.
65
+
66
+ ## Real Database Boundary
67
+
68
+ Deterministic benchmark runs simulate database effects. The scanner can optionally prepare a
69
+ Docker/PostgreSQL fixture, execute read-only imported SQL beside canonical compiler SQL, run RLS
70
+ checks, and evaluate state assertions over result rows. Host `psql` is not required.
71
+
72
+ The current real-DB fixture is a smoke test for containment and SQL behavior. It is not an
73
+ end-to-end dbt/warehouse execution harness and should not be represented as one.
74
+
75
+ ## Benchmark Integrity
76
+
77
+ Current limitations:
78
+
79
+ - no blinded externally authored held-out suite is shipped;
80
+ - no verified real incident reconstructions are shipped;
81
+ - synthetic domains may miss organization-specific policy nuance;
82
+ - generated mutants share the public operator taxonomy;
83
+ - baseline comparators are simple observability controls, not independent production test suites;
84
+ - bounded witness reduction is not full delta debugging or source-code root-cause localization.
85
+
86
+ External validation should follow `docs/external-suite-protocol.md` and, for real incidents,
87
+ `docs/incident-reconstruction-template.md`.
88
+
89
+ ## Model-In-The-Loop Use
90
+
91
+ Model-mediated experiments are a reachability layer on top of deterministic conformance. They
92
+ should report reliability separately from capability:
93
+
94
+ - `reachability@k`: at least one of `k` attempts reached a witness.
95
+ - `policy_pass^k`: all `k` independent attempts respected the policy.
96
+ - `release_safe^k`: all `k` independent attempts avoided unsafe release.
97
+
98
+ Do not mix these with deterministic mutant kill rate.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Raintree Technology
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,15 @@
1
+ include README.md
2
+ include action.yml
3
+ include EVAL_CARD.md
4
+ include CHANGELOG.md
5
+ include CONTRIBUTING.md
6
+ include SECURITY.md
7
+ include CITATION.cff
8
+ include LICENSE
9
+ include docker-compose.yml
10
+ include uv.lock
11
+ recursive-include docs *.md
12
+ recursive-include examples *.jsonl *.yaml *.yml
13
+ recursive-include scripts *.py *.sh
14
+ recursive-include tests *.py
15
+ recursive-include src/policystrata/domains *.sql *.yaml
@@ -0,0 +1,324 @@
1
+ Metadata-Version: 2.4
2
+ Name: policystrata
3
+ Version: 0.1.0
4
+ Summary: Cross-layer policy regression testing for LLM data-agent stacks
5
+ Author-email: Raintree Technology <support@raintree.technology>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/raintree-technology/policystrata
8
+ Project-URL: Repository, https://github.com/raintree-technology/policystrata
9
+ Project-URL: Documentation, https://github.com/raintree-technology/policystrata#readme
10
+ Project-URL: Paper, https://raintree.technology/papers
11
+ Project-URL: Changelog, https://github.com/raintree-technology/policystrata/blob/main/CHANGELOG.md
12
+ Project-URL: Issues, https://github.com/raintree-technology/policystrata/issues
13
+ Keywords: llm,text-to-sql,data-agents,policy-testing
14
+ Classifier: Development Status :: 3 - Alpha
15
+ Classifier: Environment :: Console
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Software Development :: Testing
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.10
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: pydantic>=2.0
29
+ Requires-Dist: pyyaml>=6.0
30
+ Requires-Dist: psycopg[binary]>=3.2
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=8.0; extra == "dev"
33
+ Requires-Dist: hypothesis>=6.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.8.0; extra == "dev"
35
+ Requires-Dist: mypy>=1.0; extra == "dev"
36
+ Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # PolicyStrata
40
+
41
+ PolicyStrata is a deterministic regression-testing framework for cross-layer policy drift in LLM
42
+ data-agent stacks.
43
+
44
+ It generates principals, requests, semantic plans, database states, lowered queries, and release
45
+ decisions; compares each layer against a canonical reference policy; and minimizes failures into
46
+ small reproducible witnesses.
47
+
48
+ Use it when you are building text-to-SQL agents, BI copilots, internal analytics agents, warehouse
49
+ chat systems, or governed enterprise LLM tools and need to know whether prompts, manifests,
50
+ semantic plans, validators, SQL compilers, database controls, and output filters still agree about
51
+ policy.
52
+
53
+ PolicyStrata is not an authorization boundary, and it is not another generic text-to-SQL benchmark.
54
+ It is a reproducible research artifact and regression gate for finding reachable disagreements
55
+ between layers.
56
+
57
+ ## Quick Start
58
+
59
+ From PyPI (run either command):
60
+
61
+ ```bash
62
+ uvx policystrata demo
63
+ pipx run policystrata demo
64
+ ```
65
+
66
+ From a source checkout:
67
+
68
+ ```bash
69
+ uv sync --extra dev
70
+ uv run policystrata demo
71
+ ```
72
+
73
+ The demo runs the built-in `support_saas` fixture, writes traces and minimized witnesses to
74
+ `runs/demo`, and prints the drift classes it found. Use `--out` to choose another output directory:
75
+
76
+ ```bash
77
+ uv run policystrata demo --out runs/demo
78
+ ```
79
+
80
+ No LLM API key is required for deterministic tests, benchmark runs, or the built-in demo.
81
+
82
+ ## Install
83
+
84
+ PolicyStrata is a CLI-first Python package. The public package provides the `policystrata` console
85
+ script and importable Python modules.
86
+
87
+ ```bash
88
+ python -m pip install policystrata
89
+ policystrata demo
90
+ ```
91
+
92
+ For one-off CLI use without managing an environment, run either command:
93
+
94
+ ```bash
95
+ uvx policystrata demo
96
+ pipx run policystrata demo
97
+ ```
98
+
99
+ Repository examples under `examples/`, Docker Compose fixtures, and evidence scripts are available
100
+ from a GitHub checkout or source distribution. The wheel installs the runtime package and built-in
101
+ domain fixtures used by `policystrata demo`, `run`, `init-domain`, and `scan`.
102
+
103
+ ## Use As A Template
104
+
105
+ Click **Use this template** on GitHub, then start with the deterministic fixtures:
106
+
107
+ ```bash
108
+ uv sync --extra dev
109
+ uv run policystrata run --domain support_saas --suite seeded --out runs/example
110
+ uv run policystrata summarize runs/example
111
+ ```
112
+
113
+ To copy a built-in domain fixture into your tree:
114
+
115
+ ```bash
116
+ uv run policystrata init-domain support_saas --out examples/my-policystrata-domain
117
+ ```
118
+
119
+ Keep custom integrations as adapters. The policy oracle should stay independent from SQL compiler
120
+ behavior, external eval frameworks, and model-provider behavior.
121
+
122
+ ## What It Tests
123
+
124
+ The core failure class is cross-layer policy drift:
125
+
126
+ ```text
127
+ Canonical policy:
128
+ Analysts may view tenant-scoped aggregate ticket counts, but not customer-level PII.
129
+
130
+ Model-visible manifest or grammar:
131
+ Accidentally exposes customer_email as a dimension.
132
+
133
+ SQL compiler:
134
+ Accidentally drops the tenant predicate while lowering an authorized aggregate.
135
+
136
+ Output layer:
137
+ Releases the result because the final answer looks like a summary.
138
+
139
+ PolicyStrata result:
140
+ A minimized witness localizes the violated layer and failed obligation.
141
+ ```
142
+
143
+ PolicyStrata does not assume every layer should behave identically. Each surface has a declared
144
+ responsibility:
145
+
146
+ - `manifest`: expose model-visible capabilities without stale or forbidden options.
147
+ - `grammar`: parse the declared intent space and preserve untrusted intent for validation.
148
+ - `validator`: authorize semantic queries and bind principal, tenant, time, and budget obligations.
149
+ - `compiler`: lower authorized semantic IR into SQL while preserving metric, tenant, time, and row
150
+ obligations.
151
+ - `database`: contain row access with RLS and other database-side controls.
152
+ - `release`: withhold contained or unauthorized results.
153
+
154
+ See [docs/failure-taxonomy.md](docs/failure-taxonomy.md) for how witness classes map to concrete
155
+ policy-drift failures.
156
+
157
+ ## Run Benchmarks
158
+
159
+ PolicyStrata ships with deterministic `support_saas` and `finance_saas` benchmarks, generated
160
+ mutation suites, minimized witnesses, JSONL traces, baseline comparisons, and evidence tables.
161
+
162
+ ```bash
163
+ uv run policystrata run --domain support_saas --suite seeded --out runs/example
164
+ uv run policystrata run \
165
+ --domain support_saas \
166
+ --suite generated \
167
+ --count 500 \
168
+ --seed 1729 \
169
+ --out runs/generated
170
+ uv run policystrata run --domain finance_saas --suite seeded --out runs/finance
171
+ uv run policystrata baselines runs/example
172
+ ```
173
+
174
+ The default `run` command writes:
175
+
176
+ ```text
177
+ runs/<id>/traces.jsonl
178
+ runs/<id>/summary.json
179
+ runs/<id>/metadata.json
180
+ runs/<id>/witnesses/*.json
181
+ ```
182
+
183
+ `metadata.json` records the mutation operator set, suite provenance, evidence level, and
184
+ detector-freeze status. Static suite YAML can declare `suite_metadata` so externally authored,
185
+ detector-frozen, or incident-reconstruction cases stay separate from public/generated benchmark
186
+ scores.
187
+
188
+ Regenerate paper-style evidence tables with:
189
+
190
+ ```bash
191
+ scripts/reproduce-evidence.sh
192
+ ```
193
+
194
+ Generate reviewer-facing artifact metrics for a run:
195
+
196
+ ```bash
197
+ uv run policystrata artifact-report runs/repro/seeded
198
+ ```
199
+
200
+ Current benchmark details are in [docs/evidence.md](docs/evidence.md), with methodology and claim
201
+ boundaries in [docs/methodology.md](docs/methodology.md) and [EVAL_CARD.md](EVAL_CARD.md).
202
+
203
+ ## Run The Scanner
204
+
205
+ `policystrata scan` is the production-oriented path. It treats PolicyStrata as a scanner and
206
+ release gate, not as the authorization boundary.
207
+
208
+ Clean smoke test:
209
+
210
+ ```bash
211
+ uv run policystrata scan --config examples/postgres_dbt/policystrata_clean.yaml --out runs/scan-clean
212
+ ```
213
+
214
+ Intentional gate-failure fixture:
215
+
216
+ ```bash
217
+ uv run policystrata scan --config examples/postgres_dbt/policystrata.yaml --out runs/scan
218
+ ```
219
+
220
+ That fixture should exit `1` because it contains imported traces with known authorization,
221
+ unsafe-release, and tenant-scope findings.
222
+
223
+ Scanner outputs include:
224
+
225
+ ```text
226
+ runs/scan-clean/scan.json
227
+ runs/scan-clean/findings.jsonl
228
+ runs/scan-clean/summary.json
229
+ runs/scan-clean/report.md
230
+ runs/scan-clean/witnesses/*.json
231
+ runs/scan-clean/scan.sarif # when sarif: true
232
+ ```
233
+
234
+ For a scanner run that also executes imported SQL beside canonical compiler SQL against the
235
+ Docker/PostgreSQL fixture:
236
+
237
+ ```bash
238
+ docker compose up -d postgres
239
+ uv run policystrata scan --config examples/postgres_dbt/policystrata_real_db_clean.yaml --out runs/scan-real-db-clean
240
+ ```
241
+
242
+ Postgres access goes through Python/`psycopg`; host `psql` is not required. See
243
+ [docs/scanner.md](docs/scanner.md) for scanner configuration, gate behavior, state assertions, and
244
+ real-database fixture details.
245
+
246
+ ## GitHub Action
247
+
248
+ Use the first-party action to run `policystrata scan` as a pull-request or release gate:
249
+
250
+ ```yaml
251
+ name: PolicyStrata
252
+
253
+ on:
254
+ pull_request:
255
+ push:
256
+ branches: [main]
257
+
258
+ jobs:
259
+ scan:
260
+ runs-on: ubuntu-latest
261
+ steps:
262
+ - uses: actions/checkout@v4
263
+
264
+ - uses: raintree-technology/policystrata@v0.1.0
265
+ with:
266
+ config: policystrata.yaml
267
+ out: runs/policystrata
268
+ ```
269
+
270
+ See [docs/github-action.md](docs/github-action.md) for inputs, artifact upload, and database
271
+ fixture guidance.
272
+
273
+ ## Integrations And Exports
274
+
275
+ PolicyStrata keeps core execution independent from external eval frameworks. Adapter exports are
276
+ available for downstream systems:
277
+
278
+ ```bash
279
+ uv run policystrata export runs/example --format inspect --out runs/example/inspect.jsonl
280
+ uv run policystrata export runs/example --format benchflow --out runs/example/benchflow.json
281
+ ```
282
+
283
+ The repo also includes a small dbt Semantic Layer adapter and fixture:
284
+
285
+ ```bash
286
+ uv run policystrata check-integration dbt-semantic \
287
+ --domain finance_saas \
288
+ --path examples/integrations/dbt_semantic/finance_saas/semantic_models.yml
289
+ ```
290
+
291
+ See [docs/trace-interop.md](docs/trace-interop.md) for adapter field mappings.
292
+
293
+ ## Reference Docs
294
+
295
+ - [docs/benchmark-reference.md](docs/benchmark-reference.md): domains, generated mutants,
296
+ baselines, and witness shape.
297
+ - [docs/scanner.md](docs/scanner.md): scanner inputs, gates, state assertions, and PostgreSQL
298
+ fixture use.
299
+ - [docs/github-action.md](docs/github-action.md): CI wrapper for `policystrata scan`.
300
+ - [docs/distribution-roadmap.md](docs/distribution-roadmap.md): rollout sequence for the CLI,
300
+ GitHub Action, SDK, MCP, and GitHub CLI extension.
302
+ - [docs/evidence.md](docs/evidence.md): current evidence snapshot and reproduction commands.
303
+ - [docs/methodology.md](docs/methodology.md): claims, limitations, mutant definitions, and witness
304
+ minimization.
305
+ - [EVAL_CARD.md](EVAL_CARD.md): benchmark provenance, evidence levels, and eval boundaries.
306
+ - [docs/open-source-commercial-strategy.md](docs/open-source-commercial-strategy.md): packaging and
307
+ product boundary.
308
+
309
+ ## Development
310
+
311
+ ```bash
312
+ uv run pytest
313
+ uv run ruff check .
314
+ uv run mypy src
315
+ ```
316
+
317
+ The built-in `support_saas` domain is deterministic and seed-driven. Preserve JSON/YAML trace
318
+ stability when extending artifacts; add fields compatibly.
319
+
320
+ ## Status
321
+
322
+ PolicyStrata is an early research artifact. It is useful for reproducing the paper's core failure
323
+ model and for building regression gates around real stacks. It does not prove recall on unknown
324
+ production incidents, and it should not be represented as a production security scanner by itself.