dataxplan 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. dataxplan-0.1.0/LICENSE +21 -0
  2. dataxplan-0.1.0/PKG-INFO +210 -0
  3. dataxplan-0.1.0/README.md +178 -0
  4. dataxplan-0.1.0/dataxplan/__init__.py +45 -0
  5. dataxplan-0.1.0/dataxplan/__main__.py +6 -0
  6. dataxplan-0.1.0/dataxplan/_result.py +83 -0
  7. dataxplan-0.1.0/dataxplan/_version.py +1 -0
  8. dataxplan-0.1.0/dataxplan/cli.py +66 -0
  9. dataxplan-0.1.0/dataxplan/compare.py +115 -0
  10. dataxplan-0.1.0/dataxplan/context.py +62 -0
  11. dataxplan-0.1.0/dataxplan/findings.py +164 -0
  12. dataxplan-0.1.0/dataxplan/metrics.py +104 -0
  13. dataxplan-0.1.0/dataxplan/parse.py +195 -0
  14. dataxplan-0.1.0/dataxplan/py.typed +0 -0
  15. dataxplan-0.1.0/dataxplan/render.py +83 -0
  16. dataxplan-0.1.0/dataxplan/report.py +112 -0
  17. dataxplan-0.1.0/dataxplan/run.py +43 -0
  18. dataxplan-0.1.0/dataxplan.egg-info/PKG-INFO +210 -0
  19. dataxplan-0.1.0/dataxplan.egg-info/SOURCES.txt +33 -0
  20. dataxplan-0.1.0/dataxplan.egg-info/dependency_links.txt +1 -0
  21. dataxplan-0.1.0/dataxplan.egg-info/entry_points.txt +2 -0
  22. dataxplan-0.1.0/dataxplan.egg-info/requires.txt +9 -0
  23. dataxplan-0.1.0/dataxplan.egg-info/top_level.txt +1 -0
  24. dataxplan-0.1.0/pyproject.toml +53 -0
  25. dataxplan-0.1.0/setup.cfg +4 -0
  26. dataxplan-0.1.0/tests/test_cli.py +48 -0
  27. dataxplan-0.1.0/tests/test_compare.py +36 -0
  28. dataxplan-0.1.0/tests/test_context.py +25 -0
  29. dataxplan-0.1.0/tests/test_examples.py +28 -0
  30. dataxplan-0.1.0/tests/test_findings.py +39 -0
  31. dataxplan-0.1.0/tests/test_metrics.py +47 -0
  32. dataxplan-0.1.0/tests/test_parse.py +46 -0
  33. dataxplan-0.1.0/tests/test_render.py +25 -0
  34. dataxplan-0.1.0/tests/test_report.py +49 -0
  35. dataxplan-0.1.0/tests/test_run.py +58 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Atakan Arikan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,210 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataxplan
3
+ Version: 0.1.0
4
+ Summary: Read PostgreSQL EXPLAIN plans locally: parse the plan, compute self time and estimation error, flag documented problems, compare plans, and guard them in CI. No database connection, deterministic.
5
+ Author: Atakan Arikan
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/arikanatakan/dataxplan
8
+ Project-URL: Repository, https://github.com/arikanatakan/dataxplan
9
+ Project-URL: Issues, https://github.com/arikanatakan/dataxplan/issues
10
+ Keywords: postgresql,postgres,explain,query-plan,performance,query-optimization,database,sql,explain-analyze,dba
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Database
20
+ Classifier: Topic :: Software Development :: Quality Assurance
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Provides-Extra: viz
25
+ Requires-Dist: matplotlib>=3.5; extra == "viz"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7; extra == "dev"
28
+ Requires-Dist: ruff; extra == "dev"
29
+ Requires-Dist: build; extra == "dev"
30
+ Requires-Dist: matplotlib>=3.5; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # dataxplan
34
+
35
+ [![CI](https://github.com/arikanatakan/dataxplan/actions/workflows/ci.yml/badge.svg)](https://github.com/arikanatakan/dataxplan/actions/workflows/ci.yml)
36
+ [![PyPI](https://img.shields.io/pypi/v/dataxplan?v=1)](https://pypi.org/project/dataxplan/)
37
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
38
+
39
+ Read PostgreSQL `EXPLAIN` plans from Python: parse the plan, compute the numbers
40
+ people misread (self time and estimation error), flag documented problems,
41
+ compare plans, and guard them in CI. **No database connection, nothing leaves
42
+ your machine, deterministic output.**
43
+
44
+ You give it the output of `EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ...`; it does
45
+ the rest locally.
46
+
47
+ ![dataxplan framework: an EXPLAIN JSON plan (and optional catalog context) flows through parse, metrics and findings into a Report you can summarise, assert on in CI, turn into JSON, compare against another plan, or render as a text tree or chart; no database connection and deterministic](assets/framework.png)
48
+
49
+ It turns a plan into a deterministic read (here a query whose join was estimated
50
+ at 5 rows but produced 500,000):
51
+
52
+ ```text
53
+ dataxplan
54
+ execution time 1,505.00 ms (planning 0.30 ms)
55
+ nodes 3, depth 1
56
+ worst row estimate 100000x off
57
+ top by self time:
58
+ Index Scan on b 1,000.00 ms (66%)
59
+ Nested Loop 450.00 ms (30%)
60
+ findings:
61
+ [HIGH] Row estimate is far off (Nested Loop)
62
+ estimated 5 rows, actual 500,000 (100000x under-estimate)
63
+ -> run ANALYZE; if the columns are correlated consider extended statistics
64
+ [MEDIUM] Nested loop with many iterations (Nested Loop)
65
+ the inner side executed 500,000 times
66
+ -> usually an under-estimate upstream; a hash or merge join may be cheaper
67
+ ```
68
+
69
+ ## Why
70
+
71
+ Reading a plan by hand is error-prone (the reported time is per-loop and inclusive of
72
+ children, so the slow node is rarely the obvious one). The good tools that do
73
+ this are web pastebins (your production plan leaves your machine) or commercial
74
+ SaaS. dataxplan is local, free, programmatic and embeddable: run it in a script,
75
+ a notebook, your CI, or later an MCP server, and keep the plan in your own
76
+ environment.
77
+
78
+ ```bash
79
+ pip install dataxplan
80
+ ```
81
+
82
+ No runtime dependencies. The chart is optional (`pip install "dataxplan[viz]"`).
83
+
84
+ ## Quick start
85
+
86
+ ```python
87
+ import dataxplan
88
+
89
+ report = dataxplan.analyze(explain_json) # the EXPLAIN (FORMAT JSON) output
90
+ print(report.summary()) # the summary shown above
91
+ ```
92
+
93
+ ### From the command line
94
+
95
+ ```bash
96
+ dataxplan plan.json # summary
97
+ dataxplan plan.json --tree # also the annotated plan tree
98
+ dataxplan plan.json --json # the full report as JSON
99
+ dataxplan before.json --compare after.json
100
+ psql -XqAt -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) <query>" | dataxplan
101
+ ```
102
+
103
+ ### Guard a plan in CI
104
+
105
+ Pin a critical query's plan in your test suite, so a code or schema change that
106
+ makes it regress fails the build. Nothing else in Python does this.
107
+
108
+ ```python
109
+ def test_orders_lookup_stays_fast():
110
+ report = dataxplan.analyze(get_explain("SELECT * FROM orders WHERE customer_id = %s"))
111
+ assert not report.has_seq_scan_on("orders")
112
+ assert report.max_estimation_error < 100
113
+ assert not report.spilled_to_disk
114
+ ```
115
+
116
+ ### Compare two plans (before / after an index)
117
+
118
+ ```python
119
+ print(dataxplan.compare(before_json, after_json).summary())
120
+ # dataxplan compare - IMPROVED
121
+ # execution time 905.00 ms -> 0.08 ms (-100%)
122
+ # resolved filter_discard, seq_scan_hot
123
+ ```
124
+
125
+ ### Sharper findings with catalog context (optional)
126
+
127
+ ```python
128
+ from dataxplan import Context, TableInfo
129
+ ctx = Context(tables={"orders": TableInfo("orders", row_count=10_000_000,
130
+ indexed_columns=("id",))})
131
+ dataxplan.analyze(explain_json, context=ctx)
132
+ ```
133
+
134
+ ### Fetch a plan from a connection you already have (optional)
135
+
136
+ ```python
137
+ plan = dataxplan.run_explain(conn, "SELECT * FROM orders WHERE id = %s", params=(42,))
138
+ dataxplan.analyze(plan)
139
+ ```
140
+
141
+ `run_explain` calls `cursor.execute` on a DB-API connection you pass (psycopg,
142
+ psycopg2, ...); dataxplan does not depend on any driver. With `analyze=True` it
143
+ runs the query, so use `analyze=False` for a plan-only estimate.
144
+
145
+ ## What it covers
146
+
147
+ | Area | What you get |
148
+ | --- | --- |
149
+ | Parse | `parse` -> a typed `Plan` / `PlanNode` tree from EXPLAIN (FORMAT JSON) |
150
+ | Metrics | self (exclusive) time, % of total, estimation error, disk spills, buffers |
151
+ | Findings | hot sequential scans, large row mis-estimates, disk spills, filter discards, nested-loop blow-ups, index-only heap fetches, lossy bitmaps, JIT overhead |
152
+ | Report | `summary`, `to_dict`, and an assertion API (`has_seq_scan_on`, `max_estimation_error`, `spilled_to_disk`, `ok`) for CI |
153
+ | Compare | `compare` two plans for regression (timing, shape, estimates, findings) |
154
+ | Render | `text_tree` (annotated, dependency-free) and `plan_tree_chart` (needs matplotlib) |
155
+ | Context | optional catalog metadata (sizes, indexes, stale stats) that sharpens findings |
156
+ | CLI | `dataxplan plan.json` (or stdin): summary, `--tree`, `--json`, `--compare` |
157
+
158
+ ## Examples
159
+
160
+ Four plans from public datasets and benchmarks, each showing a different problem,
161
+ are in [`examples/`](examples/): the IMDB / Join Order Benchmark (a row
162
+ mis-estimate), the NYC TLC taxi trips (a sort that spills to disk), TPC-H
163
+ `lineitem` (a hot scan discarding most rows), and the Bosch Production Line
164
+ Performance manufacturing data set (a hash join with a large mis-estimate). For
165
+ instance:
166
+
167
+ ```bash
168
+ dataxplan examples/job_imdb_misestimate.json
169
+ ```
170
+
171
+ ## What is out of scope
172
+
173
+ dataxplan analyses the **plan you give it**. By default it does not connect to a
174
+ database, run your queries, or read your schema, so a finding is a **documented
175
+ heuristic, not a guarantee**, and the suggestions are based on the plan alone. It
176
+ does not rewrite SQL or invent a cost model. It targets PostgreSQL
177
+ `FORMAT JSON` output (MySQL may follow).
178
+
179
+ ## How the headline metrics work
180
+
181
+ - **Self time.** Postgres reports `Actual Total Time` per loop and inclusive of
182
+ children, so a node's total is `Actual Total Time x Actual Loops`, and its
183
+ self time is that minus the children's totals. Self time is where the work
184
+ really happens.
185
+ - **Estimation error.** `Plan Rows` against `Actual Rows` (per loop); a large
186
+ ratio is the usual root cause of a bad plan.
187
+
188
+ ## References and validation
189
+
190
+ The metric arithmetic is verified by hand against the semantics PostgreSQL
191
+ documents (see [`tests/`](tests/)), and the heuristics are grounded in primary
192
+ and academic sources, written in our own words:
193
+
194
+ - [PostgreSQL documentation: EXPLAIN](https://www.postgresql.org/docs/current/sql-explain.html)
195
+ - [PostgreSQL wiki: Using EXPLAIN](https://wiki.postgresql.org/wiki/Using_EXPLAIN)
196
+ - V. Leis, A. Gubichev, A. Mirchev, P. Boncz, A. Kemper, T. Neumann, "How Good
197
+ Are Query Optimizers, Really?", Proceedings of the VLDB Endowment 9(3), 2015 -
198
+ the study of cardinality mis-estimation behind the Join Order Benchmark, which
199
+ the `estimate_off` finding detects (see the JOB example in
200
+ [`examples/`](examples/)).
201
+
202
+ Plan analysis has no single numeric ground truth the way a closed-form formula
203
+ does, so the claim here is deliberately narrow: the parsing and arithmetic are
204
+ correct against the documented format, and each heuristic cites the behaviour it
205
+ relies on.
206
+
207
+ ## License
208
+
209
+ MIT. Written and maintained by [Atakan Arikan](https://github.com/arikanatakan),
210
+ MSc Student at Tsinghua University and Politecnico di Milano.
@@ -0,0 +1,178 @@
1
+ # dataxplan
2
+
3
+ [![CI](https://github.com/arikanatakan/dataxplan/actions/workflows/ci.yml/badge.svg)](https://github.com/arikanatakan/dataxplan/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/dataxplan?v=1)](https://pypi.org/project/dataxplan/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
6
+
7
+ Read PostgreSQL `EXPLAIN` plans from Python: parse the plan, compute the numbers
8
+ people misread (self time and estimation error), flag documented problems,
9
+ compare plans, and guard them in CI. **No database connection, nothing leaves
10
+ your machine, deterministic output.**
11
+
12
+ You give it the output of `EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ...`; it does
13
+ the rest locally.
14
+
15
+ ![dataxplan framework: an EXPLAIN JSON plan (and optional catalog context) flows through parse, metrics and findings into a Report you can summarise, assert on in CI, turn into JSON, compare against another plan, or render as a text tree or chart; no database connection and deterministic](assets/framework.png)
16
+
17
+ It turns a plan into a deterministic read (here a query whose join was estimated
18
+ at 5 rows but produced 500,000):
19
+
20
+ ```text
21
+ dataxplan
22
+ execution time 1,505.00 ms (planning 0.30 ms)
23
+ nodes 3, depth 1
24
+ worst row estimate 100000x off
25
+ top by self time:
26
+ Index Scan on b 1,000.00 ms (66%)
27
+ Nested Loop 450.00 ms (30%)
28
+ findings:
29
+ [HIGH] Row estimate is far off (Nested Loop)
30
+ estimated 5 rows, actual 500,000 (100000x under-estimate)
31
+ -> run ANALYZE; if the columns are correlated consider extended statistics
32
+ [MEDIUM] Nested loop with many iterations (Nested Loop)
33
+ the inner side executed 500,000 times
34
+ -> usually an under-estimate upstream; a hash or merge join may be cheaper
35
+ ```
36
+
37
+ ## Why
38
+
39
+ Reading a plan by hand is error-prone (the reported time is per-loop and inclusive of
40
+ children, so the slow node is rarely the obvious one). The good tools that do
41
+ this are web pastebins (your production plan leaves your machine) or commercial
42
+ SaaS. dataxplan is local, free, programmatic and embeddable: run it in a script,
43
+ a notebook, your CI, or later an MCP server, and keep the plan in your own
44
+ environment.
45
+
46
+ ```bash
47
+ pip install dataxplan
48
+ ```
49
+
50
+ No runtime dependencies. The chart is optional (`pip install "dataxplan[viz]"`).
51
+
52
+ ## Quick start
53
+
54
+ ```python
55
+ import dataxplan
56
+
57
+ report = dataxplan.analyze(explain_json) # the EXPLAIN (FORMAT JSON) output
58
+ print(report.summary()) # the summary shown above
59
+ ```
60
+
61
+ ### From the command line
62
+
63
+ ```bash
64
+ dataxplan plan.json # summary
65
+ dataxplan plan.json --tree # also the annotated plan tree
66
+ dataxplan plan.json --json # the full report as JSON
67
+ dataxplan before.json --compare after.json
68
+ psql -XqAt -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) <query>" | dataxplan
69
+ ```
70
+
71
+ ### Guard a plan in CI
72
+
73
+ Pin a critical query's plan in your test suite, so a code or schema change that
74
+ makes it regress fails the build. Nothing else in Python does this.
75
+
76
+ ```python
77
+ def test_orders_lookup_stays_fast():
78
+ report = dataxplan.analyze(get_explain("SELECT * FROM orders WHERE customer_id = %s"))
79
+ assert not report.has_seq_scan_on("orders")
80
+ assert report.max_estimation_error < 100
81
+ assert not report.spilled_to_disk
82
+ ```
83
+
84
+ ### Compare two plans (before / after an index)
85
+
86
+ ```python
87
+ print(dataxplan.compare(before_json, after_json).summary())
88
+ # dataxplan compare - IMPROVED
89
+ # execution time 905.00 ms -> 0.08 ms (-100%)
90
+ # resolved filter_discard, seq_scan_hot
91
+ ```
92
+
93
+ ### Sharper findings with catalog context (optional)
94
+
95
+ ```python
96
+ from dataxplan import Context, TableInfo
97
+ ctx = Context(tables={"orders": TableInfo("orders", row_count=10_000_000,
98
+ indexed_columns=("id",))})
99
+ dataxplan.analyze(explain_json, context=ctx)
100
+ ```
101
+
102
+ ### Fetch a plan from a connection you already have (optional)
103
+
104
+ ```python
105
+ plan = dataxplan.run_explain(conn, "SELECT * FROM orders WHERE id = %s", params=(42,))
106
+ dataxplan.analyze(plan)
107
+ ```
108
+
109
+ `run_explain` calls `cursor.execute` on a DB-API connection you pass (psycopg,
110
+ psycopg2, ...); dataxplan does not depend on any driver. With `analyze=True` it
111
+ runs the query, so use `analyze=False` for a plan-only estimate.
112
+
113
+ ## What it covers
114
+
115
+ | Area | What you get |
116
+ | --- | --- |
117
+ | Parse | `parse` -> a typed `Plan` / `PlanNode` tree from EXPLAIN (FORMAT JSON) |
118
+ | Metrics | self (exclusive) time, % of total, estimation error, disk spills, buffers |
119
+ | Findings | hot sequential scans, large row mis-estimates, disk spills, filter discards, nested-loop blow-ups, index-only heap fetches, lossy bitmaps, JIT overhead |
120
+ | Report | `summary`, `to_dict`, and an assertion API (`has_seq_scan_on`, `max_estimation_error`, `spilled_to_disk`, `ok`) for CI |
121
+ | Compare | `compare` two plans for regression (timing, shape, estimates, findings) |
122
+ | Render | `text_tree` (annotated, dependency-free) and `plan_tree_chart` (needs matplotlib) |
123
+ | Context | optional catalog metadata (sizes, indexes, stale stats) that sharpens findings |
124
+ | CLI | `dataxplan plan.json` (or stdin): summary, `--tree`, `--json`, `--compare` |
125
+
126
+ ## Examples
127
+
128
+ Four plans from public datasets and benchmarks, each showing a different problem,
129
+ are in [`examples/`](examples/): the IMDB / Join Order Benchmark (a row
130
+ mis-estimate), the NYC TLC taxi trips (a sort that spills to disk), TPC-H
131
+ `lineitem` (a hot scan discarding most rows), and the Bosch Production Line
132
+ Performance manufacturing data set (a hash join with a large mis-estimate). For
133
+ instance:
134
+
135
+ ```bash
136
+ dataxplan examples/job_imdb_misestimate.json
137
+ ```
138
+
139
+ ## What is out of scope
140
+
141
+ dataxplan analyses the **plan you give it**. By default it does not connect to a
142
+ database, run your queries, or read your schema, so a finding is a **documented
143
+ heuristic, not a guarantee**, and the suggestions are based on the plan alone. It
144
+ does not rewrite SQL or invent a cost model. It targets PostgreSQL
145
+ `FORMAT JSON` output (MySQL may follow).
146
+
147
+ ## How the headline metrics work
148
+
149
+ - **Self time.** Postgres reports `Actual Total Time` per loop and inclusive of
150
+ children, so a node's total is `Actual Total Time x Actual Loops`, and its
151
+ self time is that minus the children's totals. Self time is where the work
152
+ really happens.
153
+ - **Estimation error.** `Plan Rows` against `Actual Rows` (per loop); a large
154
+ ratio is the usual root cause of a bad plan.
155
+
156
+ ## References and validation
157
+
158
+ The metric arithmetic is verified by hand against the semantics PostgreSQL
159
+ documents (see [`tests/`](tests/)), and the heuristics are grounded in primary
160
+ and academic sources, written in our own words:
161
+
162
+ - [PostgreSQL documentation: EXPLAIN](https://www.postgresql.org/docs/current/sql-explain.html)
163
+ - [PostgreSQL wiki: Using EXPLAIN](https://wiki.postgresql.org/wiki/Using_EXPLAIN)
164
+ - V. Leis, A. Gubichev, A. Mirchev, P. Boncz, A. Kemper, T. Neumann, "How Good
165
+ Are Query Optimizers, Really?", Proceedings of the VLDB Endowment 9(3), 2015 -
166
+ the study of cardinality mis-estimation behind the Join Order Benchmark, which
167
+ the `estimate_off` finding detects (see the JOB example in
168
+ [`examples/`](examples/)).
169
+
170
+ Plan analysis has no single numeric ground truth the way a closed-form formula
171
+ does, so the claim here is deliberately narrow: the parsing and arithmetic are
172
+ correct against the documented format, and each heuristic cites the behaviour it
173
+ relies on.
174
+
175
+ ## License
176
+
177
+ MIT. Written and maintained by [Atakan Arikan](https://github.com/arikanatakan),
178
+ MSc Student at Tsinghua University and Politecnico di Milano.
@@ -0,0 +1,45 @@
1
+ """dataxplan - read PostgreSQL EXPLAIN plans, locally and deterministically.
2
+
3
+ Give it the output of ``EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ...`` and it
4
+ parses the plan, computes the metrics people misread (self time, estimation
5
+ error, disk spills), and flags documented problems. No database connection is
6
+ required and nothing leaves your machine.
7
+
8
+ import dataxplan
9
+
10
+ report = dataxplan.analyze(explain_json)
11
+ print(report.summary())
12
+
13
+ # guard a plan in a test (fail CI if it regresses)
14
+ assert not report.has_seq_scan_on("orders")
15
+ assert report.max_estimation_error < 100
16
+ assert not report.spilled_to_disk
17
+
18
+ # compare two plans (before/after an index)
19
+ print(dataxplan.compare(before_json, after_json).summary())
20
+
21
+ The findings are documented heuristics, not guarantees, and the analysis is of
22
+ the plan you provide; it does not run your queries or read your schema unless you
23
+ choose to supply catalog context.
24
+ """
25
+
26
+ from ._result import Finding
27
+ from ._version import __version__
28
+ from .compare import Comparison, compare
29
+ from .context import Context, TableInfo
30
+ from .metrics import NodeMetrics
31
+ from .parse import Plan, PlanNode, parse
32
+ from .render import plan_tree_chart, text_tree
33
+ from .report import Report, analyze
34
+ from .run import run_explain
35
+
36
+ __all__ = [
37
+ # core flow
38
+ "parse", "analyze", "compare",
39
+ # types
40
+ "Plan", "PlanNode", "Report", "NodeMetrics", "Finding", "Comparison",
41
+ "Context", "TableInfo",
42
+ # render and helpers
43
+ "text_tree", "plan_tree_chart", "run_explain",
44
+ "__version__",
45
+ ]
@@ -0,0 +1,6 @@
1
+ import sys
2
+
3
+ from .cli import main
4
+
5
# Entry point for ``python -m dataxplan``: run the CLI and exit with its status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
@@ -0,0 +1,83 @@
1
+ """Shared plumbing: provenance (version, input hash, timestamp), the ``Finding``
2
+ type, and small formatting helpers. Every public result carries a meta block so
3
+ an analysis can be reproduced and audited later.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ import json
10
+ from dataclasses import dataclass
11
+ from datetime import datetime, timezone
12
+
13
+ from ._version import __version__
14
+
15
SCHEMA = 1

# Severity labels a Finding can carry, listed from most to least serious.
HIGH = "high"
MEDIUM = "medium"
LOW = "low"
INFO = "info"
# Sort key per severity: its position in the most-to-least-serious listing.
_ORDER = {level: i for i, level in enumerate((HIGH, MEDIUM, LOW, INFO))}
23
+
24
+
25
def utcnow() -> str:
    """Return the current UTC time as an ISO 8601 string, to the second."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat(timespec="seconds")
27
+
28
+
29
def data_hash(obj: object) -> str:
    """Return a short fingerprint of ``obj``.

    The object is serialised to JSON with sorted keys (values JSON cannot
    encode are passed through ``str``), hashed with SHA-256, and the first 16
    hex digits are returned behind a ``sha256:`` prefix.
    """
    canonical = json.dumps(obj, sort_keys=True, default=str)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return f"sha256:{digest[:16]}"
32
+
33
+
34
def make_meta(inputs: dict) -> dict:
    """Build the provenance block stamped onto every result.

    The block names the library and its version, records when it was computed
    (``utcnow``), and carries a fingerprint of ``inputs`` (``data_hash``).
    """
    meta = {"library": "dataxplan", "version": __version__}
    meta["computed_at"] = utcnow()
    meta["input_hash"] = data_hash(inputs)
    return meta
42
+
43
+
44
@dataclass(frozen=True)
class Finding:
    """One observation about a plan: a documented heuristic, not a guarantee."""

    id: str
    severity: str  # high | medium | low | info
    title: str
    detail: str
    node: str | None = None  # e.g. "Seq Scan on orders"
    # presumably the node's position in the plan tree - confirm against the parser
    path: tuple[int, ...] | None = None
    suggestion: str | None = None

    @property
    def rank(self) -> int:
        """Sort key for the severity; an unrecognised severity ranks as 9."""
        return _ORDER.get(self.severity, 9)

    def __str__(self) -> str:
        """Render the finding as text: heading, detail, then any suggestion."""
        head = f"[{self.severity.upper()}] {self.title}"
        if self.node:
            head += f" ({self.node})"
        lines = [head, f" {self.detail}"]
        if self.suggestion:
            lines.append(f" -> {self.suggestion}")
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Return the finding as a plain dict suitable for JSON output.

        ``path`` becomes a list. Only a missing path (``None``) maps to
        ``None``; an empty path is kept as ``[]`` so the two stay distinct.
        """
        return {
            "id": self.id, "severity": self.severity, "title": self.title,
            "detail": self.detail, "node": self.node,
            "path": None if self.path is None else list(self.path),
            "suggestion": self.suggestion,
        }
76
+
77
+
78
def ms(value: float | None) -> str:
    """Format a millisecond value with two decimals; ``None`` renders as ``-``."""
    if value is None:
        return "-"
    return f"{value:,.2f} ms"
80
+
81
+
82
def num(value: float | None, places: int = 0) -> str:
    """Format a number with thousands separators and ``places`` decimals.

    ``None`` renders as ``-``.
    """
    if value is None:
        return "-"
    return format(value, f",.{places}f")
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,66 @@
1
+ """Command-line interface: analyse a plan from a file or stdin.
2
+
3
+ dataxplan plan.json
4
+ dataxplan plan.json --tree
5
+ dataxplan plan.json --json
6
+ dataxplan before.json --compare after.json
7
+ psql -XqAt -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ..." | dataxplan
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+
16
+ from . import analyze, compare, text_tree
17
+ from ._version import __version__
18
+
19
+
20
def _read(source: str | None) -> str:
    """Return the plan text named by ``source``.

    ``None`` or ``"-"`` reads standard input; any other value is opened as a
    UTF-8 text file. An ``OSError`` from ``open`` propagates to the caller.
    """
    if source is None or source == "-":
        return sys.stdin.read()
    with open(source, encoding="utf-8") as handle:
        return handle.read()
25
+
26
+
27
def main(argv=None) -> int:
    """Run the command-line interface and return the process exit status.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success. 2 when reading, decoding, comparing or analysing the
        input raises ``ValueError``, ``TypeError`` or ``OSError``; the message
        is printed to stderr.
    """
    parser = argparse.ArgumentParser(
        prog="dataxplan",
        description="Analyse a PostgreSQL EXPLAIN (FORMAT JSON) plan, locally.")
    parser.add_argument("plan", nargs="?", default="-",
                        help="plan file, or - for stdin (the default)")
    parser.add_argument("--tree", action="store_true",
                        help="also print the annotated plan tree")
    parser.add_argument("--json", action="store_true",
                        help="print the full report (or comparison) as JSON")
    parser.add_argument("--compare", metavar="OTHER",
                        help="compare the plan against another plan file")
    parser.add_argument("--version", action="version",
                        version=f"dataxplan {__version__}")
    args = parser.parse_args(argv)

    # json.JSONDecodeError subclasses ValueError, so ValueError already covers
    # malformed JSON input.
    try:
        plan = json.loads(_read(args.plan))
        if args.compare:
            result = compare(plan, json.loads(_read(args.compare)))
            print(json.dumps(result.to_dict(), indent=2, default=str)
                  if args.json else result.summary())
            return 0
        report = analyze(plan)
    except (ValueError, TypeError, OSError) as exc:
        print(f"dataxplan: {exc}", file=sys.stderr)
        return 2

    if args.json:
        # Machine-readable mode: emit only the JSON document so the output
        # stays parseable.
        print(json.dumps(report.to_dict(), indent=2, default=str))
        return 0

    print(report.summary())
    if args.tree:
        print("\nplan tree:")
        print(text_tree(report))
    return 0
63
+
64
+
65
# Allow running this module directly as a script: exit with the CLI's status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)