dataxplan 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataxplan-0.1.0/LICENSE +21 -0
- dataxplan-0.1.0/PKG-INFO +210 -0
- dataxplan-0.1.0/README.md +178 -0
- dataxplan-0.1.0/dataxplan/__init__.py +45 -0
- dataxplan-0.1.0/dataxplan/__main__.py +6 -0
- dataxplan-0.1.0/dataxplan/_result.py +83 -0
- dataxplan-0.1.0/dataxplan/_version.py +1 -0
- dataxplan-0.1.0/dataxplan/cli.py +66 -0
- dataxplan-0.1.0/dataxplan/compare.py +115 -0
- dataxplan-0.1.0/dataxplan/context.py +62 -0
- dataxplan-0.1.0/dataxplan/findings.py +164 -0
- dataxplan-0.1.0/dataxplan/metrics.py +104 -0
- dataxplan-0.1.0/dataxplan/parse.py +195 -0
- dataxplan-0.1.0/dataxplan/py.typed +0 -0
- dataxplan-0.1.0/dataxplan/render.py +83 -0
- dataxplan-0.1.0/dataxplan/report.py +112 -0
- dataxplan-0.1.0/dataxplan/run.py +43 -0
- dataxplan-0.1.0/dataxplan.egg-info/PKG-INFO +210 -0
- dataxplan-0.1.0/dataxplan.egg-info/SOURCES.txt +33 -0
- dataxplan-0.1.0/dataxplan.egg-info/dependency_links.txt +1 -0
- dataxplan-0.1.0/dataxplan.egg-info/entry_points.txt +2 -0
- dataxplan-0.1.0/dataxplan.egg-info/requires.txt +9 -0
- dataxplan-0.1.0/dataxplan.egg-info/top_level.txt +1 -0
- dataxplan-0.1.0/pyproject.toml +53 -0
- dataxplan-0.1.0/setup.cfg +4 -0
- dataxplan-0.1.0/tests/test_cli.py +48 -0
- dataxplan-0.1.0/tests/test_compare.py +36 -0
- dataxplan-0.1.0/tests/test_context.py +25 -0
- dataxplan-0.1.0/tests/test_examples.py +28 -0
- dataxplan-0.1.0/tests/test_findings.py +39 -0
- dataxplan-0.1.0/tests/test_metrics.py +47 -0
- dataxplan-0.1.0/tests/test_parse.py +46 -0
- dataxplan-0.1.0/tests/test_render.py +25 -0
- dataxplan-0.1.0/tests/test_report.py +49 -0
- dataxplan-0.1.0/tests/test_run.py +58 -0
dataxplan-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Atakan Arikan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dataxplan-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataxplan
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Read PostgreSQL EXPLAIN plans locally: parse the plan, compute self time and estimation error, flag documented problems, compare plans, and guard them in CI. No database connection, deterministic.
|
|
5
|
+
Author: Atakan Arikan
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/arikanatakan/dataxplan
|
|
8
|
+
Project-URL: Repository, https://github.com/arikanatakan/dataxplan
|
|
9
|
+
Project-URL: Issues, https://github.com/arikanatakan/dataxplan/issues
|
|
10
|
+
Keywords: postgresql,postgres,explain,query-plan,performance,query-optimization,database,sql,explain-analyze,dba
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Database
|
|
20
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Provides-Extra: viz
|
|
25
|
+
Requires-Dist: matplotlib>=3.5; extra == "viz"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff; extra == "dev"
|
|
29
|
+
Requires-Dist: build; extra == "dev"
|
|
30
|
+
Requires-Dist: matplotlib>=3.5; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# dataxplan
|
|
34
|
+
|
|
35
|
+
[CI](https://github.com/arikanatakan/dataxplan/actions/workflows/ci.yml)
|
|
36
|
+
[PyPI](https://pypi.org/project/dataxplan/)
|
|
37
|
+
[License: MIT](LICENSE)
|
|
38
|
+
|
|
39
|
+
Read PostgreSQL `EXPLAIN` plans from Python: parse the plan, compute the numbers
|
|
40
|
+
people misread (self time and estimation error), flag documented problems,
|
|
41
|
+
compare plans, and guard them in CI. **No database connection, nothing leaves
|
|
42
|
+
your machine, deterministic output.**
|
|
43
|
+
|
|
44
|
+
You give it the output of `EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ...`; it does
|
|
45
|
+
the rest locally.
|
|
46
|
+
|
|
47
|
+

|
|
48
|
+
|
|
49
|
+
It turns a plan into a deterministic read (here a query whose join was estimated
|
|
50
|
+
at 5 rows but produced 500,000):
|
|
51
|
+
|
|
52
|
+
```text
|
|
53
|
+
dataxplan
|
|
54
|
+
execution time 1,505.00 ms (planning 0.30 ms)
|
|
55
|
+
nodes 3, depth 1
|
|
56
|
+
worst row estimate 100000x off
|
|
57
|
+
top by self time:
|
|
58
|
+
Index Scan on b 1,000.00 ms (66%)
|
|
59
|
+
Nested Loop 450.00 ms (30%)
|
|
60
|
+
findings:
|
|
61
|
+
[HIGH] Row estimate is far off (Nested Loop)
|
|
62
|
+
estimated 5 rows, actual 500,000 (100000x under-estimate)
|
|
63
|
+
-> run ANALYZE; if the columns are correlated consider extended statistics
|
|
64
|
+
[MEDIUM] Nested loop with many iterations (Nested Loop)
|
|
65
|
+
the inner side executed 500,000 times
|
|
66
|
+
-> usually an under-estimate upstream; a hash or merge join may be cheaper
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Why
|
|
70
|
+
|
|
71
|
+
Reading a plan by hand is error-prone (reported times are per-loop and inclusive of
|
|
72
|
+
children, so the slow node is rarely the obvious one). The good tools that do
|
|
73
|
+
this are web pastebins (your production plan leaves your machine) or commercial
|
|
74
|
+
SaaS. dataxplan is local, free, programmatic and embeddable: run it in a script,
|
|
75
|
+
a notebook, your CI, or later an MCP server, and keep the plan in your own
|
|
76
|
+
environment.
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install dataxplan
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
No runtime dependencies. The chart is optional (`pip install "dataxplan[viz]"`).
|
|
83
|
+
|
|
84
|
+
## Quick start
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
import dataxplan
|
|
88
|
+
|
|
89
|
+
report = dataxplan.analyze(explain_json) # the EXPLAIN (FORMAT JSON) output
|
|
90
|
+
print(report.summary()) # the summary shown above
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### From the command line
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
dataxplan plan.json # summary
|
|
97
|
+
dataxplan plan.json --tree # also the annotated plan tree
|
|
98
|
+
dataxplan plan.json --json # the full report as JSON
|
|
99
|
+
dataxplan before.json --compare after.json
|
|
100
|
+
psql -XqAt -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) <query>" | dataxplan
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Guard a plan in CI
|
|
104
|
+
|
|
105
|
+
Pin a critical query's plan in your test suite, so a code or schema change that
|
|
106
|
+
makes it regress fails the build. Nothing else in Python does this.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
def test_orders_lookup_stays_fast():
|
|
110
|
+
report = dataxplan.analyze(get_explain("SELECT * FROM orders WHERE customer_id = %s"))
|
|
111
|
+
assert not report.has_seq_scan_on("orders")
|
|
112
|
+
assert report.max_estimation_error < 100
|
|
113
|
+
assert not report.spilled_to_disk
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Compare two plans (before / after an index)
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
print(dataxplan.compare(before_json, after_json).summary())
|
|
120
|
+
# dataxplan compare - IMPROVED
|
|
121
|
+
# execution time 905.00 ms -> 0.08 ms (-100%)
|
|
122
|
+
# resolved filter_discard, seq_scan_hot
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Sharper findings with catalog context (optional)
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from dataxplan import Context, TableInfo
|
|
129
|
+
ctx = Context(tables={"orders": TableInfo("orders", row_count=10_000_000,
|
|
130
|
+
indexed_columns=("id",))})
|
|
131
|
+
dataxplan.analyze(explain_json, context=ctx)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Fetch a plan from a connection you already have (optional)
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
plan = dataxplan.run_explain(conn, "SELECT * FROM orders WHERE id = %s", params=(42,))
|
|
138
|
+
dataxplan.analyze(plan)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
`run_explain` calls `cursor.execute` on a DB-API connection you pass (psycopg,
|
|
142
|
+
psycopg2, ...); dataxplan does not depend on any driver. With `analyze=True` it
|
|
143
|
+
runs the query, so use `analyze=False` for a plan-only estimate.
|
|
144
|
+
|
|
145
|
+
## What it covers
|
|
146
|
+
|
|
147
|
+
| Area | What you get |
|
|
148
|
+
| --- | --- |
|
|
149
|
+
| Parse | `parse` -> a typed `Plan` / `PlanNode` tree from EXPLAIN (FORMAT JSON) |
|
|
150
|
+
| Metrics | self (exclusive) time, % of total, estimation error, disk spills, buffers |
|
|
151
|
+
| Findings | hot sequential scans, large row mis-estimates, disk spills, filter discards, nested-loop blow-ups, index-only heap fetches, lossy bitmaps, JIT overhead |
|
|
152
|
+
| Report | `summary`, `to_dict`, and an assertion API (`has_seq_scan_on`, `max_estimation_error`, `spilled_to_disk`, `ok`) for CI |
|
|
153
|
+
| Compare | `compare` two plans for regression (timing, shape, estimates, findings) |
|
|
154
|
+
| Render | `text_tree` (annotated, dependency-free) and `plan_tree_chart` (needs matplotlib) |
|
|
155
|
+
| Context | optional catalog metadata (sizes, indexes, stale stats) that sharpens findings |
|
|
156
|
+
| CLI | `dataxplan plan.json` (or stdin): summary, `--tree`, `--json`, `--compare` |
|
|
157
|
+
|
|
158
|
+
## Examples
|
|
159
|
+
|
|
160
|
+
Four plans from public datasets and benchmarks, each showing a different problem,
|
|
161
|
+
are in [`examples/`](examples/): the IMDB / Join Order Benchmark (a row
|
|
162
|
+
mis-estimate), the NYC TLC taxi trips (a sort that spills to disk), TPC-H
|
|
163
|
+
`lineitem` (a hot scan discarding most rows), and the Bosch Production Line
|
|
164
|
+
Performance manufacturing data set (a hash join with a large mis-estimate). For
|
|
165
|
+
instance:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
dataxplan examples/job_imdb_misestimate.json
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## What is out of scope
|
|
172
|
+
|
|
173
|
+
dataxplan analyses the **plan you give it**. By default it does not connect to a
|
|
174
|
+
database, run your queries, or read your schema, so a finding is a **documented
|
|
175
|
+
heuristic, not a guarantee**, and the suggestions are based on the plan alone. It
|
|
176
|
+
does not rewrite SQL or invent a cost model. It targets PostgreSQL
|
|
177
|
+
`FORMAT JSON` output (MySQL may follow).
|
|
178
|
+
|
|
179
|
+
## How the headline metrics work
|
|
180
|
+
|
|
181
|
+
- **Self time.** Postgres reports `Actual Total Time` per loop and inclusive of
|
|
182
|
+
children, so a node's total is `Actual Total Time x Actual Loops`, and its
|
|
183
|
+
self time is that minus the children's totals. Self time is where the work
|
|
184
|
+
really happens.
|
|
185
|
+
- **Estimation error.** `Plan Rows` against `Actual Rows` (per loop); a large
|
|
186
|
+
ratio is the usual root cause of a bad plan.
|
|
187
|
+
|
|
188
|
+
## References and validation
|
|
189
|
+
|
|
190
|
+
The metric arithmetic is verified by hand against the semantics PostgreSQL
|
|
191
|
+
documents (see [`tests/`](tests/)), and the heuristics are grounded in primary
|
|
192
|
+
and academic sources, written in our own words:
|
|
193
|
+
|
|
194
|
+
- [PostgreSQL documentation: EXPLAIN](https://www.postgresql.org/docs/current/sql-explain.html)
|
|
195
|
+
- [PostgreSQL wiki: Using EXPLAIN](https://wiki.postgresql.org/wiki/Using_EXPLAIN)
|
|
196
|
+
- V. Leis, A. Gubichev, A. Mirchev, P. Boncz, A. Kemper, T. Neumann, "How Good
|
|
197
|
+
Are Query Optimizers, Really?", Proceedings of the VLDB Endowment 9(3), 2015 -
|
|
198
|
+
the study of cardinality mis-estimation behind the Join Order Benchmark, which
|
|
199
|
+
the `estimate_off` finding detects (see the JOB example in
|
|
200
|
+
[`examples/`](examples/)).
|
|
201
|
+
|
|
202
|
+
Plan analysis has no single numeric ground truth the way a closed-form formula
|
|
203
|
+
does, so the claim here is deliberately narrow: the parsing and arithmetic are
|
|
204
|
+
correct against the documented format, and each heuristic cites the behaviour it
|
|
205
|
+
relies on.
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
MIT. Written and maintained by [Atakan Arikan](https://github.com/arikanatakan),
|
|
210
|
+
MSc Student at Tsinghua University and Politecnico di Milano.
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# dataxplan
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/arikanatakan/dataxplan/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI](https://pypi.org/project/dataxplan/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
|
|
7
|
+
Read PostgreSQL `EXPLAIN` plans from Python: parse the plan, compute the numbers
|
|
8
|
+
people misread (self time and estimation error), flag documented problems,
|
|
9
|
+
compare plans, and guard them in CI. **No database connection, nothing leaves
|
|
10
|
+
your machine, deterministic output.**
|
|
11
|
+
|
|
12
|
+
You give it the output of `EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ...`; it does
|
|
13
|
+
the rest locally.
|
|
14
|
+
|
|
15
|
+

|
|
16
|
+
|
|
17
|
+
It turns a plan into a deterministic read (here a query whose join was estimated
|
|
18
|
+
at 5 rows but produced 500,000):
|
|
19
|
+
|
|
20
|
+
```text
|
|
21
|
+
dataxplan
|
|
22
|
+
execution time 1,505.00 ms (planning 0.30 ms)
|
|
23
|
+
nodes 3, depth 1
|
|
24
|
+
worst row estimate 100000x off
|
|
25
|
+
top by self time:
|
|
26
|
+
Index Scan on b 1,000.00 ms (66%)
|
|
27
|
+
Nested Loop 450.00 ms (30%)
|
|
28
|
+
findings:
|
|
29
|
+
[HIGH] Row estimate is far off (Nested Loop)
|
|
30
|
+
estimated 5 rows, actual 500,000 (100000x under-estimate)
|
|
31
|
+
-> run ANALYZE; if the columns are correlated consider extended statistics
|
|
32
|
+
[MEDIUM] Nested loop with many iterations (Nested Loop)
|
|
33
|
+
the inner side executed 500,000 times
|
|
34
|
+
-> usually an under-estimate upstream; a hash or merge join may be cheaper
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Why
|
|
38
|
+
|
|
39
|
+
Reading a plan by hand is error-prone (reported times are per-loop and inclusive of
|
|
40
|
+
children, so the slow node is rarely the obvious one). The good tools that do
|
|
41
|
+
this are web pastebins (your production plan leaves your machine) or commercial
|
|
42
|
+
SaaS. dataxplan is local, free, programmatic and embeddable: run it in a script,
|
|
43
|
+
a notebook, your CI, or later an MCP server, and keep the plan in your own
|
|
44
|
+
environment.
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install dataxplan
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
No runtime dependencies. The chart is optional (`pip install "dataxplan[viz]"`).
|
|
51
|
+
|
|
52
|
+
## Quick start
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import dataxplan
|
|
56
|
+
|
|
57
|
+
report = dataxplan.analyze(explain_json) # the EXPLAIN (FORMAT JSON) output
|
|
58
|
+
print(report.summary()) # the summary shown above
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### From the command line
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
dataxplan plan.json # summary
|
|
65
|
+
dataxplan plan.json --tree # also the annotated plan tree
|
|
66
|
+
dataxplan plan.json --json # the full report as JSON
|
|
67
|
+
dataxplan before.json --compare after.json
|
|
68
|
+
psql -XqAt -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) <query>" | dataxplan
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Guard a plan in CI
|
|
72
|
+
|
|
73
|
+
Pin a critical query's plan in your test suite, so a code or schema change that
|
|
74
|
+
makes it regress fails the build. Nothing else in Python does this.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
def test_orders_lookup_stays_fast():
|
|
78
|
+
report = dataxplan.analyze(get_explain("SELECT * FROM orders WHERE customer_id = %s"))
|
|
79
|
+
assert not report.has_seq_scan_on("orders")
|
|
80
|
+
assert report.max_estimation_error < 100
|
|
81
|
+
assert not report.spilled_to_disk
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Compare two plans (before / after an index)
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
print(dataxplan.compare(before_json, after_json).summary())
|
|
88
|
+
# dataxplan compare - IMPROVED
|
|
89
|
+
# execution time 905.00 ms -> 0.08 ms (-100%)
|
|
90
|
+
# resolved filter_discard, seq_scan_hot
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Sharper findings with catalog context (optional)
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from dataxplan import Context, TableInfo
|
|
97
|
+
ctx = Context(tables={"orders": TableInfo("orders", row_count=10_000_000,
|
|
98
|
+
indexed_columns=("id",))})
|
|
99
|
+
dataxplan.analyze(explain_json, context=ctx)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Fetch a plan from a connection you already have (optional)
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
plan = dataxplan.run_explain(conn, "SELECT * FROM orders WHERE id = %s", params=(42,))
|
|
106
|
+
dataxplan.analyze(plan)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
`run_explain` calls `cursor.execute` on a DB-API connection you pass (psycopg,
|
|
110
|
+
psycopg2, ...); dataxplan does not depend on any driver. With `analyze=True` it
|
|
111
|
+
runs the query, so use `analyze=False` for a plan-only estimate.
|
|
112
|
+
|
|
113
|
+
## What it covers
|
|
114
|
+
|
|
115
|
+
| Area | What you get |
|
|
116
|
+
| --- | --- |
|
|
117
|
+
| Parse | `parse` -> a typed `Plan` / `PlanNode` tree from EXPLAIN (FORMAT JSON) |
|
|
118
|
+
| Metrics | self (exclusive) time, % of total, estimation error, disk spills, buffers |
|
|
119
|
+
| Findings | hot sequential scans, large row mis-estimates, disk spills, filter discards, nested-loop blow-ups, index-only heap fetches, lossy bitmaps, JIT overhead |
|
|
120
|
+
| Report | `summary`, `to_dict`, and an assertion API (`has_seq_scan_on`, `max_estimation_error`, `spilled_to_disk`, `ok`) for CI |
|
|
121
|
+
| Compare | `compare` two plans for regression (timing, shape, estimates, findings) |
|
|
122
|
+
| Render | `text_tree` (annotated, dependency-free) and `plan_tree_chart` (needs matplotlib) |
|
|
123
|
+
| Context | optional catalog metadata (sizes, indexes, stale stats) that sharpens findings |
|
|
124
|
+
| CLI | `dataxplan plan.json` (or stdin): summary, `--tree`, `--json`, `--compare` |
|
|
125
|
+
|
|
126
|
+
## Examples
|
|
127
|
+
|
|
128
|
+
Four plans from public datasets and benchmarks, each showing a different problem,
|
|
129
|
+
are in [`examples/`](examples/): the IMDB / Join Order Benchmark (a row
|
|
130
|
+
mis-estimate), the NYC TLC taxi trips (a sort that spills to disk), TPC-H
|
|
131
|
+
`lineitem` (a hot scan discarding most rows), and the Bosch Production Line
|
|
132
|
+
Performance manufacturing data set (a hash join with a large mis-estimate). For
|
|
133
|
+
instance:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
dataxplan examples/job_imdb_misestimate.json
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## What is out of scope
|
|
140
|
+
|
|
141
|
+
dataxplan analyses the **plan you give it**. By default it does not connect to a
|
|
142
|
+
database, run your queries, or read your schema, so a finding is a **documented
|
|
143
|
+
heuristic, not a guarantee**, and the suggestions are based on the plan alone. It
|
|
144
|
+
does not rewrite SQL or invent a cost model. It targets PostgreSQL
|
|
145
|
+
`FORMAT JSON` output (MySQL may follow).
|
|
146
|
+
|
|
147
|
+
## How the headline metrics work
|
|
148
|
+
|
|
149
|
+
- **Self time.** Postgres reports `Actual Total Time` per loop and inclusive of
|
|
150
|
+
children, so a node's total is `Actual Total Time x Actual Loops`, and its
|
|
151
|
+
self time is that minus the children's totals. Self time is where the work
|
|
152
|
+
really happens.
|
|
153
|
+
- **Estimation error.** `Plan Rows` against `Actual Rows` (per loop); a large
|
|
154
|
+
ratio is the usual root cause of a bad plan.
|
|
155
|
+
|
|
156
|
+
## References and validation
|
|
157
|
+
|
|
158
|
+
The metric arithmetic is verified by hand against the semantics PostgreSQL
|
|
159
|
+
documents (see [`tests/`](tests/)), and the heuristics are grounded in primary
|
|
160
|
+
and academic sources, written in our own words:
|
|
161
|
+
|
|
162
|
+
- [PostgreSQL documentation: EXPLAIN](https://www.postgresql.org/docs/current/sql-explain.html)
|
|
163
|
+
- [PostgreSQL wiki: Using EXPLAIN](https://wiki.postgresql.org/wiki/Using_EXPLAIN)
|
|
164
|
+
- V. Leis, A. Gubichev, A. Mirchev, P. Boncz, A. Kemper, T. Neumann, "How Good
|
|
165
|
+
Are Query Optimizers, Really?", Proceedings of the VLDB Endowment 9(3), 2015 -
|
|
166
|
+
the study of cardinality mis-estimation behind the Join Order Benchmark, which
|
|
167
|
+
the `estimate_off` finding detects (see the JOB example in
|
|
168
|
+
[`examples/`](examples/)).
|
|
169
|
+
|
|
170
|
+
Plan analysis has no single numeric ground truth the way a closed-form formula
|
|
171
|
+
does, so the claim here is deliberately narrow: the parsing and arithmetic are
|
|
172
|
+
correct against the documented format, and each heuristic cites the behaviour it
|
|
173
|
+
relies on.
|
|
174
|
+
|
|
175
|
+
## License
|
|
176
|
+
|
|
177
|
+
MIT. Written and maintained by [Atakan Arikan](https://github.com/arikanatakan),
|
|
178
|
+
MSc Student at Tsinghua University and Politecnico di Milano.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""dataxplan - read PostgreSQL EXPLAIN plans, locally and deterministically.
|
|
2
|
+
|
|
3
|
+
Give it the output of ``EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ...`` and it
|
|
4
|
+
parses the plan, computes the metrics people misread (self time, estimation
|
|
5
|
+
error, disk spills), and flags documented problems. No database connection is
|
|
6
|
+
required and nothing leaves your machine.
|
|
7
|
+
|
|
8
|
+
import dataxplan
|
|
9
|
+
|
|
10
|
+
report = dataxplan.analyze(explain_json)
|
|
11
|
+
print(report.summary())
|
|
12
|
+
|
|
13
|
+
# guard a plan in a test (fail CI if it regresses)
|
|
14
|
+
assert not report.has_seq_scan_on("orders")
|
|
15
|
+
assert report.max_estimation_error < 100
|
|
16
|
+
assert not report.spilled_to_disk
|
|
17
|
+
|
|
18
|
+
# compare two plans (before/after an index)
|
|
19
|
+
print(dataxplan.compare(before_json, after_json).summary())
|
|
20
|
+
|
|
21
|
+
The findings are documented heuristics, not guarantees, and the analysis is of
|
|
22
|
+
the plan you provide; it does not run your queries or read your schema unless you
|
|
23
|
+
choose to supply catalog context.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from ._result import Finding
|
|
27
|
+
from ._version import __version__
|
|
28
|
+
from .compare import Comparison, compare
|
|
29
|
+
from .context import Context, TableInfo
|
|
30
|
+
from .metrics import NodeMetrics
|
|
31
|
+
from .parse import Plan, PlanNode, parse
|
|
32
|
+
from .render import plan_tree_chart, text_tree
|
|
33
|
+
from .report import Report, analyze
|
|
34
|
+
from .run import run_explain
|
|
35
|
+
|
|
36
|
+
# Names re-exported as the package's public API, one per line.
__all__ = [
    # core flow
    "parse",
    "analyze",
    "compare",
    # types
    "Plan",
    "PlanNode",
    "Report",
    "NodeMetrics",
    "Finding",
    "Comparison",
    "Context",
    "TableInfo",
    # render and helpers
    "text_tree",
    "plan_tree_chart",
    "run_explain",
    "__version__",
]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Shared plumbing: provenance (version, input hash, timestamp), the ``Finding``
|
|
2
|
+
type, and small formatting helpers. Every public result carries a meta block so
|
|
3
|
+
an analysis can be reproduced and audited later.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import json
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
|
|
13
|
+
from ._version import __version__
|
|
14
|
+
|
|
15
|
+
SCHEMA = 1

# Finding severities, most to least serious.
HIGH, MEDIUM, LOW, INFO = "high", "medium", "low", "info"

# Sort rank for each severity: the smaller the number, the more serious.
_ORDER = {
    level: position
    for position, level in enumerate((HIGH, MEDIUM, LOW, INFO))
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def utcnow() -> str:
    """Return the current UTC time as an ISO 8601 string with whole-second precision."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat(timespec="seconds")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def data_hash(obj: object) -> str:
    """Return a short, stable fingerprint of ``obj``.

    The object is serialised to JSON with sorted keys (values JSON cannot
    encode are passed through ``str``), hashed with SHA-256, and the first
    16 hex characters of the digest are returned behind a ``sha256:`` prefix.
    """
    canonical = json.dumps(obj, sort_keys=True, default=str)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return f"sha256:{digest[:16]}"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def make_meta(inputs: dict) -> dict:
    """Build the provenance block stamped onto every result.

    The block records the library name and version, the UTC time of the
    computation, and a fingerprint of ``inputs``.
    """
    meta = {"library": "dataxplan", "version": __version__}
    meta["computed_at"] = utcnow()
    meta["input_hash"] = data_hash(inputs)
    return meta
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True)
class Finding:
    """One observation about a plan: a documented heuristic, not a guarantee.

    Instances are immutable. ``str(finding)`` gives a short multi-line
    rendering; ``to_dict()`` gives a JSON-friendly mapping.
    """

    id: str
    severity: str  # high | medium | low | info
    title: str
    detail: str
    node: str | None = None  # e.g. "Seq Scan on orders"
    # Location of the node in the plan tree; presumably child indexes from
    # the root - confirm against the parser. None means "not tied to a node".
    path: tuple[int, ...] | None = None
    suggestion: str | None = None

    @property
    def rank(self) -> int:
        """Sort key for the severity: lower is more serious, unknown values get 9."""
        return _ORDER.get(self.severity, 9)

    def __str__(self) -> str:
        """Render the heading, the detail and, when present, the suggestion."""
        head = f"[{self.severity.upper()}] {self.title}"
        if self.node:
            head += f" ({self.node})"
        lines = [head, f" {self.detail}"]
        if self.suggestion:
            lines.append(f" -> {self.suggestion}")
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Return the finding as a plain dict with ``path`` converted to a list.

        Only a missing path (``None``) maps to ``None``. An empty tuple is a
        real path and is kept as ``[]`` rather than being collapsed to
        ``None`` by a truthiness test.
        """
        return {
            "id": self.id,
            "severity": self.severity,
            "title": self.title,
            "detail": self.detail,
            "node": self.node,
            "path": list(self.path) if self.path is not None else None,
            "suggestion": self.suggestion,
        }
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def ms(value: float | None) -> str:
    """Format a millisecond value with thousands separators and two decimals.

    A missing value (``None``) is rendered as ``"-"``.
    """
    if value is None:
        return "-"
    return f"{value:,.2f} ms"
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def num(value: float | None, places: int = 0) -> str:
    """Format a number with thousands separators and ``places`` decimals.

    A missing value (``None``) is rendered as ``"-"``.
    """
    if value is None:
        return "-"
    return format(value, f",.{places}f")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Command-line interface: analyse a plan from a file or stdin.
|
|
2
|
+
|
|
3
|
+
dataxplan plan.json
|
|
4
|
+
dataxplan plan.json --tree
|
|
5
|
+
dataxplan plan.json --json
|
|
6
|
+
dataxplan before.json --compare after.json
|
|
7
|
+
psql -XqAt -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) ..." | dataxplan
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
from . import analyze, compare, text_tree
|
|
17
|
+
from ._version import __version__
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _read(source: str) -> str:
    """Return the text of ``source``: a file path, or stdin for ``None`` / ``"-"``.

    Files are decoded as UTF-8.
    """
    reads_stdin = source is None or source == "-"
    if not reads_stdin:
        with open(source, encoding="utf-8") as stream:
            return stream.read()
    return sys.stdin.read()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def main(argv=None) -> int:
    """Run the command-line interface and return the process exit status.

    ``argv`` is the argument list to parse; ``None`` lets argparse use the
    process arguments. Returns 0 on success, or 2 after printing a
    ``dataxplan: ...`` message to stderr when reading, decoding, or
    analysing the input raises ``ValueError``, ``TypeError`` or ``OSError``.
    """
    cli = argparse.ArgumentParser(
        prog="dataxplan",
        description="Analyse a PostgreSQL EXPLAIN (FORMAT JSON) plan, locally.",
    )
    cli.add_argument(
        "plan", nargs="?", default="-",
        help="plan file, or - for stdin (the default)",
    )
    cli.add_argument(
        "--tree", action="store_true",
        help="also print the annotated plan tree",
    )
    cli.add_argument(
        "--json", action="store_true",
        help="print the full report (or comparison) as JSON",
    )
    cli.add_argument(
        "--compare", metavar="OTHER",
        help="compare the plan against another plan file",
    )
    cli.add_argument(
        "--version", action="version",
        version=f"dataxplan {__version__}",
    )
    options = cli.parse_args(argv)

    try:
        primary = json.loads(_read(options.plan))
        if options.compare:
            # Comparison mode prints its own output and finishes here;
            # --tree has no effect in this mode.
            outcome = compare(primary, json.loads(_read(options.compare)))
            if options.json:
                rendered = json.dumps(outcome.to_dict(), indent=2, default=str)
            else:
                rendered = outcome.summary()
            print(rendered)
            return 0
        report = analyze(primary)
    except (ValueError, TypeError, OSError, json.JSONDecodeError) as exc:
        print(f"dataxplan: {exc}", file=sys.stderr)
        return 2

    if options.json:
        # JSON output stays machine-readable: the tree is never appended.
        print(json.dumps(report.to_dict(), indent=2, default=str))
        return 0

    print(report.summary())
    if options.tree:
        print("\nplan tree:")
        print(text_tree(report))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|