sql-decomposer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sql_decomposer-0.1.0/LICENSE +21 -0
- sql_decomposer-0.1.0/PKG-INFO +116 -0
- sql_decomposer-0.1.0/README.md +91 -0
- sql_decomposer-0.1.0/pyproject.toml +46 -0
- sql_decomposer-0.1.0/setup.cfg +4 -0
- sql_decomposer-0.1.0/sql_decomposer/__init__.py +4 -0
- sql_decomposer-0.1.0/sql_decomposer/__main__.py +56 -0
- sql_decomposer-0.1.0/sql_decomposer/decomposer.py +156 -0
- sql_decomposer-0.1.0/sql_decomposer.egg-info/PKG-INFO +116 -0
- sql_decomposer-0.1.0/sql_decomposer.egg-info/SOURCES.txt +13 -0
- sql_decomposer-0.1.0/sql_decomposer.egg-info/dependency_links.txt +1 -0
- sql_decomposer-0.1.0/sql_decomposer.egg-info/entry_points.txt +2 -0
- sql_decomposer-0.1.0/sql_decomposer.egg-info/requires.txt +6 -0
- sql_decomposer-0.1.0/sql_decomposer.egg-info/top_level.txt +1 -0
- sql_decomposer-0.1.0/tests/test_package.py +82 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 sql_decomposer contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sql-decomposer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract repeated SQL subqueries into temporary tables using sqlglot.
|
|
5
|
+
Author: sql_decomposer contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: sql,sqlglot,decomposer,query-optimization
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Database
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: sqlglot>=23.0
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: build>=1.2.2; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
23
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# sql_decomposer
|
|
27
|
+
|
|
28
|
+
`sql_decomposer` extracts repeated SQL subqueries into temporary tables.
|
|
29
|
+
It can help reduce duplication in large analytical queries and produce a more readable SQL script.
|
|
30
|
+
|
|
31
|
+
## Features
|
|
32
|
+
|
|
33
|
+
- Parses SQL safely with `sqlglot`.
|
|
34
|
+
- Detects repeated `SELECT` subqueries.
|
|
35
|
+
- Rewrites repeated blocks to `SELECT * FROM <temp_table>`.
|
|
36
|
+
- Provides a Python API and a CLI.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
From source:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install .
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
From PyPI (after release):
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install sql-decomposer
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## CLI usage
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
python -m sql_decomposer input.sql output.sql --min-count 2 --temp-prefix __temp
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Or with console script:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
sql-decomposer input.sql output.sql
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Options:
|
|
65
|
+
|
|
66
|
+
- `--dialect`: optional sqlglot dialect (`postgres`, `mysql`, etc.)
|
|
67
|
+
- `--min-count`: minimum repetition count to extract (default: `2`)
|
|
68
|
+
- `--temp-prefix`: generated temp table prefix (default: `__temp`)
|
|
69
|
+
|
|
70
|
+
## Python API
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from sql_decomposer import decompose_sql
|
|
74
|
+
|
|
75
|
+
sql = "SELECT * FROM (SELECT id FROM users) t1 JOIN (SELECT id FROM users) t2 ON t1.id=t2.id"
|
|
76
|
+
result = decompose_sql(sql, min_count=2, temp_prefix="tmp")
|
|
77
|
+
print(result)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Development
|
|
81
|
+
|
|
82
|
+
Install dev dependencies:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install -e ".[dev]"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Run tests:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pytest
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Build artifacts:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
python -m build
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Validate package metadata:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
twine check dist/*
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## GitHub and PyPI release checklist
|
|
107
|
+
|
|
108
|
+
1. Create repository named `sql_decomposer` on GitHub.
|
|
109
|
+
2. Push code and enable Actions.
|
|
110
|
+
3. Create a PyPI project `sql-decomposer`.
|
|
111
|
+
4. Add `PYPI_API_TOKEN` as a GitHub Actions secret.
|
|
112
|
+
5. Tag a release (`v0.1.0`) to trigger publish workflow.
|
|
113
|
+
|
|
114
|
+
## License
|
|
115
|
+
|
|
116
|
+
MIT License. See `LICENSE`.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# sql_decomposer
|
|
2
|
+
|
|
3
|
+
`sql_decomposer` extracts repeated SQL subqueries into temporary tables.
|
|
4
|
+
It can help reduce duplication in large analytical queries and produce a more readable SQL script.
|
|
5
|
+
|
|
6
|
+
## Features
|
|
7
|
+
|
|
8
|
+
- Parses SQL safely with `sqlglot`.
|
|
9
|
+
- Detects repeated `SELECT` subqueries.
|
|
10
|
+
- Rewrites repeated blocks to `SELECT * FROM <temp_table>`.
|
|
11
|
+
- Provides a Python API and a CLI.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
From source:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
From PyPI (after release):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install sql-decomposer
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## CLI usage
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
python -m sql_decomposer input.sql output.sql --min-count 2 --temp-prefix __temp
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Or with console script:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
sql-decomposer input.sql output.sql
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Options:
|
|
40
|
+
|
|
41
|
+
- `--dialect`: optional sqlglot dialect (`postgres`, `mysql`, etc.)
|
|
42
|
+
- `--min-count`: minimum repetition count to extract (default: `2`)
|
|
43
|
+
- `--temp-prefix`: generated temp table prefix (default: `__temp`)
|
|
44
|
+
|
|
45
|
+
## Python API
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from sql_decomposer import decompose_sql
|
|
49
|
+
|
|
50
|
+
sql = "SELECT * FROM (SELECT id FROM users) t1 JOIN (SELECT id FROM users) t2 ON t1.id=t2.id"
|
|
51
|
+
result = decompose_sql(sql, min_count=2, temp_prefix="tmp")
|
|
52
|
+
print(result)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Development
|
|
56
|
+
|
|
57
|
+
Install dev dependencies:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install -e ".[dev]"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Run tests:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pytest
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Build artifacts:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
python -m build
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Validate package metadata:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
twine check dist/*
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## GitHub and PyPI release checklist
|
|
82
|
+
|
|
83
|
+
1. Create repository named `sql_decomposer` on GitHub.
|
|
84
|
+
2. Push code and enable Actions.
|
|
85
|
+
3. Create a PyPI project `sql-decomposer`.
|
|
86
|
+
4. Add `PYPI_API_TOKEN` as a GitHub Actions secret.
|
|
87
|
+
5. Tag a release (`v0.1.0`) to trigger publish workflow.
|
|
88
|
+
|
|
89
|
+
## License
|
|
90
|
+
|
|
91
|
+
MIT License. See `LICENSE`.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sql-decomposer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Extract repeated SQL subqueries into temporary tables using sqlglot."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "sql_decomposer contributors" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["sql", "sqlglot", "decomposer", "query-optimization"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Database",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"sqlglot>=23.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = [
|
|
32
|
+
"build>=1.2.2",
|
|
33
|
+
"pytest>=8.0",
|
|
34
|
+
"twine>=5.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
sql-decomposer = "sql_decomposer.__main__:main"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["."]
|
|
42
|
+
include = ["sql_decomposer*"]
|
|
43
|
+
|
|
44
|
+
[tool.pytest.ini_options]
|
|
45
|
+
testpaths = ["tests"]
|
|
46
|
+
python_files = ["test_package.py"]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from sql_decomposer.decomposer import decompose_sql
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``sql_decomposer`` CLI.

    Returns:
        An :class:`argparse.ArgumentParser` with two positional path
        arguments (input/output) and three optional flags
        (``--dialect``, ``--min-count``, ``--temp-prefix``).
    """
    cli = argparse.ArgumentParser(
        prog="sql_decomposer",
        description="Decompose repeated SQL subqueries and save result to file.",
    )
    # Positional arguments: where to read SQL from and where to write it.
    cli.add_argument("input", type=Path, help="Path to input .sql file")
    cli.add_argument("output", type=Path, help="Path to output .sql file")
    # Optional tuning knobs; defaults mirror decompose_sql()'s defaults.
    cli.add_argument(
        "--dialect",
        default=None,
        help='Optional sqlglot dialect (e.g. "postgres", "mysql")',
    )
    cli.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum repetition count for extraction (default: 2)",
    )
    cli.add_argument(
        "--temp-prefix",
        default="__temp",
        help='Prefix for generated temp tables (default: "__temp")',
    )
    return cli
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main() -> int:
    """CLI entry point: read input SQL, decompose it, write the result.

    Returns:
        Process exit code (0 on success). Invalid paths are reported via
        ``argparse``'s error mechanism, which exits with code 2.
    """
    arg_parser = build_parser()
    options = arg_parser.parse_args()

    # Validate the input path up front so the user gets a clean
    # argparse-style error instead of a traceback.
    if not options.input.exists():
        arg_parser.error(f"Input file does not exist: {options.input}")
    if not options.input.is_file():
        arg_parser.error(f"Input path is not a file: {options.input}")

    source_sql = options.input.read_text(encoding="utf-8")
    rewritten = decompose_sql(
        sql=source_sql,
        dialect=options.dialect,
        min_count=options.min_count,
        temp_prefix=options.temp_prefix,
    )

    # Create any missing parent directories so the write cannot fail on them.
    options.output.parent.mkdir(parents=True, exist_ok=True)
    options.output.write_text(rewritten, encoding="utf-8")
    return 0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Support direct execution (`python -m sql_decomposer` runs this module);
# propagate main()'s integer return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SQL Decomposer — extracts repeated subqueries from a SQL statement
|
|
3
|
+
into temporary tables, reducing duplication.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import sqlglot
|
|
7
|
+
from sqlglot import exp
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _collect_sql_frequencies(
    ast: exp.Expression, dialect: Optional[str]
) -> tuple[dict[str, int], set[str]]:
    """Count how often each AST node's SQL text occurs; record SELECT texts.

    Args:
        ast: Parsed sqlglot expression tree to scan.
        dialect: Optional sqlglot dialect used when rendering nodes.

    Returns:
        A pair ``(frequencies, select_texts)``: ``frequencies`` maps a
        node's rendered SQL string to its occurrence count, while
        ``select_texts`` holds the SQL strings that came from SELECT nodes.
    """
    frequencies: dict[str, int] = {}
    select_texts: set[str] = set()

    for visited in ast.walk():
        # Older sqlglot versions yield (node, parent, key) tuples from
        # walk(); newer ones yield nodes directly. Accept both shapes.
        current = visited[0] if isinstance(visited, tuple) else visited
        rendered = current.sql(dialect=dialect)
        if not rendered:
            continue

        frequencies[rendered] = 1 + frequencies.get(rendered, 0)
        if isinstance(current, exp.Select):
            select_texts.add(rendered)

    return frequencies, select_texts
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _build_temp_table_map(
|
|
32
|
+
sql_counter: dict[str, int],
|
|
33
|
+
select_sqls: set[str],
|
|
34
|
+
root_sql: str,
|
|
35
|
+
min_count: int,
|
|
36
|
+
temp_prefix: str,
|
|
37
|
+
) -> dict[str, str]:
|
|
38
|
+
"""
|
|
39
|
+
Build stable mapping: repeated SELECT SQL -> temp table name.
|
|
40
|
+
|
|
41
|
+
Ordering is deterministic:
|
|
42
|
+
1) higher repetition count first
|
|
43
|
+
2) longer SQL first (helps nested/repeated patterns)
|
|
44
|
+
"""
|
|
45
|
+
sorted_items = sorted(
|
|
46
|
+
sql_counter.items(),
|
|
47
|
+
key=lambda item: (item[1], len(item[0])),
|
|
48
|
+
reverse=True,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
ordered_candidates = [
|
|
52
|
+
sql_key
|
|
53
|
+
for sql_key, count in sorted_items
|
|
54
|
+
if count >= min_count and sql_key != root_sql and sql_key in select_sqls
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
temp_table_map: dict[str, str] = {}
|
|
58
|
+
working_sql = root_sql
|
|
59
|
+
|
|
60
|
+
# Keep legacy "longer-first replacement" semantics for candidate filtering:
|
|
61
|
+
# once a larger repeated SELECT is selected, nested candidates it contains
|
|
62
|
+
# may disappear and therefore should not be extracted separately.
|
|
63
|
+
for sql_key in ordered_candidates:
|
|
64
|
+
if sql_key not in working_sql:
|
|
65
|
+
continue
|
|
66
|
+
|
|
67
|
+
temp_name = f"{temp_prefix}_{len(temp_table_map) + 1}"
|
|
68
|
+
temp_table_map[sql_key] = temp_name
|
|
69
|
+
working_sql = working_sql.replace(sql_key, f"SELECT * FROM {temp_name}")
|
|
70
|
+
|
|
71
|
+
return temp_table_map
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _rewrite_selects_with_temp_tables(
    ast: exp.Expression, dialect: Optional[str], temp_table_map: dict[str, str]
) -> exp.Expression:
    """Swap each mapped SELECT node for ``SELECT * FROM <temp_table>``.

    Args:
        ast: Parsed expression tree to rewrite.
        dialect: Optional sqlglot dialect used to render nodes for lookup.
        temp_table_map: SELECT SQL text -> temp-table name.

    Returns:
        The rewritten tree (returned unchanged when the map is empty).
    """
    if not temp_table_map:
        # Nothing to extract; skip a pointless full-tree transform.
        return ast

    def _swap(candidate: exp.Expression) -> exp.Expression:
        if not isinstance(candidate, exp.Select):
            return candidate
        table = temp_table_map.get(candidate.sql(dialect=dialect))
        if table:
            return exp.select("*").from_(table)
        return candidate

    return ast.transform(_swap)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def decompose_sql(
    sql: str,
    dialect: Optional[str] = None,
    min_count: int = 2,
    temp_prefix: str = "__temp",
) -> str:
    """Extract repeated sub-SELECTs into CREATE TEMPORARY TABLE statements.

    The query is parsed with sqlglot, every node's rendered SQL text is
    counted, and each SELECT that occurs at least *min_count* times (and
    is not the whole query) is turned into a temporary table. Repeated
    occurrences in the main query are rewritten to
    ``SELECT * FROM <temp_table>``.

    Args:
        sql: Source SQL string (a single statement).
        dialect: sqlglot dialect name (e.g. "postgres", "mysql").
        min_count: Minimum number of occurrences for extraction (default 2).
        temp_prefix: Naming prefix for generated temp tables.

    Returns:
        A SQL script as a single string: zero or more CREATE TEMPORARY
        TABLE statements, newline-separated, followed by the final
        (rewritten) query.
    """
    # Parse once; everything below operates on this tree.
    tree = sqlglot.parse_one(sql, dialect=dialect)

    # Count how often each node's SQL text appears, tracking SELECT texts.
    frequencies, select_texts = _collect_sql_frequencies(tree, dialect)

    # Render the whole query so it can be excluded from extraction.
    full_sql = tree.sql(dialect=dialect)

    # Decide which repeated SELECTs become temp tables (deterministic:
    # higher count first, then longer SQL first).
    temp_tables = _build_temp_table_map(
        sql_counter=frequencies,
        select_sqls=select_texts,
        root_sql=full_sql,
        min_count=min_count,
        temp_prefix=temp_prefix,
    )

    # One CREATE statement per extracted SELECT, in extraction order.
    script = [
        f"CREATE TEMPORARY TABLE {name} AS {select_text}"
        for select_text, name in temp_tables.items()
    ]

    # Rewrite the tree so repeated SELECTs read from their temp tables,
    # then append the final query.
    rewritten_tree = _rewrite_selects_with_temp_tables(tree, dialect, temp_tables)
    script.append(rewritten_tree.sql(dialect=dialect))

    return "\n".join(script)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sql-decomposer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract repeated SQL subqueries into temporary tables using sqlglot.
|
|
5
|
+
Author: sql_decomposer contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: sql,sqlglot,decomposer,query-optimization
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Database
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: sqlglot>=23.0
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: build>=1.2.2; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
23
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# sql_decomposer
|
|
27
|
+
|
|
28
|
+
`sql_decomposer` extracts repeated SQL subqueries into temporary tables.
|
|
29
|
+
It can help reduce duplication in large analytical queries and produce a more readable SQL script.
|
|
30
|
+
|
|
31
|
+
## Features
|
|
32
|
+
|
|
33
|
+
- Parses SQL safely with `sqlglot`.
|
|
34
|
+
- Detects repeated `SELECT` subqueries.
|
|
35
|
+
- Rewrites repeated blocks to `SELECT * FROM <temp_table>`.
|
|
36
|
+
- Provides a Python API and a CLI.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
From source:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install .
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
From PyPI (after release):
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install sql-decomposer
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## CLI usage
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
python -m sql_decomposer input.sql output.sql --min-count 2 --temp-prefix __temp
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Or with console script:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
sql-decomposer input.sql output.sql
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Options:
|
|
65
|
+
|
|
66
|
+
- `--dialect`: optional sqlglot dialect (`postgres`, `mysql`, etc.)
|
|
67
|
+
- `--min-count`: minimum repetition count to extract (default: `2`)
|
|
68
|
+
- `--temp-prefix`: generated temp table prefix (default: `__temp`)
|
|
69
|
+
|
|
70
|
+
## Python API
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from sql_decomposer import decompose_sql
|
|
74
|
+
|
|
75
|
+
sql = "SELECT * FROM (SELECT id FROM users) t1 JOIN (SELECT id FROM users) t2 ON t1.id=t2.id"
|
|
76
|
+
result = decompose_sql(sql, min_count=2, temp_prefix="tmp")
|
|
77
|
+
print(result)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Development
|
|
81
|
+
|
|
82
|
+
Install dev dependencies:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install -e ".[dev]"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Run tests:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pytest
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Build artifacts:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
python -m build
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Validate package metadata:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
twine check dist/*
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## GitHub and PyPI release checklist
|
|
107
|
+
|
|
108
|
+
1. Create repository named `sql_decomposer` on GitHub.
|
|
109
|
+
2. Push code and enable Actions.
|
|
110
|
+
3. Create a PyPI project `sql-decomposer`.
|
|
111
|
+
4. Add `PYPI_API_TOKEN` as a GitHub Actions secret.
|
|
112
|
+
5. Tag a release (`v0.1.0`) to trigger publish workflow.
|
|
113
|
+
|
|
114
|
+
## License
|
|
115
|
+
|
|
116
|
+
MIT License. See `LICENSE`.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
sql_decomposer/__init__.py
|
|
5
|
+
sql_decomposer/__main__.py
|
|
6
|
+
sql_decomposer/decomposer.py
|
|
7
|
+
sql_decomposer.egg-info/PKG-INFO
|
|
8
|
+
sql_decomposer.egg-info/SOURCES.txt
|
|
9
|
+
sql_decomposer.egg-info/dependency_links.txt
|
|
10
|
+
sql_decomposer.egg-info/entry_points.txt
|
|
11
|
+
sql_decomposer.egg-info/requires.txt
|
|
12
|
+
sql_decomposer.egg-info/top_level.txt
|
|
13
|
+
tests/test_package.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sql_decomposer
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from sql_decomposer import decompose_sql
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Repository root directory; tests/ is assumed to live one level below it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _read(rel_path: str) -> str:
    """Return the stripped UTF-8 contents of *rel_path* under the repo root."""
    fixture_path = PROJECT_ROOT / rel_path
    return fixture_path.read_text(encoding="utf-8").strip()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Every fixture follows the same layout: the input lives in tests/<stem>.sql
# and its expected decomposition in decomposed_sql/<stem>.sql.
_FIXTURE_STEMS = [
    "sql_big_analytics_query",
    "sql_complex_hr_query",
    "sql_financial_report_four_repeats",
    "sql_flat_query_no_subqueries",
    "sql_min_count_one",
    "sql_nested_repeated_subqueries",
    "sql_no_repetition",
    "sql_simple_two_identical_subqueries",
    "sql_triple_repeated_subquery",
    "sql_two_different_repeated_subqueries",
]


@pytest.mark.parametrize(
    ("input_fixture", "expected_fixture"),
    [
        (f"tests/{stem}.sql", f"decomposed_sql/{stem}.sql")
        for stem in _FIXTURE_STEMS
    ],
)
def test_decompose_sql_matches_expected_fixtures(
    input_fixture: str, expected_fixture: str
) -> None:
    """Golden-file check: decompose each fixture and compare to its snapshot."""
    source_sql = _read(input_fixture)
    expected_output = _read(expected_fixture)
    assert decompose_sql(source_sql) == expected_output
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_decompose_sql_does_not_replace_string_literals() -> None:
    """A string literal that looks like a repeated subquery must stay intact."""
    sql = """
    SELECT
        'SELECT id FROM users WHERE active = 1' AS query_text,
        a.id
    FROM (
        SELECT id FROM users WHERE active = 1
    ) AS a
    JOIN (
        SELECT id FROM users WHERE active = 1
    ) AS b ON a.id = b.id
    """
    output = decompose_sql(sql)

    # The repeated real subquery is extracted into a temp table ...
    assert (
        "CREATE TEMPORARY TABLE __temp_1 AS SELECT id FROM users WHERE active = 1"
        in output
    )
    # ... while the identical-looking string literal survives untouched ...
    assert "'SELECT id FROM users WHERE active = 1' AS query_text" in output
    # ... and both derived tables now read from the temp table.
    assert "(SELECT * FROM __temp_1) AS a" in output
    assert "(SELECT * FROM __temp_1) AS b" in output
|