tesserakit-sql 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ out/
8
+ .DS_Store
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: tesserakit-sql
3
+ Version: 0.4.0
4
+ Summary: SQL job pack for Tessera: lint SQL files/migrations into a statement and table catalog.
5
+ Project-URL: Homepage, https://github.com/ShaileshRawat1403/tessera
6
+ Project-URL: Repository, https://github.com/ShaileshRawat1403/tessera
7
+ Project-URL: Issues, https://github.com/ShaileshRawat1403/tessera/issues
8
+ Author: Shailesh Rawat
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Requires-Python: >=3.10
14
+ Requires-Dist: pydantic>=2.7
15
+ Requires-Dist: rich>=13.7
16
+ Requires-Dist: tesserakit-core>=0.1.0
17
+ Requires-Dist: typer>=0.12
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8.0; extra == 'dev'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # tesserakit-sql
23
+
24
+ Lint SQL files and migrations into a statement and table catalog.
25
+
26
+ `tessera-sql` parses `.sql` files with lightweight heuristics (no database connection, no execution), builds a catalog of statements and declared tables, and flags high-signal migration-safety issues.
27
+
28
+ ## Lint SQL
29
+
30
+ ```bash
31
+ tessera sql lint --input migrations/ --output ./out/sql_pack
32
+ tessera sql lint --input schema.sql --output ./out/sql_pack
33
+ ```
34
+
35
+ Artifacts written:
36
+
37
+ ```text
38
+ statements.jsonl one SqlStatement per parsed statement (kind, target, flags)
39
+ tables.jsonl one SqlTable per CREATE TABLE (columns, primary-key flag)
40
+ index.md statement catalog
41
+ validation_report.md safety findings
42
+ coverage_report.md statement-kind distribution
43
+ tables.md table catalog with columns and PK status
44
+ ```
45
+
46
+ ## Lint rules
47
+
48
+ Query safety:
49
+
50
+ - `delete_without_where` (error) — `DELETE` with no `WHERE` removes every row
51
+ - `update_without_where` (warning) — `UPDATE` with no `WHERE` writes every row
52
+ - `select_star` (info) — `SELECT *` couples the query to column shape
53
+
54
+ Migration safety (the costly, easy-to-miss class):
55
+
56
+ - `add_not_null_without_default` (error) — `ALTER TABLE ... ADD COLUMN ... NOT NULL` with no `DEFAULT` rewrites the table and fails on existing rows
57
+ - `truncate_table` (warning) — `TRUNCATE` wipes all rows and is often non-transactional / irreversible
58
+ - `drop_column` (warning) — dropping a column is destructive and irreversible
59
+ - `rename_breaks_compatibility` (warning) — `RENAME` breaks code referencing the old name; prefer add-new + backfill + drop-old
60
+ - `drop_without_if_exists` (warning) — `DROP` without `IF EXISTS` fails if the object is absent
61
+ - `create_table_without_if_not_exists` (info) — non-idempotent if the migration re-runs
62
+
63
+ Schema:
64
+
65
+ - `table_without_primary_key` (warning) — a `CREATE TABLE` declares no `PRIMARY KEY`
66
+ - `no_statements` (info) — nothing parsed
67
+
68
+ ## Limitations (v0.1)
69
+
70
+ Parsing is heuristic: comments are stripped, statements are split on top-level
71
+ semicolons (quote-aware), and classification is keyword/regex based. It is tuned
72
+ for migration and schema files, not for validating arbitrary vendor SQL dialects.
@@ -0,0 +1,51 @@
1
+ # tesserakit-sql
2
+
3
+ Lint SQL files and migrations into a statement and table catalog.
4
+
5
+ `tessera-sql` parses `.sql` files with lightweight heuristics (no database connection, no execution), builds a catalog of statements and declared tables, and flags high-signal migration-safety issues.
6
+
7
+ ## Lint SQL
8
+
9
+ ```bash
10
+ tessera sql lint --input migrations/ --output ./out/sql_pack
11
+ tessera sql lint --input schema.sql --output ./out/sql_pack
12
+ ```
13
+
14
+ Artifacts written:
15
+
16
+ ```text
17
+ statements.jsonl one SqlStatement per parsed statement (kind, target, flags)
18
+ tables.jsonl one SqlTable per CREATE TABLE (columns, primary-key flag)
19
+ index.md statement catalog
20
+ validation_report.md safety findings
21
+ coverage_report.md statement-kind distribution
22
+ tables.md table catalog with columns and PK status
23
+ ```
24
+
25
+ ## Lint rules
26
+
27
+ Query safety:
28
+
29
+ - `delete_without_where` (error) — `DELETE` with no `WHERE` removes every row
30
+ - `update_without_where` (warning) — `UPDATE` with no `WHERE` writes every row
31
+ - `select_star` (info) — `SELECT *` couples the query to column shape
32
+
33
+ Migration safety (the costly, easy-to-miss class):
34
+
35
+ - `add_not_null_without_default` (error) — `ALTER TABLE ... ADD COLUMN ... NOT NULL` with no `DEFAULT` rewrites the table and fails on existing rows
36
+ - `truncate_table` (warning) — `TRUNCATE` wipes all rows and is often non-transactional / irreversible
37
+ - `drop_column` (warning) — dropping a column is destructive and irreversible
38
+ - `rename_breaks_compatibility` (warning) — `RENAME` breaks code referencing the old name; prefer add-new + backfill + drop-old
39
+ - `drop_without_if_exists` (warning) — `DROP` without `IF EXISTS` fails if the object is absent
40
+ - `create_table_without_if_not_exists` (info) — non-idempotent if the migration re-runs
41
+
42
+ Schema:
43
+
44
+ - `table_without_primary_key` (warning) — a `CREATE TABLE` declares no `PRIMARY KEY`
45
+ - `no_statements` (info) — nothing parsed
46
+
47
+ ## Limitations (v0.1)
48
+
49
+ Parsing is heuristic: comments are stripped, statements are split on top-level
50
+ semicolons (quote-aware), and classification is keyword/regex based. It is tuned
51
+ for migration and schema files, not for validating arbitrary vendor SQL dialects.
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "tesserakit-sql"
7
+ version = "0.4.0"
8
+ description = "SQL job pack for Tessera: lint SQL files/migrations into a statement and table catalog."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "Shailesh Rawat" }]
12
+ dependencies = [
13
+ "tesserakit-core>=0.1.0",
14
+ "typer>=0.12",
15
+ "rich>=13.7",
16
+ "pydantic>=2.7",
17
+ ]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Environment :: Console",
21
+ "Intended Audience :: Developers",
22
+ "Programming Language :: Python :: 3",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/ShaileshRawat1403/tessera"
27
+ Repository = "https://github.com/ShaileshRawat1403/tessera"
28
+ Issues = "https://github.com/ShaileshRawat1403/tessera/issues"
29
+
30
+ [project.optional-dependencies]
31
+ dev = ["pytest>=8.0"]
32
+
33
+ [project.entry-points."tessera.commands"]
34
+ sql = "tessera_sql.cli:register"
35
+
36
+ [project.entry-points."tessera.jobpacks"]
37
+ sql = "tessera_sql.pack:create_pack"
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/tessera_sql"]
@@ -0,0 +1,3 @@
1
+ """Tessera sql pack."""
2
+
3
# Must match the `version` field in pyproject.toml (the distribution is 0.4.0).
__version__ = "0.4.0"
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from tessera_core.models import RunContext
10
+
11
+ from tessera_sql.pack import SqlPack
12
+
13
# Shared console used for all terminal output of this command group.
console = Console()
sql_app = typer.Typer(help="Lint SQL files/migrations into a statement and table catalog.")


@sql_app.command("lint")
def lint_cmd(
    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="A .sql file or a directory of them."),
    output: Path = typer.Option(Path("sql_pack"), "--output", "-o", help="Output directory."),
) -> None:
    """Parse and lint SQL; emit statement/table catalogs and findings."""
    # NOTE: the docstring above doubles as the CLI help text, so keep it short.
    run_ctx = RunContext(job_name="sql", output_dir=output)
    produced = SqlPack().run(input_path=input, ctx=run_ctx, options={})

    # First table: one row per artifact the pack wrote.
    artifact_table = Table(title="SQL Pack Created")
    for heading in ("Artifact", "Path", "Kind"):
        artifact_table.add_column(heading)
    for item in produced:
        artifact_table.add_row(item.name, str(item.path), item.kind)
    console.print(artifact_table)

    # Second table: run id plus the counters read from the run metadata.
    summary = Table(title="Run Summary")
    for heading in ("Metric", "Value"):
        summary.add_column(heading)
    metadata = run_ctx.metadata
    summary_rows = (
        ("run_id", run_ctx.run_id),
        ("statements", str(metadata.get("record_count", 0))),
        ("findings", str(metadata.get("finding_count", 0))),
    )
    for metric, value in summary_rows:
        summary.add_row(metric, value)
    console.print(summary)
42
+
43
+
44
def register(root_app: typer.Typer) -> None:
    """Attach the SQL command group to ``root_app`` under the name ``sql``."""
    group_name = "sql"
    root_app.add_typer(sql_app, name=group_name)
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from tessera_core.artifacts import write_jsonl, write_markdown
8
+ from tessera_core.models import Artifact, RunContext, ValidationFinding
9
+
10
+ from tessera_sql.loader import load_sql_records
11
+ from tessera_sql.schema import SqlStatement, SqlTable
12
+ from tessera_sql.validator import validate_sql_records
13
+
14
+
15
def load_records(input_path: Path, options: dict[str, Any]) -> list[SqlStatement]:
    """Parse the SQL at ``input_path`` by delegating to ``load_sql_records``."""
    parsed = load_sql_records(input_path, options)
    return parsed
17
+
18
+
19
def validate_records(statements: list[SqlStatement], options: dict[str, Any]) -> list[ValidationFinding]:
    """Lint ``statements`` by delegating to ``validate_sql_records``."""
    findings = validate_sql_records(statements, options)
    return findings
21
+
22
+
23
def write_artifacts(statements: list[SqlStatement], ctx: RunContext, options: dict[str, Any]) -> list[Artifact]:
    """Write the two JSONL catalogs and four markdown reports into ``ctx.output_dir``.

    Tables are read from ``options["_tables"]``. Findings are taken from
    ``ctx.metadata["findings"]`` when that value is truthy; otherwise the
    statements are validated here.

    Returns one ``Artifact`` per file, in the order the files are written.
    """
    out_dir = ctx.output_dir
    out_dir.mkdir(parents=True, exist_ok=True)
    tables: list[SqlTable] = options.get("_tables", [])
    findings: list[ValidationFinding] = ctx.metadata.get("findings") or validate_records(statements, options)

    # (file name, artifact kind, lazy payload builder). The builders are lazy
    # so each payload is produced right before its file is written.
    outputs = (
        ("statements.jsonl", "jsonl", lambda: [s.model_dump() for s in statements]),
        ("tables.jsonl", "jsonl", lambda: [t.model_dump() for t in tables]),
        ("index.md", "markdown", lambda: _render_index(statements, tables, options)),
        ("validation_report.md", "markdown", lambda: _render_validation(statements, findings)),
        ("coverage_report.md", "markdown", lambda: _render_coverage(statements)),
        ("tables.md", "markdown", lambda: _render_tables(tables)),
    )
    writers = {"jsonl": write_jsonl, "markdown": write_markdown}

    written = []
    for filename, kind, build_payload in outputs:
        destination = out_dir / filename
        writers[kind](destination, build_payload())
        written.append((filename, destination, kind))

    return [Artifact(name=name, path=path, kind=kind) for name, path, kind in written]
50
+
51
+
52
def _render_index(statements: list[SqlStatement], tables: list[SqlTable], options: dict[str, Any]) -> str:
    """Render ``index.md``: summary counts followed by a statement table.

    The file count comes from ``options["_file_count"]`` (0 when absent).
    When there are no statements a placeholder line replaces the table.
    """
    header = [
        "# SQL Catalog",
        "",
        f"- Files: {options.get('_file_count', 0)}",
        f"- Statements: {len(statements)}",
        f"- Tables created: {len(tables)}",
        "",
    ]
    if statements:
        body = ["| Kind | Target | File:Line |", "|---|---|---|"]
        body.extend(
            f"| {s.kind} | {s.target or '-'} | `{s.file}:{s.lineno}` |"
            for s in statements
        )
    else:
        body = ["_No statements found._"]
    return "\n".join(header + body) + "\n"
66
+
67
+
68
def _render_validation(
    statements: list[SqlStatement],
    findings: list[ValidationFinding],
    max_findings: int = 200,
) -> str:
    """Render ``validation_report.md``: counts, severity breakdown, findings.

    Args:
        statements: Parsed statements; only their count is reported.
        findings: Findings to list; each needs ``severity``, ``code`` and
            ``message``.
        max_findings: Upper bound on the number of findings listed
            individually. Any remainder is summarised in one closing line so
            the truncation is visible to the reader.

    Returns:
        Markdown text that always ends with a single newline.
    """
    lines = [
        "# Validation Report",
        "",
        f"- Statements: {len(statements)}",
        f"- Findings: {len(findings)}",
        "",
    ]
    by_sev = Counter(f.severity for f in findings)
    lines += ["## Severity Breakdown", ""]
    lines.extend(f"- {sev}: {by_sev.get(sev, 0)}" for sev in ("error", "warning", "info"))
    lines.append("")
    if findings:
        lines += ["## Findings", ""]
        shown = findings[:max_findings]
        lines.extend(f"- **{f.severity.upper()}** `{f.code}`: {f.message}" for f in shown)
        hidden = len(findings) - len(shown)
        if hidden > 0:
            # Say so explicitly instead of silently dropping the tail.
            lines += ["", f"_{hidden} more finding(s) not shown._"]
        # Trailing empty element gives the same final newline as the
        # no-findings path.
        lines.append("")
    return "\n".join(lines)
85
+
86
+
87
def _render_coverage(statements: list[SqlStatement]) -> str:
    """Render ``coverage_report.md``: statement count and per-kind totals.

    Kinds are listed most frequent first. The breakdown section is omitted
    when there are no statements.
    """
    report = ["# Coverage Report", "", f"- Statements: {len(statements)}"]
    if statements:
        per_kind = Counter(s.kind for s in statements)
        report += ["", "## Statement kinds", ""]
        report += [f"- `{kind}`: {n}" for kind, n in per_kind.most_common()]
    return "\n".join(report) + "\n"
99
+
100
+
101
def _render_tables(tables: list[SqlTable]) -> str:
    """Render ``tables.md``: one section per declared table.

    Each section shows the primary-key status, the source location and the
    parsed column names. A placeholder line is emitted when ``tables`` is
    empty.
    """
    out = ["# Tables", "", f"- Count: {len(tables)}", ""]
    if not tables:
        return "\n".join(out + ["_No CREATE TABLE statements found._"]) + "\n"
    for table in tables:
        pk_label = "yes" if table.has_primary_key else "NO"
        column_list = ", ".join(f"`{c}`" for c in table.columns) or "(none parsed)"
        out.extend(
            [
                f"## `{table.name}` (PK: {pk_label})",
                "",
                f"- Source: `{table.file}:{table.lineno}`",
                f"- Columns ({len(table.columns)}): {column_list}",
                "",
            ]
        )
    return "\n".join(out)
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from tessera_sql.parse import (
7
+ classify,
8
+ parse_create_table,
9
+ split_statements,
10
+ statement_flags,
11
+ )
12
+ from tessera_sql.schema import SqlStatement, SqlTable
13
+
14
# Directory (or file) names that exclude a path from discovery when they
# appear anywhere in its location relative to the search root.
_IGNORE = {
    ".git", ".venv", "venv", "node_modules", "__pycache__", ".pytest_cache",
    "dist", "build", ".tox", "target",
}


def discover_sql_files(root: Path) -> list[Path]:
    """Return the SQL files to process for ``root``.

    A file path is returned as-is (whatever its extension). A directory is
    searched recursively for ``*.sql`` and the matches are returned sorted,
    skipping any whose relative path contains a name listed in ``_IGNORE``.
    """
    if root.is_file():
        return [root]
    return [
        candidate
        for candidate in sorted(root.rglob("*.sql"))
        if _IGNORE.isdisjoint(candidate.relative_to(root).parts)
    ]
29
+
30
+
31
def load_sql_records(input_path: Path, options: dict[str, Any]) -> list[SqlStatement]:
    """Parse SQL files into statements; stash discovered tables in options.

    Besides returning the statements, this writes three keys into
    ``options``: ``_tables`` (tables from CREATE TABLE statements),
    ``_file_count`` (number of discovered files) and ``_root`` (the base
    directory as a string). Files that cannot be read or decoded as UTF-8
    are skipped.
    """
    if input_path.is_dir():
        root = input_path
    else:
        root = input_path.parent
    files = discover_sql_files(input_path if input_path.is_file() else root)

    statements: list[SqlStatement] = []
    tables: list[SqlTable] = []

    for sql_file in files:
        try:
            source = sql_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue  # best effort: an unreadable file is ignored
        if sql_file.is_relative_to(root):
            rel = sql_file.relative_to(root).as_posix()
        else:
            rel = sql_file.name

        for body, line_no in split_statements(source):
            kind, target = classify(body)
            record = SqlStatement(
                kind=kind,
                target=target,
                file=rel,
                lineno=line_no,
                preview=" ".join(body.split())[:100],
                flags=statement_flags(kind, body),
            )
            statements.append(record)
            if kind != "create_table":
                continue
            table = parse_create_table(body, target)
            if table is None:
                continue
            table.file = rel
            table.lineno = line_no
            tables.append(table)

    options.update(_tables=tables, _file_count=len(files), _root=str(root))
    return statements
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from tessera_core.jobpack import JobPack
7
+ from tessera_core.models import Artifact, RunContext, ValidationFinding
8
+
9
+ from tessera_sql.compiler import load_records, validate_records, write_artifacts
10
+
11
+
12
class SqlPack(JobPack):
    """Job pack that lints SQL files into statement and table catalogs.

    Each stage delegates to the matching function in ``tessera_sql.compiler``.
    """

    name = "sql"
    # Kept in step with the distribution version declared in pyproject.toml.
    version = "0.4.0"

    def normalize(self, input_path: Path, options: dict[str, Any]) -> list[Any]:
        """Parse the SQL under ``input_path`` into statement records."""
        return load_records(input_path, options)

    def validate(
        self,
        records: list[Any],
        options: dict[str, Any],
    ) -> list[ValidationFinding]:
        """Run the lint rules over ``records`` and return the findings."""
        return validate_records(records, options)

    def generate(
        self,
        records: list[Any],
        ctx: RunContext,
        options: dict[str, Any],
    ) -> list[Artifact]:
        """Write the catalog and report files; return the artifacts written."""
        return write_artifacts(records, ctx, options)
33
+
34
+
35
def create_pack() -> SqlPack:
    """Build a fresh ``SqlPack`` (target of the ``tessera.jobpacks`` entry point)."""
    pack = SqlPack()
    return pack
@@ -0,0 +1,156 @@
1
+ """Lightweight SQL parsing: strip comments, split statements, classify, extract.
2
+
3
+ Not a full SQL grammar. It strips comments, splits on top-level semicolons,
4
+ and uses keyword/regex heuristics to classify statements and pull out the
5
+ high-signal facts a migration reviewer cares about.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ from tessera_sql.schema import SqlStatement, SqlTable
13
+
14
_LINE_COMMENT = re.compile(r"--[^\n]*")
_BLOCK_COMMENT = re.compile(r"/\*.*?\*/", re.DOTALL)
_IDENT = r'[`"\[]?([A-Za-z_][A-Za-z0-9_.$]*)[`"\]]?'

# Single left-to-right scan over quoted literals and both comment forms.
# Matching quoted text first means a `--` or `/*` inside a string is consumed
# as part of the string and never mistaken for a comment.
_COMMENT_OR_QUOTED = re.compile(
    r"""'[^']*'|"[^"]*"|--[^\n]*|/\*.*?\*/""",
    re.DOTALL,
)


def _comment_replacement(match: re.Match) -> str:
    """Return the text that replaces one quoted-or-comment match."""
    token = match.group(0)
    if token[0] in ("'", '"'):
        return token  # quoted text is kept verbatim
    if token.startswith("--"):
        return ""  # the terminating newline is not part of the match
    # Block comment: keep its newlines so later line numbers stay correct.
    return " " + "\n" * token.count("\n")


def strip_comments(sql: str) -> str:
    """Remove ``--`` and ``/* */`` comments from ``sql``.

    Comment markers inside single- or double-quoted text are left alone, and
    every newline of the input is preserved so line numbers computed on the
    result match the original file.
    """
    return _COMMENT_OR_QUOTED.sub(_comment_replacement, sql)


def split_statements(sql: str) -> list[tuple[str, int]]:
    """Split into (statement_text, line_number) on top-level semicolons.

    Semicolons inside single/double quotes are ignored. ``line_number`` is
    the 1-based line on which the statement's first non-blank character
    appears. Empty statements are dropped.
    """
    cleaned = strip_comments(sql)
    statements: list[tuple[str, int]] = []
    buf: list[str] = []
    line = 1
    start_line = 1
    quote: str | None = None
    for ch in cleaned:
        if ch == "\n":
            line += 1
        if quote:
            buf.append(ch)
            if ch == quote:
                quote = None
            continue
        if ch in ("'", '"'):
            quote = ch
            buf.append(ch)
            continue
        if ch == ";":
            text = "".join(buf).strip()
            if text:
                statements.append((text, start_line))
            buf = []
            start_line = line
            continue
        if not buf and ch.strip() == "":
            # Leading whitespace is not stored: keeping the buffer empty lets
            # start_line keep advancing until the statement really begins.
            start_line = line
            continue
        buf.append(ch)
    tail = "".join(buf).strip()
    if tail:
        statements.append((tail, start_line))
    return statements
62
+
63
+
64
def classify(stmt: str) -> tuple[str, str]:
    """Return (kind, target_name).

    ``kind`` is decided from the statement's leading keyword; ``target_name``
    is the first identifier captured by the kind's pattern, or ``""`` when
    nothing matches. SELECT/WITH statements and unrecognised statements never
    carry a target.
    """
    text = stmt.lstrip()
    lowered = text.lower()

    def target_of(pattern: str) -> str:
        found = re.search(pattern, text, re.IGNORECASE)
        return found.group(1) if found else ""

    if lowered.startswith("create"):
        if re.search(r"create\s+(temp\w*\s+)?table", lowered):
            return "create_table", target_of(
                rf"create\s+(?:temp\w*\s+)?table\s+(?:if\s+not\s+exists\s+)?{_IDENT}"
            )
        # Only look before the first "(" so a column named like "index"
        # does not turn the statement into a CREATE INDEX.
        if "index" in lowered.split("(")[0]:
            return "create_index", target_of(rf"index\s+(?:if\s+not\s+exists\s+)?{_IDENT}")

    # For these kinds the leading keyword is also the kind name.
    keyword_patterns = (
        ("alter", rf"alter\s+table\s+{_IDENT}"),
        ("truncate", rf"truncate\s+(?:table\s+)?{_IDENT}"),
        ("drop", rf"drop\s+\w+\s+(?:if\s+exists\s+)?{_IDENT}"),
        ("insert", rf"insert\s+into\s+{_IDENT}"),
        ("update", rf"update\s+{_IDENT}"),
        ("delete", rf"delete\s+from\s+{_IDENT}"),
    )
    for keyword, pattern in keyword_patterns:
        if lowered.startswith(keyword):
            return keyword, target_of(pattern)

    if lowered.startswith(("select", "with")):
        return "select", ""
    return "other", ""
92
+
93
+
94
def statement_flags(kind: str, stmt: str) -> dict:
    """Return the parser observations for one statement.

    Keys set per ``kind``:
        update / delete: ``has_where``
        drop: ``if_exists``, ``drops_column``
        select: ``select_star``
        create_table: ``if_not_exists``
        alter: ``adds_column``, ``drops_column``, ``renames`` and, only when
            it applies, ``add_not_null_without_default``

    Matching is keyword based on the lower-cased text, so keywords inside
    string literals are counted too.
    """
    # ADD / DROP clauses of ALTER that act on something other than a column.
    # The COLUMN keyword is optional in several dialects, so a column clause
    # is recognised by what does NOT follow ADD / DROP.
    non_column_add = "constraint|primary|foreign|unique|check|index|key|fulltext|spatial|partition"
    non_column_drop = "constraint|primary|foreign|check|index|key|default|not|identity|expression|partition"

    low = stmt.lower()
    flags: dict = {}
    if kind in ("update", "delete"):
        flags["has_where"] = bool(re.search(r"\bwhere\b", low))
    if kind == "drop":
        # \s+ rather than a literal space: tolerate newlines / repeated blanks.
        flags["if_exists"] = bool(re.search(r"\bif\s+exists\b", low))
        flags["drops_column"] = bool(re.search(r"\bdrop\s+column\b", low))  # only via ALTER, but guard anyway
    if kind == "select":
        # SELECT * (not count(*))
        flags["select_star"] = bool(re.search(r"select\s+\*", low))
    if kind == "create_table":
        flags["if_not_exists"] = bool(re.search(r"\bif\s+not\s+exists\b", low))
    if kind == "alter":
        # The leading `\s` alternative stops `\s+` from backtracking past the
        # excluded keyword when several blanks separate the two words.
        adds_col = bool(re.search(rf"\badd\s+(?!\s|(?:{non_column_add})\b)", low))
        flags["adds_column"] = adds_col
        flags["drops_column"] = bool(re.search(rf"\bdrop\s+(?!\s|(?:{non_column_drop})\b)", low))
        flags["renames"] = bool(re.search(r"\brename\b", low))
        # locking risk: ADD COLUMN ... NOT NULL without a DEFAULT rewrites the table
        if adds_col and re.search(r"\bnot\s+null\b", low) and not re.search(r"\bdefault\b", low):
            flags["add_not_null_without_default"] = True
    return flags
116
+
117
+
118
def parse_create_table(stmt: str, target: str) -> SqlTable | None:
    """Extract column names and primary-key presence from a CREATE TABLE.

    Args:
        stmt: Full statement text.
        target: Table name already determined by the caller.

    Returns:
        A ``SqlTable`` named ``target``. When the statement has no
        parenthesised body the table has no columns and no primary key.
    """
    m = re.search(r"\((.*)\)", stmt, re.DOTALL)
    if not m:
        return SqlTable(name=target, columns=[], has_primary_key=False)
    body = m.group(1)
    columns: list[str] = []
    # One search over the whole body covers both inline column constraints
    # and table-level PRIMARY KEY clauses.
    has_pk = bool(re.search(r"primary\s+key", body, re.IGNORECASE))
    # Table-level clauses, which are not columns. The word boundary matters:
    # a plain prefix test would also discard real columns such as `checksum`,
    # `unique_id` or `index_name`.
    table_clause = re.compile(
        r"(?:primary\s+key|foreign\s+key|unique|constraint|check|index)\b|key\s",
        re.IGNORECASE,
    )
    for part in _split_top_level(body):
        p = part.strip()
        if not p:
            continue
        if table_clause.match(p):
            continue
        m2 = re.match(_IDENT, p)
        if m2:
            columns.append(m2.group(1))
    return SqlTable(name=target, columns=columns, has_primary_key=has_pk)
138
+
139
+
140
def _split_top_level(body: str) -> list[str]:
    """Split ``body`` on commas that are outside parentheses and quotes.

    Commas and parentheses inside single- or double-quoted text (for example
    ``DEFAULT 'a,b'`` or ``DEFAULT '('``) are treated as ordinary characters.
    Parts are returned unstripped; a trailing empty part is omitted.
    """
    parts: list[str] = []
    depth = 0
    quote: str | None = None
    buf: list[str] = []
    for ch in body:
        if quote is not None:
            # Inside a literal: copy verbatim until the matching quote.
            buf.append(ch)
            if ch == quote:
                quote = None
            continue
        if ch in ("'", '"'):
            quote = ch
            buf.append(ch)
            continue
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        if ch == "," and depth == 0:
            parts.append("".join(buf))
            buf = []
        else:
            buf.append(ch)
    if buf:
        parts.append("".join(buf))
    return parts
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class SqlStatement(BaseModel):
    """A single parsed SQL statement; one JSON line in ``statements.jsonl``."""

    # create_table / create_index / alter / truncate / drop / insert /
    # update / delete / select / other
    kind: str
    # Table or index name, when it could be determined.
    target: str = ""
    file: str = ""
    lineno: int = 0
    # Roughly the first 100 characters, with comments stripped.
    preview: str = ""
    # Parser observations (has_where, if_exists, select_star, ...).
    flags: dict[str, Any] = Field(default_factory=dict)
17
+
18
+
19
class SqlTable(BaseModel):
    """A table declared by CREATE TABLE; one JSON line in ``tables.jsonl``."""

    name: str
    # Column names in declaration order.
    columns: list[str] = Field(default_factory=list)
    has_primary_key: bool = False
    # Source location of the declaring statement.
    file: str = ""
    lineno: int = 0
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from tessera_core.models import ValidationFinding
6
+
7
+ from tessera_sql.schema import SqlStatement, SqlTable
8
+
9
+
10
def validate_sql_records(statements: list[SqlStatement], options: dict[str, Any]) -> list[ValidationFinding]:
    """Apply the lint rules to parsed statements and declared tables.

    With no statements a single ``no_statements`` info finding is returned.
    Otherwise the findings for each statement come first, in statement
    order, followed by one ``table_without_primary_key`` warning for every
    table in ``options["_tables"]`` that lacks a primary key.
    """
    if not statements:
        return [
            ValidationFinding(severity="info", code="no_statements",
                              message="no SQL statements found", field=None)
        ]

    findings: list[ValidationFinding] = []

    def report(stmt: SqlStatement, severity: str, code: str, text: str) -> None:
        # Every statement-level message is prefixed with "file:line: ".
        findings.append(
            ValidationFinding(
                severity=severity, code=code,
                message=f"{stmt.file}:{stmt.lineno}: {text}",
                field="sql",
                metadata={"file": stmt.file, "lineno": stmt.lineno, "kind": stmt.kind},
            )
        )

    for stmt in statements:
        kind = stmt.kind
        flags = stmt.flags

        if kind == "delete":
            if flags.get("has_where") is False:
                report(stmt, "error", "delete_without_where",
                       "DELETE without WHERE removes every row")
        elif kind == "update":
            if flags.get("has_where") is False:
                report(stmt, "warning", "update_without_where",
                       "UPDATE without WHERE writes every row")
        elif kind == "drop":
            if not flags.get("if_exists"):
                report(stmt, "warning", "drop_without_if_exists",
                       "DROP without IF EXISTS fails if the object is absent")
        elif kind == "select":
            if flags.get("select_star"):
                report(stmt, "info", "select_star",
                       "SELECT * couples the query to column order/shape")
        # --- migration-safety rules (the costly, easy-to-miss ones) ---
        elif kind == "truncate":
            report(stmt, "warning", "truncate_table",
                   "TRUNCATE removes all rows and is often non-transactional / non-reversible")
        elif kind == "alter":
            if flags.get("add_not_null_without_default"):
                report(stmt, "error", "add_not_null_without_default",
                       "ADD COLUMN NOT NULL without DEFAULT rewrites the table and fails on existing rows")
            if flags.get("drops_column"):
                report(stmt, "warning", "drop_column",
                       "dropping a column is destructive and irreversible; ensure no code still reads it")
            if flags.get("renames"):
                report(stmt, "warning", "rename_breaks_compatibility",
                       "RENAME breaks any code/queries referencing the old name; prefer add-new + backfill + drop-old")
        elif kind == "create_table":
            if not flags.get("if_not_exists"):
                report(stmt, "info", "create_table_without_if_not_exists",
                       "CREATE TABLE without IF NOT EXISTS is not idempotent if the migration re-runs")

    declared: list[SqlTable] = options.get("_tables", [])
    for table in declared:
        if table.has_primary_key:
            continue
        findings.append(
            ValidationFinding(
                severity="warning", code="table_without_primary_key",
                message=f"{table.file}:{table.lineno}: table `{table.name}` has no PRIMARY KEY",
                field="sql", metadata={"table": table.name, "file": table.file, "lineno": table.lineno},
            )
        )

    return findings
@@ -0,0 +1,148 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from tessera_core.models import RunContext
7
+
8
+ from tessera_sql.pack import SqlPack
9
+ from tessera_sql.parse import classify, parse_create_table, split_statements, statement_flags
10
+ from tessera_sql.schema import SqlStatement
11
+
12
# Three directory levels above this test file.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Sample schema that the end-to-end tests run against.
SAMPLE = REPO_ROOT.joinpath("examples", "sql", "schema.sql")
14
+
15
+
16
+ # ---------- parsing ----------
17
+
18
+
19
def test_split_ignores_semicolons_in_strings():
    """A semicolon inside a quoted literal must not end the statement."""
    pieces = split_statements("INSERT INTO t VALUES ('a;b'); SELECT 1;")
    texts = [text for text, _lineno in pieces]
    assert len(texts) == 2
    assert "a;b" in texts[0]
24
+
25
+
26
def test_split_strips_comments():
    """Line and block comments are removed before statements are split."""
    source = "-- a comment\nSELECT 1; /* block\ncomment */ SELECT 2;"
    texts = [text for text, _lineno in split_statements(source)]
    assert len(texts) == 2
    assert "comment" not in " ".join(texts)
31
+
32
+
33
def test_classify_kinds():
    """Each statement form maps to its expected kind."""
    expected_kinds = (
        ("CREATE TABLE users (id int)", "create_table"),
        ("create index idx on t(a)", "create_index"),
        ("ALTER TABLE users ADD COLUMN x int", "alter"),
        ("DROP TABLE t", "drop"),
        ("DELETE FROM t", "delete"),
        ("UPDATE t SET a=1", "update"),
        ("SELECT * FROM t", "select"),
    )
    for sql, kind in expected_kinds:
        assert classify(sql)[0] == kind
41
+
42
+
43
def test_classify_targets():
    """The target name is extracted for CREATE TABLE and DELETE."""
    _, created = classify("CREATE TABLE users (id int)")
    _, deleted_from = classify("DELETE FROM sessions WHERE x=1")
    assert created == "users"
    assert deleted_from == "sessions"
46
+
47
+
48
def test_flags():
    """Spot-check has_where, if_exists and select_star."""
    cases = (
        ("delete", "DELETE FROM t", "has_where", False),
        ("delete", "DELETE FROM t WHERE a=1", "has_where", True),
        ("drop", "DROP TABLE t", "if_exists", False),
        ("drop", "DROP TABLE IF EXISTS t", "if_exists", True),
        ("select", "SELECT * FROM t", "select_star", True),
    )
    for kind, sql, flag, expected in cases:
        assert statement_flags(kind, sql)[flag] is expected
54
+
55
+
56
def test_parse_create_table_columns_and_pk():
    """Columns are collected and an inline PRIMARY KEY is detected."""
    users = parse_create_table("CREATE TABLE users (id INTEGER PRIMARY KEY, email TEXT)", "users")
    assert users.name == "users"
    assert "id" in users.columns and "email" in users.columns
    assert users.has_primary_key is True

    logs = parse_create_table("CREATE TABLE logs (message TEXT, created_at TIMESTAMP)", "logs")
    assert logs.has_primary_key is False
64
+
65
+
66
+ # ---------- end-to-end ----------
67
+
68
+
69
def _run(tmp_path: Path):
    """Run the pack over SAMPLE; return (output directory, run context)."""
    out_dir = tmp_path / "sql_pack"
    run_ctx = RunContext(job_name="sql", output_dir=out_dir)
    SqlPack().run(input_path=SAMPLE, ctx=run_ctx, options={})
    return out_dir, run_ctx
74
+
75
+
76
def test_findings(tmp_path: Path):
    """The sample schema triggers each query-safety and schema rule."""
    _, ctx = _run(tmp_path)
    codes = {finding.code for finding in ctx.metadata["findings"]}
    expected = (
        "delete_without_where",  # DELETE FROM sessions;
        "update_without_where",  # UPDATE users SET active=false;
        "drop_without_if_exists",  # DROP TABLE temp_data;
        "table_without_primary_key",  # logs
        "select_star",  # SELECT * FROM users
    )
    for code in expected:
        assert code in codes
84
+
85
+
86
def test_safe_statements_not_flagged(tmp_path: Path):
    """Only the unguarded DELETE is reported; the one with WHERE is not."""
    _, ctx = _run(tmp_path)
    delete_hits = sum(
        1 for finding in ctx.metadata["findings"] if finding.code == "delete_without_where"
    )
    assert delete_hits == 1
92
+
93
+
94
def test_migration_safety_findings(tmp_path: Path):
    """The sample schema triggers each migration-safety rule."""
    _, ctx = _run(tmp_path)
    codes = {finding.code for finding in ctx.metadata["findings"]}
    expected = (
        "add_not_null_without_default",  # ALTER ... ADD COLUMN phone TEXT NOT NULL
        "drop_column",  # ALTER ... DROP COLUMN active
        "rename_breaks_compatibility",  # ALTER ... RENAME TO audit_logs
        "truncate_table",  # TRUNCATE TABLE audit_logs
        "create_table_without_if_not_exists",  # users / logs
    )
    for code in expected:
        assert code in codes
102
+
103
+
104
def test_add_column_with_default_not_flagged(tmp_path: Path):
    """ADD COLUMN ... DEFAULT '' is safe and must not raise add_not_null_without_default."""
    _, ctx = _run(tmp_path)
    # Only the phone column (no default) should be flagged, not nickname
    # (which has a default).
    offender_count = sum(
        1
        for finding in ctx.metadata["findings"]
        if finding.code == "add_not_null_without_default"
    )
    assert offender_count == 1
113
+
114
+
115
def test_if_not_exists_create_not_flagged(tmp_path: Path):
    """CREATE TABLE IF NOT EXISTS settings must not trigger the idempotency info."""
    from tessera_sql.parse import classify, statement_flags

    sql = "CREATE TABLE IF NOT EXISTS settings (id INTEGER PRIMARY KEY)"
    kind, _target = classify(sql)
    flags = statement_flags(kind, sql)
    assert kind == "create_table"
    assert flags["if_not_exists"] is True
122
+
123
+
124
def test_truncate_classified():
    """TRUNCATE TABLE is its own kind and yields the table name."""
    from tessera_sql.parse import classify

    result = classify("TRUNCATE TABLE audit_logs")
    assert result[0] == "truncate"
    assert result[1] == "audit_logs"
129
+
130
+
131
def test_artifacts_and_tables(tmp_path: Path):
    """All six artifacts exist and tables.jsonl carries the PK flags."""
    out, _ = _run(tmp_path)
    expected_files = {
        "statements.jsonl", "tables.jsonl", "index.md",
        "validation_report.md", "coverage_report.md", "tables.md",
    }
    produced = {entry.name for entry in out.iterdir()}
    assert expected_files <= produced

    by_name = {}
    for raw in (out / "tables.jsonl").read_text().splitlines():
        row = json.loads(raw)
        by_name[row["name"]] = row
    assert by_name["users"]["has_primary_key"] is True
    assert by_name["logs"]["has_primary_key"] is False
142
+
143
+
144
def test_statement_round_trip(tmp_path: Path):
    """Every line of statements.jsonl validates back into a SqlStatement."""
    out, _ = _run(tmp_path)
    raw_lines = (out / "statements.jsonl").read_text().splitlines()
    for raw in raw_lines:
        restored = SqlStatement.model_validate_json(raw)
        assert restored.kind