sqf-py 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqf/__init__.py +70 -0
- sqf/analyzer.py +239 -0
- sqf/benchmark.py +488 -0
- sqf/fingerprint.py +139 -0
- sqf/generator.py +647 -0
- sqf/normalizer.py +318 -0
- sqf/snowflake.py +460 -0
- sqf/sql/001_create_cluster_store.sql +74 -0
- sqf/sql/002_hit_rate_views.sql +186 -0
- sqf/sql/003_query_history_export.sql +31 -0
- sqf_py-0.1.0.dist-info/METADATA +191 -0
- sqf_py-0.1.0.dist-info/RECORD +15 -0
- sqf_py-0.1.0.dist-info/WHEEL +5 -0
- sqf_py-0.1.0.dist-info/licenses/LICENSE +21 -0
- sqf_py-0.1.0.dist-info/top_level.txt +1 -0
sqf/__init__.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sqf — Semantic Query Fingerprinting for Snowflake
|
|
3
|
+
==================================================
|
|
4
|
+
|
|
5
|
+
A Python library that assigns a stable, content-addressed fingerprint to any
|
|
6
|
+
SQL query by normalizing away syntactic noise. Queries that are logically
|
|
7
|
+
identical but written differently collapse to the same fingerprint, enabling
|
|
8
|
+
deduplication analysis, cost attribution, and query-cache optimization.
|
|
9
|
+
|
|
10
|
+
Quick start::
|
|
11
|
+
|
|
12
|
+
from sqf import fingerprint, are_equivalent, SQFAnalyzer
|
|
13
|
+
|
|
14
|
+
# Single query fingerprinting
|
|
15
|
+
h = fingerprint("SELECT a, b FROM t WHERE id = 1")
|
|
16
|
+
|
|
17
|
+
# Equivalence check
|
|
18
|
+
are_equivalent(
|
|
19
|
+
"SELECT a AS col1, b AS col2 FROM t WHERE id = 1",
|
|
20
|
+
"SELECT b, a FROM t WHERE id = ?",
|
|
21
|
+
) # → True
|
|
22
|
+
|
|
23
|
+
# Bulk workload analysis
|
|
24
|
+
analyzer = SQFAnalyzer()
|
|
25
|
+
analyzer.ingest_sql(my_query_list)
|
|
26
|
+
print(analyzer.report().summary())
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from .fingerprint import (
|
|
30
|
+
fingerprint,
|
|
31
|
+
canonical_form,
|
|
32
|
+
are_equivalent,
|
|
33
|
+
QueryRecord,
|
|
34
|
+
SQFCluster,
|
|
35
|
+
)
|
|
36
|
+
from .normalizer import normalize
|
|
37
|
+
from .analyzer import SQFAnalyzer, SQFReport
|
|
38
|
+
from .generator import SyntheticWorkloadGenerator, FAMILIES, FAMILY_BY_ID
|
|
39
|
+
from .snowflake import SnowflakeIngestor, ClusterStore, load_sql, SQL_FILES
|
|
40
|
+
from .benchmark import (
|
|
41
|
+
BenchmarkRun,
|
|
42
|
+
BenchmarkSuite,
|
|
43
|
+
run_single,
|
|
44
|
+
run_benchmark_suite,
|
|
45
|
+
make_charts,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Package version string.
__version__ = "0.1.0"

# Public API of the package: every name listed here is imported above from
# one of the submodules and re-exported by ``from sqf import *``.
__all__ = [
    # Fingerprinting entry points (.fingerprint / .normalizer)
    "fingerprint",
    "canonical_form",
    "are_equivalent",
    "normalize",
    # Record and cluster containers (.fingerprint)
    "QueryRecord",
    "SQFCluster",
    # Workload analysis (.analyzer)
    "SQFAnalyzer",
    "SQFReport",
    # Synthetic workload generation (.generator)
    "SyntheticWorkloadGenerator",
    "FAMILIES",
    "FAMILY_BY_ID",
    # Snowflake integration and bundled SQL files (.snowflake)
    "SnowflakeIngestor",
    "ClusterStore",
    "load_sql",
    "SQL_FILES",
    # Benchmarking (.benchmark)
    "BenchmarkRun",
    "BenchmarkSuite",
    "run_single",
    "run_benchmark_suite",
    "make_charts",
]
|
sqf/analyzer.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sqf.analyzer
|
|
3
|
+
------------
|
|
4
|
+
SQFAnalyzer: ingests a list of QueryRecord objects (or raw SQL strings),
|
|
5
|
+
groups them into SQFClusters, and produces deduplication + cost metrics.
|
|
6
|
+
|
|
7
|
+
This is the layer that connects to Snowflake's QUERY_HISTORY in production,
|
|
8
|
+
or to the synthetic generator in benchmarks.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import Iterable
|
|
16
|
+
|
|
17
|
+
from .fingerprint import QueryRecord, SQFCluster, fingerprint, canonical_form
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# Analyzer
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
class SQFAnalyzer:
    """
    Core analysis engine.

    Fingerprints query records and groups them into ``SQFCluster`` objects
    keyed by fingerprint hash. Records that still have ``sqf_hash is None``
    after fingerprinting are kept in a separate "unparseable" list; they are
    counted in ``total_queries`` but belong to no cluster.

    Example::

        analyzer = SQFAnalyzer()
        analyzer.ingest(records)          # list[QueryRecord]
        report = analyzer.report()
        print(report.summary())
    """

    def __init__(self, dialect: str = "snowflake"):
        """
        Create an empty analyzer.

        Args:
            dialect: SQL dialect name passed to every record's
                ``compute_fingerprint`` call and copied into the report.
        """
        self.dialect = dialect
        # Fingerprint hash -> cluster of all records sharing that hash.
        self._clusters: dict[str, SQFCluster] = {}
        # Records for which no fingerprint was produced.
        self._unparseable: list[QueryRecord] = []

    # ------------------------------------------------------------------
    # Ingestion
    # ------------------------------------------------------------------

    def ingest(self, records: Iterable[QueryRecord]) -> "SQFAnalyzer":
        """
        Fingerprint each record and add it to the appropriate cluster.

        Each record is fingerprinted in place via ``compute_fingerprint``.
        A record whose ``sqf_hash`` is ``None`` afterwards is stored as
        unparseable; otherwise it is appended to the cluster for its hash,
        creating that cluster on first sight.

        Args:
            records: Records to ingest; consumed in a single pass.

        Returns:
            ``self``, so calls can be chained.
        """
        for rec in records:
            rec.compute_fingerprint(dialect=self.dialect)
            sqf_hash = rec.sqf_hash
            if sqf_hash is None:
                self._unparseable.append(rec)
                continue
            cluster = self._clusters.get(sqf_hash)
            if cluster is None:
                # The first record seen for a hash supplies the cluster's
                # canonical text (empty string when the record has none).
                cluster = SQFCluster(
                    sqf_hash=sqf_hash,
                    canonical_form=rec.canonical or "",
                )
                self._clusters[sqf_hash] = cluster
            cluster.records.append(rec)
        return self

    def ingest_sql(
        self,
        queries: Iterable[str],
        credits_per_query: float = 0.01,
    ) -> "SQFAnalyzer":
        """
        Convenience method: ingest plain SQL strings with uniform cost.

        Each string is wrapped in a ``QueryRecord`` with a synthetic
        ``query_id`` of the form ``q000042``. Numbering continues from the
        number of queries already ingested, so repeated calls on the same
        analyzer do not hand out the same ID twice.

        Args:
            queries: SQL statements to ingest.
            credits_per_query: Value stored as ``credits_used`` on every
                generated record.

        Returns:
            ``self``, so calls can be chained.
        """
        # Captured once, before ingestion starts changing total_queries.
        first_index = self.total_queries
        records = (
            QueryRecord(
                query_id=f"q{i:06d}",
                sql=sql,
                credits_used=credits_per_query,
            )
            for i, sql in enumerate(queries, start=first_index)
        )
        return self.ingest(records)

    # ------------------------------------------------------------------
    # Accessors
    # ------------------------------------------------------------------

    @property
    def clusters(self) -> list[SQFCluster]:
        """All clusters, as a new list on every access."""
        return list(self._clusters.values())

    @property
    def duplicate_clusters(self) -> list[SQFCluster]:
        """Clusters with more than one execution (i.e. actual duplicates)."""
        return [c for c in self._clusters.values() if c.size > 1]

    @property
    def total_queries(self) -> int:
        """Number of ingested records, clustered and unparseable together."""
        clustered = sum(c.size for c in self._clusters.values())
        return clustered + len(self._unparseable)

    @property
    def total_credits(self) -> float:
        """Credits summed over clusters; unparseable records are excluded."""
        return sum(c.total_credits for c in self._clusters.values())

    # ------------------------------------------------------------------
    # Report
    # ------------------------------------------------------------------

    def report(self) -> "SQFReport":
        """
        Build an ``SQFReport`` from the current state.

        The report receives a fresh list of the current clusters, the count
        of unparseable records, and the analyzer's dialect.
        """
        return SQFReport(
            clusters=self.clusters,
            unparseable_count=len(self._unparseable),
            dialect=self.dialect,
        )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# Report
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
@dataclass
class SQFReport:
    """
    Analysis report. Produced by SQFAnalyzer.report().

    Holds the list of clusters plus the unparseable-query count and derives
    every metric from them on access (nothing is cached). The dataclass is
    not frozen and the cluster objects are the same ones the analyzer holds,
    so treat a report as read-only.
    """
    clusters: list[SQFCluster]
    unparseable_count: int = 0
    dialect: str = "snowflake"

    # ------------------------------------------------------------------
    # Core metrics
    # ------------------------------------------------------------------

    @property
    def total_executions(self) -> int:
        """Number of executions across all clusters (unparseable excluded)."""
        return sum(c.size for c in self.clusters)

    @property
    def unique_fingerprints(self) -> int:
        """Number of clusters, i.e. distinct fingerprints."""
        return len(self.clusters)

    @property
    def duplicate_executions(self) -> int:
        """Total executions that are semantic duplicates of a prior run."""
        # Every cluster contributes all of its executions except the first.
        return sum(max(0, c.size - 1) for c in self.clusters)

    @property
    def dedup_hit_rate(self) -> float:
        """
        Fraction of executions that are semantic duplicates.
        This is the headline metric for the white paper.

        Returns 0.0 when the report contains no executions.
        """
        executions = self.total_executions
        if executions == 0:
            return 0.0
        return self.duplicate_executions / executions

    @property
    def total_credits(self) -> float:
        """Sum of ``total_credits`` over all clusters."""
        return sum(c.total_credits for c in self.clusters)

    @property
    def wasted_credits(self) -> float:
        """Sum of ``wasted_credits`` over all clusters."""
        return sum(c.wasted_credits for c in self.clusters)

    @property
    def credit_waste_rate(self) -> float:
        """Wasted credits as a fraction of total credits (0.0 if none)."""
        credits = self.total_credits
        if credits == 0:
            return 0.0
        return self.wasted_credits / credits

    @property
    def multi_variant_clusters(self) -> list[SQFCluster]:
        """
        Clusters where the same logical query was written in multiple
        syntactically distinct ways — the most compelling white paper examples.
        """
        return [
            c for c in self.clusters
            if c.size > 1 and c.syntactic_variant_count > 1
        ]

    # ------------------------------------------------------------------
    # Top offenders
    # ------------------------------------------------------------------

    def top_clusters_by_waste(self, n: int = 10) -> list[SQFCluster]:
        """
        Return up to ``n`` clusters with the highest wasted credits.

        Clusters with no wasted credits are left out. A non-positive ``n``
        yields an empty list.
        """
        wasteful = [c for c in self.clusters if c.wasted_credits > 0]
        wasteful.sort(key=lambda c: c.wasted_credits, reverse=True)
        # Clamp so a negative n cannot turn the slice into "drop the tail".
        return wasteful[:max(n, 0)]

    def top_clusters_by_size(self, n: int = 10) -> list[SQFCluster]:
        """
        Return up to ``n`` clusters with the most executions.

        A non-positive ``n`` yields an empty list.
        """
        ranked = sorted(self.clusters, key=lambda c: c.size, reverse=True)
        return ranked[:max(n, 0)]

    # ------------------------------------------------------------------
    # Summary text
    # ------------------------------------------------------------------

    def summary(self) -> str:
        """Render the headline metrics as a multi-line text block."""
        heavy_rule = "═" * 60
        light_rule = "─" * 60
        lines = [
            heavy_rule,
            " Semantic Query Fingerprint (SQF) Analysis Report",
            heavy_rule,
            f" Total query executions : {self.total_executions:>8,}",
            f" Unique SQF fingerprints : {self.unique_fingerprints:>8,}",
            f" Duplicate executions : {self.duplicate_executions:>8,}",
            f" Unparseable queries : {self.unparseable_count:>8,}",
            light_rule,
            f" Dedup hit rate : {self.dedup_hit_rate:>8.1%}",
            f" Total credits consumed : {self.total_credits:>8.4f}",
            f" Credits wasted : {self.wasted_credits:>8.4f}",
            f" Credit waste rate : {self.credit_waste_rate:>8.1%}",
            light_rule,
            f" Multi-variant clusters : {len(self.multi_variant_clusters):>8,}",
            heavy_rule,
        ]
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """
        Serialize report metrics to a dict (for JSON export / charts).

        Rates are rounded to 4 decimal places and credit amounts to 6.
        ``top_clusters`` lists the five clusters with the most wasted
        credits, each with a 12-character hash prefix and the first 120
        characters of its canonical form.
        """
        return {
            "total_executions": self.total_executions,
            "unique_fingerprints": self.unique_fingerprints,
            "duplicate_executions": self.duplicate_executions,
            "unparseable_count": self.unparseable_count,
            "dedup_hit_rate": round(self.dedup_hit_rate, 4),
            "total_credits": round(self.total_credits, 6),
            "wasted_credits": round(self.wasted_credits, 6),
            "credit_waste_rate": round(self.credit_waste_rate, 4),
            "multi_variant_clusters": len(self.multi_variant_clusters),
            "top_clusters": [
                {
                    "sqf_hash": c.sqf_hash[:12] + "...",
                    "size": c.size,
                    "syntactic_variants": c.syntactic_variant_count,
                    "wasted_credits": round(c.wasted_credits, 6),
                    "canonical_preview": c.canonical_form[:120],
                }
                for c in self.top_clusters_by_waste(5)
            ],
        }
|