sqf-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqf/__init__.py ADDED
@@ -0,0 +1,70 @@
1
+ """
2
+ sqf — Semantic Query Fingerprinting for Snowflake
3
+ ==================================================
4
+
5
+ A Python library that assigns a stable, content-addressed fingerprint to any
6
+ SQL query by normalizing away syntactic noise. Queries that are logically
7
+ identical but written differently collapse to the same fingerprint, enabling
8
+ deduplication analysis, cost attribution, and query-cache optimization.
9
+
10
+ Quick start::
11
+
12
+ from sqf import fingerprint, are_equivalent, SQFAnalyzer
13
+
14
+ # Single query fingerprinting
15
+ h = fingerprint("SELECT a, b FROM t WHERE id = 1")
16
+
17
+ # Equivalence check
18
+ are_equivalent(
19
+ "SELECT a AS col1, b AS col2 FROM t WHERE id = 1",
20
+ "SELECT b, a FROM t WHERE id = ?",
21
+ ) # → True
22
+
23
+ # Bulk workload analysis
24
+ analyzer = SQFAnalyzer()
25
+ analyzer.ingest_sql(my_query_list)
26
+ print(analyzer.report().summary())
27
+ """
28
+
29
# Re-export the package's public API from its submodules so callers can write
# ``from sqf import ...`` without knowing the internal module layout.
# NOTE(review): ``snowflake`` and ``benchmark`` are imported eagerly here; if
# either pulls in optional third-party dependencies, a plain ``import sqf``
# will require them too — confirm this is intended.
from .fingerprint import (
    fingerprint,
    canonical_form,
    are_equivalent,
    QueryRecord,
    SQFCluster,
)
from .normalizer import normalize
from .analyzer import SQFAnalyzer, SQFReport
from .generator import SyntheticWorkloadGenerator, FAMILIES, FAMILY_BY_ID
from .snowflake import SnowflakeIngestor, ClusterStore, load_sql, SQL_FILES
from .benchmark import (
    BenchmarkRun,
    BenchmarkSuite,
    run_single,
    run_benchmark_suite,
    make_charts,
)

# Package version string.
__version__ = "0.1.0"

# Names exported by ``from sqf import *``; every name imported above is listed.
__all__ = [
    # fingerprinting / normalization
    "fingerprint",
    "canonical_form",
    "are_equivalent",
    "normalize",
    "QueryRecord",
    "SQFCluster",
    # analysis
    "SQFAnalyzer",
    "SQFReport",
    # synthetic workload generation
    "SyntheticWorkloadGenerator",
    "FAMILIES",
    "FAMILY_BY_ID",
    # snowflake module
    "SnowflakeIngestor",
    "ClusterStore",
    "load_sql",
    "SQL_FILES",
    # benchmark module
    "BenchmarkRun",
    "BenchmarkSuite",
    "run_single",
    "run_benchmark_suite",
    "make_charts",
]
sqf/analyzer.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ sqf.analyzer
3
+ ------------
4
+ SQFAnalyzer: ingests a list of QueryRecord objects (or raw SQL strings),
5
+ groups them into SQFClusters, and produces deduplication + cost metrics.
6
+
7
+ This is the layer that connects to Snowflake's QUERY_HISTORY in production,
8
+ or to the synthetic generator in benchmarks.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections import defaultdict
14
+ from dataclasses import dataclass, field
15
+ from typing import Iterable
16
+
17
+ from .fingerprint import QueryRecord, SQFCluster, fingerprint, canonical_form
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Analyzer
22
+ # ---------------------------------------------------------------------------
23
+
24
class SQFAnalyzer:
    """
    Core analysis engine.

    Keeps one SQFCluster per distinct SQF hash. Records whose ``sqf_hash``
    is still ``None`` after fingerprinting are kept in a separate
    "unparseable" bucket instead of being clustered.

    Example::

        analyzer = SQFAnalyzer()
        analyzer.ingest(records)     # list[QueryRecord]
        report = analyzer.report()
        print(report.summary())
    """

    def __init__(self, dialect: str = "snowflake"):
        """
        Create an empty analyzer.

        Args:
            dialect: SQL dialect name forwarded to each record's
                ``compute_fingerprint`` call and recorded in the report.
        """
        self.dialect = dialect
        # sqf_hash -> cluster holding every record with that hash.
        self._clusters: dict[str, SQFCluster] = {}
        # Records for which no fingerprint was produced.
        self._unparseable: list[QueryRecord] = []

    # ------------------------------------------------------------------
    # Ingestion
    # ------------------------------------------------------------------

    def ingest(self, records: Iterable[QueryRecord]) -> "SQFAnalyzer":
        """
        Fingerprint each record and add it to the appropriate cluster.

        Each record is fingerprinted in place via
        ``rec.compute_fingerprint(dialect=self.dialect)``. A record whose
        ``sqf_hash`` is ``None`` afterwards goes to the unparseable bucket;
        otherwise it is appended to the cluster for its hash, which is
        created on first sight using the record's canonical form (or ``""``
        when that is missing).

        Args:
            records: Records to ingest; may be any iterable, including a
                generator.

        Returns:
            ``self``, so calls can be chained.
        """
        for rec in records:
            rec.compute_fingerprint(dialect=self.dialect)
            if rec.sqf_hash is None:
                self._unparseable.append(rec)
                continue
            if rec.sqf_hash not in self._clusters:
                self._clusters[rec.sqf_hash] = SQFCluster(
                    sqf_hash=rec.sqf_hash,
                    canonical_form=rec.canonical or "",
                )
            self._clusters[rec.sqf_hash].records.append(rec)
        return self

    def ingest_sql(
        self,
        queries: Iterable[str],
        credits_per_query: float = 0.01,
    ) -> "SQFAnalyzer":
        """
        Convenience method: ingest plain SQL strings with uniform cost.

        Each string is wrapped in a QueryRecord with a synthetic ``query_id``
        of the form ``q000123``.

        Args:
            queries: SQL strings to ingest.
            credits_per_query: Credits attributed to every generated record.

        Returns:
            ``self``, so calls can be chained.
        """
        # Continue numbering after everything already ingested; restarting at
        # zero would give records from repeated calls colliding query IDs.
        # On a fresh analyzer this still starts at q000000.
        first_id = self.total_queries
        records = [
            QueryRecord(
                query_id=f"q{i:06d}",
                sql=sql,
                credits_used=credits_per_query,
            )
            for i, sql in enumerate(queries, start=first_id)
        ]
        return self.ingest(records)

    # ------------------------------------------------------------------
    # Accessors
    # ------------------------------------------------------------------

    @property
    def clusters(self) -> list[SQFCluster]:
        """All clusters, as a new list (the cluster objects are shared)."""
        return list(self._clusters.values())

    @property
    def duplicate_clusters(self) -> list[SQFCluster]:
        """Clusters with more than one execution (i.e. actual duplicates)."""
        return [c for c in self.clusters if c.size > 1]

    @property
    def total_queries(self) -> int:
        """Number of ingested records, clustered and unparseable alike."""
        return sum(c.size for c in self.clusters) + len(self._unparseable)

    @property
    def total_credits(self) -> float:
        """
        Sum of ``total_credits`` over all clusters.

        Unparseable records are not part of any cluster, so their credits
        are not included here (unlike ``total_queries``, which counts them).
        """
        return sum(c.total_credits for c in self.clusters)

    # ------------------------------------------------------------------
    # Report
    # ------------------------------------------------------------------

    def report(self) -> "SQFReport":
        """Build an SQFReport from the current clusters and unparseable count."""
        return SQFReport(
            clusters=self.clusters,
            unparseable_count=len(self._unparseable),
            dialect=self.dialect,
        )
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Report
116
+ # ---------------------------------------------------------------------------
117
+
118
@dataclass
class SQFReport:
    """
    Analysis report over a list of clusters. Produced by SQFAnalyzer.report().

    Every metric is a property computed from ``clusters`` on each access;
    nothing is cached. The dataclass is not frozen, so treat a report as
    read-only by convention: the cluster objects it holds are shared with
    whoever built it.
    """
    clusters: list[SQFCluster]
    unparseable_count: int = 0
    dialect: str = "snowflake"

    # ------------------------------------------------------------------
    # Core metrics
    # ------------------------------------------------------------------

    @property
    def total_executions(self) -> int:
        """Sum of cluster sizes (``unparseable_count`` is not included)."""
        return sum(c.size for c in self.clusters)

    @property
    def unique_fingerprints(self) -> int:
        """Number of clusters, i.e. distinct SQF hashes."""
        return len(self.clusters)

    @property
    def duplicate_executions(self) -> int:
        """Total executions that are semantic duplicates of a prior run."""
        # Every execution in a cluster beyond the first counts as a duplicate.
        return sum(max(0, c.size - 1) for c in self.clusters)

    @property
    def dedup_hit_rate(self) -> float:
        """
        Fraction of executions that are semantic duplicates.
        This is the headline metric for the white paper.

        Returns 0.0 when there are no executions.
        """
        total = self.total_executions
        if total == 0:
            return 0.0
        return self.duplicate_executions / total

    @property
    def total_credits(self) -> float:
        """Sum of ``total_credits`` over all clusters."""
        return sum(c.total_credits for c in self.clusters)

    @property
    def wasted_credits(self) -> float:
        """Sum of ``wasted_credits`` over all clusters."""
        return sum(c.wasted_credits for c in self.clusters)

    @property
    def credit_waste_rate(self) -> float:
        """Wasted credits as a fraction of total credits (0.0 if total is 0)."""
        total = self.total_credits
        if total == 0:
            return 0.0
        return self.wasted_credits / total

    @property
    def multi_variant_clusters(self) -> list[SQFCluster]:
        """
        Clusters where the same logical query was written in multiple
        syntactically distinct ways — the most compelling white paper examples.
        """
        return [
            c for c in self.clusters
            if c.size > 1 and c.syntactic_variant_count > 1
        ]

    # ------------------------------------------------------------------
    # Top offenders
    # ------------------------------------------------------------------

    def top_clusters_by_waste(self, n: int = 10) -> list[SQFCluster]:
        """
        Return up to ``n`` clusters with non-zero waste, most wasteful first.

        Clusters with equal waste keep their original relative order.
        A non-positive ``n`` yields an empty list.
        """
        wasteful = [c for c in self.clusters if c.wasted_credits > 0]
        wasteful.sort(key=lambda c: c.wasted_credits, reverse=True)
        # Clamp: a negative n in a bare slice would mean "all but the last
        # |n|", which is never what a top-N request intends.
        return wasteful[:max(n, 0)]

    def top_clusters_by_size(self, n: int = 10) -> list[SQFCluster]:
        """
        Return up to ``n`` clusters, largest first.

        Clusters of equal size keep their original relative order.
        A non-positive ``n`` yields an empty list.
        """
        ranked = sorted(self.clusters, key=lambda c: c.size, reverse=True)
        return ranked[:max(n, 0)]

    # ------------------------------------------------------------------
    # Summary text
    # ------------------------------------------------------------------

    def summary(self) -> str:
        """Render the headline metrics as a multi-line, human-readable string."""
        lines = [
            "═" * 60,
            " Semantic Query Fingerprint (SQF) Analysis Report",
            "═" * 60,
            f" Total query executions : {self.total_executions:>8,}",
            f" Unique SQF fingerprints : {self.unique_fingerprints:>8,}",
            f" Duplicate executions : {self.duplicate_executions:>8,}",
            f" Unparseable queries : {self.unparseable_count:>8,}",
            "─" * 60,
            f" Dedup hit rate : {self.dedup_hit_rate:>8.1%}",
            f" Total credits consumed : {self.total_credits:>8.4f}",
            f" Credits wasted : {self.wasted_credits:>8.4f}",
            f" Credit waste rate : {self.credit_waste_rate:>8.1%}",
            "─" * 60,
            f" Multi-variant clusters : {len(self.multi_variant_clusters):>8,}",
            "═" * 60,
        ]
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """
        Serialize report metrics to a dict (for JSON export / charts).

        Rates are rounded to 4 decimal places and credit amounts to 6.
        ``top_clusters`` holds the five most wasteful clusters, each with a
        12-character hash prefix and the first 120 characters of its
        canonical form.
        """
        return {
            "total_executions": self.total_executions,
            "unique_fingerprints": self.unique_fingerprints,
            "duplicate_executions": self.duplicate_executions,
            "unparseable_count": self.unparseable_count,
            "dedup_hit_rate": round(self.dedup_hit_rate, 4),
            "total_credits": round(self.total_credits, 6),
            "wasted_credits": round(self.wasted_credits, 6),
            "credit_waste_rate": round(self.credit_waste_rate, 4),
            "multi_variant_clusters": len(self.multi_variant_clusters),
            "top_clusters": [
                {
                    "sqf_hash": c.sqf_hash[:12] + "...",
                    "size": c.size,
                    "syntactic_variants": c.syntactic_variant_count,
                    "wasted_credits": round(c.wasted_credits, 6),
                    "canonical_preview": c.canonical_form[:120],
                }
                for c in self.top_clusters_by_waste(5)
            ],
        }