sqf-py 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqf_py-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pragya Verma
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sqf_py-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,191 @@
1
+ Metadata-Version: 2.4
2
+ Name: sqf-py
3
+ Version: 0.1.0
4
+ Summary: Semantic Query Fingerprinting for Snowflake — collapse syntactically different but logically identical SQL queries to a canonical fingerprint
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/vermapragya/sqf-py
7
+ Project-URL: White Paper, https://github.com/vermapragya/sqf-py/blob/main/WHITEPAPER.md
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: sqlglot>=25.0.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=7.0; extra == "dev"
14
+ Requires-Dist: pytest-cov; extra == "dev"
15
+ Provides-Extra: snowflake
16
+ Requires-Dist: snowflake-connector-python>=3.0; extra == "snowflake"
17
+ Provides-Extra: bench
18
+ Requires-Dist: matplotlib>=3.7; extra == "bench"
19
+ Dynamic: license-file
20
+
21
+ # sqf-py — Semantic Query Fingerprinting for Snowflake
22
+
23
+ [![Tests](https://img.shields.io/badge/tests-68%20passed-brightgreen)]()
24
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue)]()
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)]()
26
+
27
+ **sqf-py** assigns a stable, content-addressed fingerprint to any SQL query by normalizing away syntactic noise. Queries that are *logically identical* but *written differently* collapse to the same fingerprint — enabling deduplication analysis, cost attribution, and query-cache optimization on Snowflake.
28
+
29
+ Accompanies the white paper: [**Semantic Query Deduplication in Cloud Data Warehouses**](WHITEPAPER.md).
30
+
31
+ ---
32
+
33
+ ## The Problem
34
+
35
+ Modern data warehouses are bombarded with semantically identical queries that look different:
36
+
37
+ ```sql
38
+ -- BI tool A (Looker)
39
+ SELECT o.user_id AS uid, SUM(o.amount) AS revenue
40
+ FROM orders AS o WHERE o.status = 'complete' AND o.created_at > '2024-01-01'
41
+ GROUP BY 1
42
+
43
+ -- BI tool B (Tableau)
44
+ SELECT SUM(amount) AS revenue, user_id AS uid
45
+ FROM orders WHERE created_at > '2023-06-01' AND status = 'active'
46
+ GROUP BY uid
47
+ ```
48
+
49
+ These are logically the same query template. Snowflake's text-keyed result cache treats them as distinct — burning compute on every re-execution. sqf-py proves they're duplicates: both fingerprint to `3c1a8c600789df69…`.
50
+
51
+ On a synthetic 10,000-query BI-style workload, sqf-py identifies **99.7% of executions as semantic duplicates**, at **~440 queries/second** analyzed client-side. See [the white paper](WHITEPAPER.md) for methodology and caveats.
52
+
53
+ ---
54
+
55
+ ## Installation
56
+
57
+ ```bash
58
+ pip install sqf-py # core library (sqlglot only)
59
+ pip install "sqf-py[snowflake]" # + Snowflake connector
60
+ pip install "sqf-py[bench]" # + matplotlib for benchmark charts
61
+ pip install "sqf-py[dev]" # + pytest
62
+ ```
63
+
64
+ ---
65
+
66
+ ## Quick Start
67
+
68
+ ```python
69
+ from sqf import fingerprint, are_equivalent, canonical_form, SQFAnalyzer
70
+
71
+ # Single fingerprint
72
+ fp = fingerprint("SELECT a, b FROM t WHERE id = 1")
73
+ # → "3f4a1b9c..." (64-char hex, stable)
74
+
75
+ # Equivalence check — these two queries are semantically identical
76
+ q1 = "SELECT a AS col1, b AS col2 FROM t WHERE id = 99"
77
+ q2 = "SELECT b, a FROM t WHERE id = 1"
78
+ are_equivalent(q1, q2) # → True
79
+
80
+ # See the canonical form
81
+ canonical_form("SELECT a AS x, b AS y FROM t WHERE id = 42")
82
+ # → "SELECT A, B FROM T WHERE ID = ?"
83
+
84
+ # Bulk workload analysis
85
+ analyzer = SQFAnalyzer()
86
+ analyzer.ingest_sql(my_query_list, credits_per_query=0.05)
87
+ print(analyzer.report().summary())
88
+ ```
89
+
90
+ ---
91
+
92
+ ## Normalization Pipeline
93
+
94
+ The SQF algorithm applies these passes in order:
95
+
96
+ | Pass | What it does | Example |
97
+ |------|-------------|---------|
98
+ | 1. GROUP BY reference resolution | `GROUP BY 1` / `GROUP BY alias` → actual expression | `GROUP BY user_id` |
99
+ | 2. Alias stripping | Remove all `AS` aliases and table qualifiers | `SELECT o.a AS x` → `SELECT a` |
100
+ | 3. Column sort | Sort SELECT list alphabetically | `SELECT b, a` → `SELECT a, b` |
101
+ | 4. GROUP BY sort | Sort GROUP BY keys | `GROUP BY b, a` → `GROUP BY a, b` |
102
+ | 5. Predicate canonicalization | Sort AND/OR operands recursively | `WHERE b=2 AND a=1` → `WHERE a=1 AND b=2` |
103
+ | 6. CTE inlining | Inline single-reference CTEs | `WITH x AS (...) SELECT ... FROM x` → subquery |
104
+ | 7. Literal abstraction | Replace all values with `?` | `WHERE id = 42` → `WHERE id = ?` |
105
+ | 8. Whitespace collapse + uppercase | Canonical string form | |
106
+ | **Hash** | SHA-256 of canonical string | 64-char hex fingerprint |
107
+
108
+ The precise equivalence class (and its deliberate trade-offs) is defined in [§2 of the white paper](WHITEPAPER.md).
109
+
110
+ ---
111
+
112
+ ## Analyzing a Snowflake Workload
113
+
114
+ ```python
115
+ from sqf import SnowflakeIngestor, ClusterStore, SQFAnalyzer
116
+ import snowflake.connector
117
+
118
+ conn = snowflake.connector.connect(...) # your credentials
119
+
120
+ # 1. Pull the last 30 days of QUERY_HISTORY
121
+ records = SnowflakeIngestor(conn, lookback_days=30, row_limit=50_000).fetch_records()
122
+
123
+ # 2. Fingerprint + cluster
124
+ report = SQFAnalyzer().ingest(records).report()
125
+ print(report.summary())
126
+ # ═══════════════════════════════════════════════════════════
127
+ # Semantic Query Fingerprint (SQF) Analysis Report
128
+ # ═══════════════════════════════════════════════════════════
129
+ # Total query executions : 12,847
130
+ # Unique SQF fingerprints : 4,203
131
+ # Dedup hit rate : 67.3%
132
+ # Credits wasted : 86.4800
133
+ # ...
134
+
135
+ # 3. Persist results back to Snowflake (idempotent MERGEs)
136
+ store = ClusterStore(conn, database="SQF", schema="ANALYTICS")
137
+ store.bootstrap() # creates tables + 6 analytical views
138
+ store.persist(report)
139
+
140
+ # 4. Query the views
141
+ store.overall_metrics() # headline KPIs
142
+ store.daily_hit_rate() # time series for charts
143
+ store.top_waste(10) # the 10 most expensive duplicate clusters
144
+ store.multi_variant_offenders(10) # same logic, many SQL spellings
145
+ ```
146
+
147
+ The bundled SQL (DDL, views, `QUERY_HISTORY` export) lives in [`sqf/sql/`](sqf/sql/) and is also usable standalone.
148
+
149
+ ---
150
+
151
+ ## Synthetic Workloads & Benchmarks
152
+
153
+ No Snowflake account needed to try the library:
154
+
155
+ ```python
156
+ from sqf import SyntheticWorkloadGenerator, SQFAnalyzer
157
+
158
+ gen = SyntheticWorkloadGenerator(n_queries=1000, duplication_rate=0.7, seed=42)
159
+ report = SQFAnalyzer().ingest(gen.generate()).report()
160
+ print(report.summary()) # → 96.9% dedup hit rate
161
+ ```
162
+
163
+ The generator models 12 logical query families (BI aggregates, joins, window functions, funnels, MRR rollups, …) with 8 syntactic variant dimensions each, plus realistic per-family credit cost distributions.
164
+
165
+ Reproduce the white paper's full benchmark grid (36 configurations, ~5 min):
166
+
167
+ ```bash
168
+ python -m sqf.benchmark --out benchmarks --full
169
+ ```
170
+
171
+ Outputs `benchmarks/results.json` plus five charts:
172
+
173
+ ![Hit rate vs duplication rate](benchmarks/charts/01_hit_rate_vs_dup_rate.png)
174
+
175
+ ---
176
+
177
+ ## Development
178
+
179
+ ```bash
180
+ git clone https://github.com/vermapragya/sqf-py
181
+ cd sqf-py
182
+ python3 -m venv .venv
183
+ .venv/bin/pip install -e ".[dev,bench]"
184
+ .venv/bin/python -m pytest # 68 tests
185
+ ```
186
+
187
+ ---
188
+
189
+ ## License
190
+
191
+ [MIT](LICENSE)
sqf_py-0.1.0/README.md ADDED
@@ -0,0 +1,171 @@
1
+ # sqf-py — Semantic Query Fingerprinting for Snowflake
2
+
3
+ ![Tests](https://img.shields.io/badge/tests-68%20passed-brightgreen)
4
+ ![Python](https://img.shields.io/badge/python-3.9%2B-blue)
5
+ ![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)
6
+
7
+ **sqf-py** assigns a stable, content-addressed fingerprint to any SQL query by normalizing away syntactic noise. Queries that are *logically identical* but *written differently* collapse to the same fingerprint — enabling deduplication analysis, cost attribution, and query-cache optimization on Snowflake.
8
+
9
+ Accompanies the white paper: [**Semantic Query Deduplication in Cloud Data Warehouses**](WHITEPAPER.md).
10
+
11
+ ---
12
+
13
+ ## The Problem
14
+
15
+ Modern data warehouses are bombarded with semantically identical queries that look different:
16
+
17
+ ```sql
18
+ -- BI tool A (Looker)
19
+ SELECT o.user_id AS uid, SUM(o.amount) AS revenue
20
+ FROM orders AS o WHERE o.status = 'complete' AND o.created_at > '2024-01-01'
21
+ GROUP BY 1
22
+
23
+ -- BI tool B (Tableau)
24
+ SELECT SUM(amount) AS revenue, user_id AS uid
25
+ FROM orders WHERE created_at > '2023-06-01' AND status = 'active'
26
+ GROUP BY uid
27
+ ```
28
+
29
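+ These are logically the same query template: they differ only in table alias, SELECT-list order, predicate order, `GROUP BY` reference style (`1` vs. `uid`), and literal values. Snowflake's text-keyed result cache treats them as distinct — burning compute on every re-execution. sqf-py shows they're semantic duplicates: both fingerprint to `3c1a8c600789df69…`.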
30
+
31
+ On a synthetic 10,000-query BI-style workload, sqf-py identifies **99.7% of executions as semantic duplicates**, at **~440 queries/second** analyzed client-side. See [the white paper](WHITEPAPER.md) for methodology and caveats.
32
+
33
+ ---
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install sqf-py # core library (sqlglot only)
39
+ pip install "sqf-py[snowflake]" # + Snowflake connector
40
+ pip install "sqf-py[bench]" # + matplotlib for benchmark charts
41
+ pip install "sqf-py[dev]" # + pytest
42
+ ```
43
+
44
+ ---
45
+
46
+ ## Quick Start
47
+
48
+ ```python
49
+ from sqf import fingerprint, are_equivalent, canonical_form, SQFAnalyzer
50
+
51
+ # Single fingerprint
52
+ fp = fingerprint("SELECT a, b FROM t WHERE id = 1")
53
+ # → "3f4a1b9c..." (64-char hex, stable)
54
+
55
+ # Equivalence check — these two queries are semantically identical
56
+ q1 = "SELECT a AS col1, b AS col2 FROM t WHERE id = 99"
57
+ q2 = "SELECT b, a FROM t WHERE id = 1"
58
+ are_equivalent(q1, q2) # → True
59
+
60
+ # See the canonical form
61
+ canonical_form("SELECT a AS x, b AS y FROM t WHERE id = 42")
62
+ # → "SELECT A, B FROM T WHERE ID = ?"
63
+
64
+ # Bulk workload analysis
65
+ analyzer = SQFAnalyzer()
66
+ analyzer.ingest_sql(my_query_list, credits_per_query=0.05)
67
+ print(analyzer.report().summary())
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Normalization Pipeline
73
+
74
+ The SQF algorithm applies these passes in order:
75
+
76
+ | Pass | What it does | Example |
77
+ |------|-------------|---------|
78
+ | 1. GROUP BY reference resolution | `GROUP BY 1` / `GROUP BY alias` → actual expression | `GROUP BY 1` → `GROUP BY user_id` |
79
+ | 2. Alias stripping | Remove all `AS` aliases and table qualifiers | `SELECT o.a AS x` → `SELECT a` |
80
+ | 3. Column sort | Sort SELECT list alphabetically | `SELECT b, a` → `SELECT a, b` |
81
+ | 4. GROUP BY sort | Sort GROUP BY keys | `GROUP BY b, a` → `GROUP BY a, b` |
82
+ | 5. Predicate canonicalization | Sort AND/OR operands recursively | `WHERE b=2 AND a=1` → `WHERE a=1 AND b=2` |
83
+ | 6. CTE inlining | Inline single-reference CTEs | `WITH x AS (...) SELECT ... FROM x` → subquery |
84
+ | 7. Literal abstraction | Replace all literal values with `?` | `WHERE id = 42` → `WHERE id = ?` |
85
+ | 8. Whitespace collapse + uppercase | Canonical string form | |
86
+ | **Hash** | SHA-256 of canonical string | 64-char hex fingerprint |
87
+
88
+ The precise equivalence class (and its deliberate trade-offs) is defined in [§2 of the white paper](WHITEPAPER.md).
89
+
90
+ ---
91
+
92
+ ## Analyzing a Snowflake Workload
93
+
94
+ ```python
95
+ from sqf import SnowflakeIngestor, ClusterStore, SQFAnalyzer
96
+ import snowflake.connector
97
+
98
+ conn = snowflake.connector.connect(...) # your credentials
99
+
100
+ # 1. Pull the last 30 days of QUERY_HISTORY
101
+ records = SnowflakeIngestor(conn, lookback_days=30, row_limit=50_000).fetch_records()
102
+
103
+ # 2. Fingerprint + cluster
104
+ report = SQFAnalyzer().ingest(records).report()
105
+ print(report.summary())
106
+ # ═══════════════════════════════════════════════════════════
107
+ # Semantic Query Fingerprint (SQF) Analysis Report
108
+ # ═══════════════════════════════════════════════════════════
109
+ # Total query executions : 12,847
110
+ # Unique SQF fingerprints : 4,203
111
+ # Dedup hit rate : 67.3%
112
+ # Credits wasted : 86.4800
113
+ # ...
114
+
115
+ # 3. Persist results back to Snowflake (idempotent MERGEs)
116
+ store = ClusterStore(conn, database="SQF", schema="ANALYTICS")
117
+ store.bootstrap() # creates tables + 6 analytical views
118
+ store.persist(report)
119
+
120
+ # 4. Query the views
121
+ store.overall_metrics() # headline KPIs
122
+ store.daily_hit_rate() # time series for charts
123
+ store.top_waste(10) # the 10 most expensive duplicate clusters
124
+ store.multi_variant_offenders(10) # same logic, many SQL spellings
125
+ ```
126
+
127
+ The bundled SQL (DDL, views, `QUERY_HISTORY` export) lives in [`sqf/sql/`](sqf/sql/) and is also usable standalone.
128
+
129
+ ---
130
+
131
+ ## Synthetic Workloads & Benchmarks
132
+
133
+ No Snowflake account needed to try the library:
134
+
135
+ ```python
136
+ from sqf import SyntheticWorkloadGenerator, SQFAnalyzer
137
+
138
+ gen = SyntheticWorkloadGenerator(n_queries=1000, duplication_rate=0.7, seed=42)
139
+ report = SQFAnalyzer().ingest(gen.generate()).report()
140
+ print(report.summary()) # → 96.9% dedup hit rate
141
+ ```
142
+
143
+ The generator models 12 logical query families (BI aggregates, joins, window functions, funnels, MRR rollups, …) with 8 syntactic variant dimensions each, plus realistic per-family credit cost distributions.
144
+
145
+ Reproduce the white paper's full benchmark grid (36 configurations, ~5 min):
146
+
147
+ ```bash
148
+ python -m sqf.benchmark --out benchmarks --full
149
+ ```
150
+
151
+ Outputs `benchmarks/results.json` plus five charts under `benchmarks/charts/`, for example:
152
+
153
+ ![Hit rate vs duplication rate](benchmarks/charts/01_hit_rate_vs_dup_rate.png)
154
+
155
+ ---
156
+
157
+ ## Development
158
+
159
+ ```bash
160
+ git clone https://github.com/vermapragya/sqf-py
161
+ cd sqf-py
162
+ python3 -m venv .venv
163
+ .venv/bin/pip install -e ".[dev,bench]"
164
+ .venv/bin/python -m pytest # 68 tests
165
+ ```
166
+
167
+ ---
168
+
169
+ ## License
170
+
171
+ [MIT](LICENSE)
sqf_py-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sqf-py"
7
+ version = "0.1.0"
8
+ description = "Semantic Query Fingerprinting for Snowflake — collapse syntactically different but logically identical SQL queries to a canonical fingerprint"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.9"
12
+ dependencies = [
13
+ "sqlglot>=25.0.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ dev = ["pytest>=7.0", "pytest-cov"]
18
+ snowflake = ["snowflake-connector-python>=3.0"]
19
+ bench = ["matplotlib>=3.7"]
20
+
21
+ [project.urls]
22
+ Homepage = "https://github.com/vermapragya/sqf-py"
23
+ "White Paper" = "https://github.com/vermapragya/sqf-py/blob/main/WHITEPAPER.md"
24
+
25
+ [tool.setuptools.packages.find]
26
+ include = ["sqf*"]
27
+
28
+ [tool.setuptools.package-data]
29
+ sqf = ["sql/*.sql"]
30
+
31
+ [tool.pytest.ini_options]
32
+ testpaths = ["."]
sqf_py-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
sqf_py-0.1.0/sqf/__init__.py ADDED
@@ -0,0 +1,70 @@
1
+ """
2
+ sqf — Semantic Query Fingerprinting for Snowflake
3
+ ==================================================
4
+
5
+ A Python library that assigns a stable, content-addressed fingerprint to any
6
+ SQL query by normalizing away syntactic noise. Queries that are logically
7
+ identical but written differently collapse to the same fingerprint, enabling
8
+ deduplication analysis, cost attribution, and query-cache optimization.
9
+
10
+ Quick start::
11
+
12
+ from sqf import fingerprint, are_equivalent, SQFAnalyzer
13
+
14
+ # Single query fingerprinting
15
+ h = fingerprint("SELECT a, b FROM t WHERE id = 1")
16
+
17
+ # Equivalence check
18
+ are_equivalent(
19
+ "SELECT a AS col1, b AS col2 FROM t WHERE id = 1",
20
+ "SELECT b, a FROM t WHERE id = ?",
21
+ ) # → True
22
+
23
+ # Bulk workload analysis
24
+ analyzer = SQFAnalyzer()
25
+ analyzer.ingest_sql(my_query_list)
26
+ print(analyzer.report().summary())
27
+ """
28
+
29
+ from .fingerprint import (
30
+ fingerprint,
31
+ canonical_form,
32
+ are_equivalent,
33
+ QueryRecord,
34
+ SQFCluster,
35
+ )
36
+ from .normalizer import normalize
37
+ from .analyzer import SQFAnalyzer, SQFReport
38
+ from .generator import SyntheticWorkloadGenerator, FAMILIES, FAMILY_BY_ID
39
+ from .snowflake import SnowflakeIngestor, ClusterStore, load_sql, SQL_FILES
40
+ from .benchmark import (
41
+ BenchmarkRun,
42
+ BenchmarkSuite,
43
+ run_single,
44
+ run_benchmark_suite,
45
+ make_charts,
46
+ )
47
+
48
+ __version__ = "0.1.0"
49
+ __all__ = [
50
+ "fingerprint",
51
+ "canonical_form",
52
+ "are_equivalent",
53
+ "normalize",
54
+ "QueryRecord",
55
+ "SQFCluster",
56
+ "SQFAnalyzer",
57
+ "SQFReport",
58
+ "SyntheticWorkloadGenerator",
59
+ "FAMILIES",
60
+ "FAMILY_BY_ID",
61
+ "SnowflakeIngestor",
62
+ "ClusterStore",
63
+ "load_sql",
64
+ "SQL_FILES",
65
+ "BenchmarkRun",
66
+ "BenchmarkSuite",
67
+ "run_single",
68
+ "run_benchmark_suite",
69
+ "make_charts",
70
+ ]