SQaLe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqale-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 TRL Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sqale-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,36 @@
1
+ Metadata-Version: 2.4
2
+ Name: SQaLe
3
+ Version: 0.1.0
4
+ Summary: Deserialize the SQaLe dataset into populated SQLite databases.
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 TRL Lab
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Requires-Python: >=3.9
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: pandas
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: pyarrow
33
+ Requires-Dist: datasets
34
+ Dynamic: license-file
35
+
36
+ # SQaLe-Library
sqale-0.1.0/README.md ADDED
@@ -0,0 +1 @@
1
+ # SQaLe-Library
@@ -0,0 +1,36 @@
1
+ Metadata-Version: 2.4
2
+ Name: SQaLe
3
+ Version: 0.1.0
4
+ Summary: Deserialize the SQaLe dataset into populated SQLite databases.
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 TRL Lab
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Requires-Python: >=3.9
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: pandas
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: pyarrow
33
+ Requires-Dist: datasets
34
+ Dynamic: license-file
35
+
36
+ # SQaLe-Library
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ SQaLe.egg-info/PKG-INFO
5
+ SQaLe.egg-info/SOURCES.txt
6
+ SQaLe.egg-info/dependency_links.txt
7
+ SQaLe.egg-info/entry_points.txt
8
+ SQaLe.egg-info/requires.txt
9
+ SQaLe.egg-info/top_level.txt
10
+ sqale/__init__.py
11
+ sqale/deserialize.py
12
+ tests/test_cli.py
13
+ tests/test_import.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sqale-extract = sqale.deserialize:main
@@ -0,0 +1,4 @@
1
+ pandas
2
+ tqdm
3
+ pyarrow
4
+ datasets
@@ -0,0 +1 @@
1
+ sqale
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "SQaLe"
7
+ version = "0.1.0"
8
+ description = "Deserialize the SQaLe dataset into populated SQLite databases."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.9"
12
+ dependencies = [
13
+ "pandas",
14
+ "tqdm",
15
+ "pyarrow", # needed by pandas for parquet support
16
+ "datasets", # HuggingFace datasets (also handles .arrow files)
17
+ ]
18
+
19
+ [project.scripts]
20
+ sqale-extract = "sqale.deserialize:main"
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["."]
24
+ include = ["sqale*"]
sqale-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
"""Public API of the sqale package."""

from .deserialize import deserialize_sqale

# Explicitly declare the public surface of the package.
__all__ = ["deserialize_sqale"]
@@ -0,0 +1,269 @@
1
+ """
2
+ Deserialize the SQaLe dataset (cwolff/whatever_100k) into SQLite .db files.
3
+
4
+ Each unique schema in the dataset is materialized as a .db file populated
5
+ with the synthetic data stored in the 'Schema content' column.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import re
13
+ import sqlite3
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ import pandas as pd
18
+ from tqdm import tqdm
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Core deserialization
23
+ # ---------------------------------------------------------------------------
24
+
25
def deserialize_sqale(
    file_path: str,
    output_dir: str = "deserialized_dbs",
    limit: Optional[int] = None,
) -> list[dict]:
    """
    Load the SQaLe dataset, deduplicate by schema_id, and materialize each
    unique schema as a populated SQLite .db file.

    Parameters
    ----------
    file_path:
        Path to a local parquet/arrow file, a directory of such files, or a
        HuggingFace dataset repo ID (e.g. 'cwolff/whatever_100k').
    output_dir:
        Directory where the .db files will be written (created if missing).
    limit:
        Maximum number of unique schemas to process. None means process all.

    Returns
    -------
    list of dicts, each containing:
        schema_id      – original schema id from the dataset
        db_path        – absolute path to the created .db file
        tables         – list of table names found in the DDL
        rows_per_table – dict mapping table_name → number of rows inserted
        error          – None on success, error message string on failure
    """
    df = _load_dataset(file_path)

    # Keep only the first occurrence of each schema_id to avoid duplicate work.
    df = df.drop_duplicates(subset=["schema id"])
    if limit is not None:
        df = df.iloc[:limit]

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    results = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Schemas"):
        # pandas represents missing cells as float NaN, which is *truthy*,
        # so `value or default` does not catch it — normalize explicitly.
        schema_id = _cell_as_str(row.get("schema id"), default="unknown")
        full_schema = _cell_as_str(row.get("Full schema"), default="")
        schema_content_raw = row.get("Schema content")
        if not isinstance(schema_content_raw, (str, dict)) or not schema_content_raw:
            schema_content_raw = "{}"

        schema_content = _parse_schema_content(schema_content_raw)

        # File-system-safe name: anything outside [A-Za-z0-9_-] becomes '_'.
        safe_id = re.sub(r"[^\w\-]", "_", schema_id)
        db_path = out / f"{safe_id}.db"

        try:
            rows_per_table = _materialize_db(db_path, full_schema, schema_content)
            error = None
        except Exception as exc:
            rows_per_table = {}
            error = str(exc)

        results.append({
            "schema_id": schema_id,
            "db_path": str(db_path.resolve()),
            "tables": list(rows_per_table.keys()),
            "rows_per_table": rows_per_table,
            "error": error,
        })

    return results


def _cell_as_str(value, default: str) -> str:
    """Coerce a dataframe cell to str, mapping None/NaN/empty-string to *default*."""
    if value is None:
        return default
    if isinstance(value, float) and value != value:  # NaN check without pandas
        return default
    if isinstance(value, str):
        return value if value else default
    return str(value)
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Internal helpers
94
+ # ---------------------------------------------------------------------------
95
+
96
+ def _parse_schema_content(raw) -> dict[str, list[dict]]:
97
+ """Parse the Schema content field into a dict[table → list[row]]."""
98
+ if isinstance(raw, dict):
99
+ return raw
100
+ if isinstance(raw, str) and raw:
101
+ try:
102
+ parsed = json.loads(raw)
103
+ if isinstance(parsed, dict):
104
+ return parsed
105
+ except (json.JSONDecodeError, TypeError):
106
+ pass
107
+ return {}
108
+
109
+
110
def _load_dataset(file_path: str) -> pd.DataFrame:
    """Load the dataset from a local file/directory or a HuggingFace repo ID."""
    local = Path(file_path)

    if local.is_dir():
        # Concatenate every parquet/arrow file found in the directory.
        frames = [
            _read_single_file(f)
            for pattern in ("*.parquet", "*.arrow")
            for f in sorted(local.glob(pattern))
        ]
        if not frames:
            raise FileNotFoundError(f"No parquet/arrow files found in {local}")
        return pd.concat(frames, ignore_index=True)

    if local.exists():
        return _read_single_file(local)

    # Not a local path — treat it as a HuggingFace dataset repo ID.
    try:
        from datasets import load_dataset  # type: ignore
        ds = load_dataset(file_path, split="train")
        return ds.to_pandas()
    except Exception as exc:
        raise ValueError(
            f"Could not load dataset from '{file_path}': {exc}"
        ) from exc
133
+
134
+
135
+ def _read_single_file(path: Path) -> pd.DataFrame:
136
+ if path.suffix == ".parquet":
137
+ return pd.read_parquet(str(path))
138
+ if path.suffix == ".arrow":
139
+ from datasets import Dataset # type: ignore
140
+ return Dataset.from_file(str(path)).to_pandas()
141
+ raise ValueError(f"Unsupported file type: {path.suffix}")
142
+
143
+
144
+ def _split_ddl(ddl: str) -> list[str]:
145
+ """Split a DDL string into individual statements."""
146
+ return [s.strip() for s in ddl.split(";") if s.strip()]
147
+
148
+
149
def _materialize_db(
    db_path: Path,
    ddl: str,
    schema_content: dict[str, list[dict]],
) -> dict[str, int]:
    """
    Create a SQLite database at *db_path*, execute the DDL to build the
    schema, then insert all rows from *schema_content*.

    DDL statements and individual row inserts that SQLite rejects are
    silently skipped (best-effort materialization); the connection is
    always closed, even on failure.

    Returns a mapping of table_name → number of rows inserted.
    """
    # Start from a clean slate so re-runs don't accumulate stale rows.
    if db_path.exists():
        db_path.unlink()

    conn = sqlite3.connect(str(db_path))
    try:
        # FK checks off: table/row insertion order in schema_content is
        # arbitrary and may violate reference order.
        conn.execute("PRAGMA foreign_keys = OFF")
        conn.execute("PRAGMA journal_mode = WAL")

        for stmt in _split_ddl(ddl):
            try:
                conn.execute(stmt)
            except sqlite3.Error:
                pass  # Ignore unsupported syntax / duplicate table errors

        rows_per_table: dict[str, int] = {}
        for table, table_rows in schema_content.items():
            # Coerce non-list row collections into a list if possible.
            if not isinstance(table_rows, list):
                try:
                    table_rows = list(table_rows)
                except TypeError:
                    # Not iterable at all — record zero rows and move on.
                    rows_per_table[table] = 0
                    continue
            # Normalize each row to a plain dict (rows may arrive as
            # mapping-like objects from JSON/arrow decoding).
            table_rows = [
                dict(r) if not isinstance(r, dict) else r
                for r in table_rows
            ]
            if len(table_rows) == 0:
                rows_per_table[table] = 0
                continue

            # Column order is taken from the first row; later rows are
            # read by these same keys in _coerce_row.
            cols = list(table_rows[0].keys())
            col_list = ", ".join(f'"{c}"' for c in cols)
            placeholders = ", ".join("?" * len(cols))
            insert_sql = (
                f'INSERT OR IGNORE INTO "{table}" ({col_list}) VALUES ({placeholders})'
            )

            inserted = 0
            for row_dict in table_rows:
                values = _coerce_row(row_dict, cols)
                try:
                    conn.execute(insert_sql, values)
                    inserted += 1
                except sqlite3.Error:
                    pass  # Skip rows the schema rejects

            rows_per_table[table] = inserted

        conn.commit()
    finally:
        conn.close()

    return rows_per_table
213
+
214
+
215
+ def _coerce_row(row_dict: dict, cols: list[str]) -> list:
216
+ """Clamp numeric values to SQLite-safe ranges and return an ordered list."""
217
+ values = []
218
+ for c in cols:
219
+ val = row_dict.get(c)
220
+ if isinstance(val, int):
221
+ val = max(-9223372036854775808, min(9223372036854775807, val))
222
+ elif isinstance(val, float):
223
+ val = max(-1.7976931348623157e+308, min(1.7976931348623157e+308, val))
224
+ values.append(val)
225
+ return values
226
+
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # CLI entry-point
230
+ # ---------------------------------------------------------------------------
231
+
232
+ def _parse_args() -> argparse.Namespace:
233
+ p = argparse.ArgumentParser(
234
+ description="Deserialize the SQaLe dataset into SQLite .db files."
235
+ )
236
+ p.add_argument(
237
+ "--input",
238
+ required=True,
239
+ help="Local parquet/arrow file, directory, or HuggingFace repo ID.",
240
+ )
241
+ p.add_argument(
242
+ "--output",
243
+ default="deserialized_dbs",
244
+ help="Output directory for .db files (default: deserialized_dbs).",
245
+ )
246
+ p.add_argument(
247
+ "--limit",
248
+ type=int,
249
+ default=None,
250
+ help="Maximum number of unique schemas to process.",
251
+ )
252
+ return p.parse_args()
253
+
254
+
255
def main() -> None:
    """CLI entry point: parse args, run the extraction, print a summary."""
    args = _parse_args()
    results = deserialize_sqale(
        file_path=args.input,
        output_dir=args.output,
        limit=args.limit,
    )
    failures = [r for r in results if r["error"]]
    successes = len(results) - len(failures)
    total_rows = sum(sum(r["rows_per_table"].values()) for r in results)
    print(
        f"Done: {successes}/{len(results)} succeeded, {total_rows:,} rows total."
    )
    for r in failures:
        print(f" FAIL {r['schema_id']}: {r['error']}")


if __name__ == "__main__":
    # Without this guard, `python -m sqale.deserialize` (the documented
    # fallback in tests/test_cli.py) is a silent no-op; only the
    # `sqale-extract` console script would invoke main().
    main()
@@ -0,0 +1,78 @@
1
+ """
2
+ Tests for the sqale-extract CLI entry point.
3
+
4
+ Run with: pytest tests/test_cli.py -v
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ from pathlib import Path
10
+
11
+
12
+ def run_cli(*args: str) -> subprocess.CompletedProcess:
13
+ """Run sqale-extract via the installed console script."""
14
+ return subprocess.run(
15
+ ["sqale-extract", *args],
16
+ capture_output=True,
17
+ text=True,
18
+ )
19
+
20
+
21
+ def run_cli_module(*args: str) -> subprocess.CompletedProcess:
22
+ """Fallback: run sqale.deserialize as a module (works without install)."""
23
+ return subprocess.run(
24
+ [sys.executable, "-m", "sqale.deserialize", *args],
25
+ capture_output=True,
26
+ text=True,
27
+ )
28
+
29
+
30
+ def test_cli_missing_input():
31
+ result = run_cli()
32
+ assert result.returncode != 0
33
+ assert "required" in result.stderr.lower() or "error" in result.stderr.lower()
34
+
35
+
36
+ def test_cli_basic(sample_parquet, output_dir):
37
+ result = run_cli(
38
+ "--input", str(sample_parquet),
39
+ "--output", str(output_dir),
40
+ )
41
+ assert result.returncode == 0, f"CLI failed:\n{result.stderr}"
42
+ assert "Done:" in result.stdout
43
+ assert "2/2" in result.stdout
44
+
45
+
46
+ def test_cli_limit(sample_parquet, output_dir):
47
+ result = run_cli(
48
+ "--input", str(sample_parquet),
49
+ "--output", str(output_dir),
50
+ "--limit", "1",
51
+ )
52
+ assert result.returncode == 0, f"CLI failed:\n{result.stderr}"
53
+ assert "1/1" in result.stdout
54
+
55
+
56
+ def test_cli_creates_db_files(sample_parquet, output_dir):
57
+ run_cli(
58
+ "--input", str(sample_parquet),
59
+ "--output", str(output_dir),
60
+ )
61
+ db_files = list(output_dir.glob("*.db"))
62
+ assert len(db_files) == 2, f"Expected 2 .db files, found: {db_files}"
63
+
64
+
65
+ def test_cli_invalid_input(output_dir):
66
+ result = run_cli(
67
+ "--input", "/nonexistent/path/data.parquet",
68
+ "--output", str(output_dir),
69
+ )
70
+ assert result.returncode != 0
71
+
72
+
73
+ def test_cli_help():
74
+ result = run_cli("--help")
75
+ assert result.returncode == 0
76
+ assert "--input" in result.stdout
77
+ assert "--output" in result.stdout
78
+ assert "--limit" in result.stdout
@@ -0,0 +1,95 @@
1
+ """
2
+ Tests for the sqale Python API (import usage).
3
+
4
+ Run with: pytest tests/test_import.py -v
5
+ """
6
+
7
+ import sqlite3
8
+ from pathlib import Path
9
+
10
+ import pytest
11
+
12
+ from sqale import deserialize_sqale
13
+
14
+
15
+ def test_basic_deserialization(sample_parquet, output_dir):
16
+ results = deserialize_sqale(
17
+ file_path=str(sample_parquet),
18
+ output_dir=str(output_dir),
19
+ )
20
+
21
+ assert len(results) == 2, "Should produce one result per unique schema"
22
+ for r in results:
23
+ assert r["error"] is None, f"Unexpected error for {r['schema_id']}: {r['error']}"
24
+ assert Path(r["db_path"]).exists(), f".db file not created: {r['db_path']}"
25
+
26
+
27
+ def test_rows_inserted(sample_parquet, output_dir):
28
+ results = deserialize_sqale(
29
+ file_path=str(sample_parquet),
30
+ output_dir=str(output_dir),
31
+ )
32
+
33
+ schema_001 = next(r for r in results if r["schema_id"] == "schema_001")
34
+ assert schema_001["rows_per_table"].get("users") == 2
35
+ assert schema_001["rows_per_table"].get("orders") == 2
36
+
37
+
38
+ def test_db_is_queryable(sample_parquet, output_dir):
39
+ results = deserialize_sqale(
40
+ file_path=str(sample_parquet),
41
+ output_dir=str(output_dir),
42
+ )
43
+
44
+ schema_001 = next(r for r in results if r["schema_id"] == "schema_001")
45
+ conn = sqlite3.connect(schema_001["db_path"])
46
+ rows = conn.execute("SELECT name FROM users ORDER BY id").fetchall()
47
+ conn.close()
48
+
49
+ assert rows == [("Alice",), ("Bob",)]
50
+
51
+
52
+ def test_limit_parameter(sample_parquet, output_dir):
53
+ results = deserialize_sqale(
54
+ file_path=str(sample_parquet),
55
+ output_dir=str(output_dir),
56
+ limit=1,
57
+ )
58
+
59
+ assert len(results) == 1, "limit=1 should produce only one result"
60
+
61
+
62
+ def test_output_dir_created(tmp_path, sample_parquet):
63
+ new_dir = tmp_path / "brand_new_dir"
64
+ assert not new_dir.exists()
65
+
66
+ deserialize_sqale(file_path=str(sample_parquet), output_dir=str(new_dir))
67
+
68
+ assert new_dir.exists()
69
+
70
+
71
+ def test_deduplication(tmp_path, output_dir):
72
+ """Rows with the same schema_id should be deduplicated."""
73
+ import json
74
+ import pandas as pd
75
+
76
+ df = pd.DataFrame(
77
+ [
78
+ {
79
+ "schema id": "dup_schema",
80
+ "Full schema": "CREATE TABLE t (id INTEGER PRIMARY KEY)",
81
+ "Schema content": json.dumps({"t": [{"id": 1}]}),
82
+ },
83
+ {
84
+ "schema id": "dup_schema", # duplicate
85
+ "Full schema": "CREATE TABLE t (id INTEGER PRIMARY KEY)",
86
+ "Schema content": json.dumps({"t": [{"id": 2}]}),
87
+ },
88
+ ]
89
+ )
90
+ pq = tmp_path / "dup.parquet"
91
+ df.to_parquet(str(pq), index=False)
92
+
93
+ results = deserialize_sqale(file_path=str(pq), output_dir=str(output_dir))
94
+
95
+ assert len(results) == 1, "Duplicate schema_ids should be deduplicated"