SQaLe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqale-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 TRL Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sqale-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,36 @@
1
+ Metadata-Version: 2.4
2
+ Name: SQaLe
3
+ Version: 0.1.0
4
+ Summary: Deserialize the SQaLe dataset into populated SQLite databases.
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 TRL Lab
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Requires-Python: >=3.9
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: pandas
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: pyarrow
33
+ Requires-Dist: datasets
34
+ Dynamic: license-file
35
+
36
+ # SQaLe-Library
sqale-0.1.0/README.md ADDED
@@ -0,0 +1 @@
1
+ # SQaLe-Library
@@ -0,0 +1,36 @@
1
+ Metadata-Version: 2.4
2
+ Name: SQaLe
3
+ Version: 0.1.0
4
+ Summary: Deserialize the SQaLe dataset into populated SQLite databases.
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 TRL Lab
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Requires-Python: >=3.9
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: pandas
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: pyarrow
33
+ Requires-Dist: datasets
34
+ Dynamic: license-file
35
+
36
+ # SQaLe-Library
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ SQaLe.egg-info/PKG-INFO
5
+ SQaLe.egg-info/SOURCES.txt
6
+ SQaLe.egg-info/dependency_links.txt
7
+ SQaLe.egg-info/entry_points.txt
8
+ SQaLe.egg-info/requires.txt
9
+ SQaLe.egg-info/top_level.txt
10
+ sqale/__init__.py
11
+ sqale/deserialize.py
12
+ tests/test_cli.py
13
+ tests/test_import.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sqale-extract = sqale.deserialize:main
@@ -0,0 +1,4 @@
1
+ pandas
2
+ tqdm
3
+ pyarrow
4
+ datasets
@@ -0,0 +1 @@
1
+ sqale
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "SQaLe"
7
+ version = "0.1.0"
8
+ description = "Deserialize the SQaLe dataset into populated SQLite databases."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.9"
12
+ dependencies = [
13
+ "pandas",
14
+ "tqdm",
15
+ "pyarrow", # needed by pandas for parquet support
16
+ "datasets", # HuggingFace datasets (also handles .arrow files)
17
+ ]
18
+
19
+ [project.scripts]
20
+ sqale-extract = "sqale.deserialize:main"
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["."]
24
+ include = ["sqale*"]
sqale-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
"""Public API of the sqale package."""

from .deserialize import deserialize_sqale

# Explicitly declare the public surface of the package.
__all__ = ["deserialize_sqale"]
@@ -0,0 +1,269 @@
1
+ """
2
+ Deserialize the SQaLe dataset (cwolff/whatever_100k) into SQLite .db files.
3
+
4
+ Each unique schema in the dataset is materialized as a .db file populated
5
+ with the synthetic data stored in the 'Schema content' column.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import re
13
+ import sqlite3
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ import pandas as pd
18
+ from tqdm import tqdm
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Core deserialization
23
+ # ---------------------------------------------------------------------------
24
+
25
def deserialize_sqale(
    file_path: str,
    output_dir: str = "deserialized_dbs",
    limit: Optional[int] = None,
) -> list[dict]:
    """
    Load the SQaLe dataset, deduplicate by schema_id, and materialize each
    unique schema as a populated SQLite .db file.

    Parameters
    ----------
    file_path:
        Path to a local parquet/arrow file, a directory of such files, or a
        HuggingFace dataset repo ID (e.g. 'cwolff/whatever_100k').
    output_dir:
        Directory where the .db files will be written (created if missing).
    limit:
        Maximum number of unique schemas to process. None means process all.

    Returns
    -------
    list of dicts, each containing:
        schema_id      – original schema id from the dataset
        db_path        – absolute path to the created .db file
        tables         – list of table names found in the DDL
        rows_per_table – dict mapping table_name → number of rows inserted
        error          – None on success, error message string on failure
    """
    df = _load_dataset(file_path)

    # Keep only the first occurrence of each schema_id to avoid duplicate work.
    df = df.drop_duplicates(subset=["schema id"])
    if limit is not None:
        df = df.iloc[:limit]

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    results = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Schemas"):
        # pandas represents missing cells as float NaN, which is *truthy*,
        # so `value or default` does not catch it — normalize explicitly.
        schema_id = _cell_as_str(row.get("schema id"), default="unknown")
        full_schema = _cell_as_str(row.get("Full schema"), default="")
        schema_content_raw = row.get("Schema content")
        if not isinstance(schema_content_raw, (str, dict)) or not schema_content_raw:
            schema_content_raw = "{}"

        schema_content = _parse_schema_content(schema_content_raw)

        # File-system-safe name: anything outside [A-Za-z0-9_-] becomes '_'.
        safe_id = re.sub(r"[^\w\-]", "_", schema_id)
        db_path = out / f"{safe_id}.db"

        try:
            rows_per_table = _materialize_db(db_path, full_schema, schema_content)
            error = None
        except Exception as exc:
            rows_per_table = {}
            error = str(exc)

        results.append({
            "schema_id": schema_id,
            "db_path": str(db_path.resolve()),
            "tables": list(rows_per_table.keys()),
            "rows_per_table": rows_per_table,
            "error": error,
        })

    return results


def _cell_as_str(value, default: str) -> str:
    """Coerce a dataframe cell to str, mapping None/NaN/empty-string to *default*."""
    if value is None:
        return default
    if isinstance(value, float) and value != value:  # NaN check without pandas
        return default
    if isinstance(value, str):
        return value if value else default
    return str(value)
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Internal helpers
94
+ # ---------------------------------------------------------------------------
95
+
96
+ def _parse_schema_content(raw) -> dict[str, list[dict]]:
97
+ """Parse the Schema content field into a dict[table → list[row]]."""
98
+ if isinstance(raw, dict):
99
+ return raw
100
+ if isinstance(raw, str) and raw:
101
+ try:
102
+ parsed = json.loads(raw)
103
+ if isinstance(parsed, dict):
104
+ return parsed
105
+ except (json.JSONDecodeError, TypeError):
106
+ pass
107
+ return {}
108
+
109
+
110
def _load_dataset(file_path: str) -> pd.DataFrame:
    """Load the dataset from a local file/directory or a HuggingFace repo ID."""
    local = Path(file_path)

    if local.is_dir():
        # Concatenate every parquet/arrow file found in the directory.
        frames = [
            _read_single_file(f)
            for pattern in ("*.parquet", "*.arrow")
            for f in sorted(local.glob(pattern))
        ]
        if not frames:
            raise FileNotFoundError(f"No parquet/arrow files found in {local}")
        return pd.concat(frames, ignore_index=True)

    if local.exists():
        return _read_single_file(local)

    # Not a local path — treat it as a HuggingFace dataset repo ID.
    try:
        from datasets import load_dataset  # type: ignore
        ds = load_dataset(file_path, split="train")
        return ds.to_pandas()
    except Exception as exc:
        raise ValueError(
            f"Could not load dataset from '{file_path}': {exc}"
        ) from exc
133
+
134
+
135
+ def _read_single_file(path: Path) -> pd.DataFrame:
136
+ if path.suffix == ".parquet":
137
+ return pd.read_parquet(str(path))
138
+ if path.suffix == ".arrow":
139
+ from datasets import Dataset # type: ignore
140
+ return Dataset.from_file(str(path)).to_pandas()
141
+ raise ValueError(f"Unsupported file type: {path.suffix}")
142
+
143
+
144
+ def _split_ddl(ddl: str) -> list[str]:
145
+ """Split a DDL string into individual statements."""
146
+ return [s.strip() for s in ddl.split(";") if s.strip()]
147
+
148
+
149
def _materialize_db(
    db_path: Path,
    ddl: str,
    schema_content: dict[str, list[dict]],
) -> dict[str, int]:
    """
    Create a SQLite database at *db_path*, execute the DDL to build the
    schema, then insert all rows from *schema_content*.

    DDL statements and individual row inserts that SQLite rejects are
    silently skipped (best-effort materialization); the connection is
    always closed, even on failure.

    Returns a mapping of table_name → number of rows inserted.
    """
    # Start from a clean slate so re-runs don't accumulate stale rows.
    if db_path.exists():
        db_path.unlink()

    conn = sqlite3.connect(str(db_path))
    try:
        # FK checks off: table/row insertion order in schema_content is
        # arbitrary and may violate reference order.
        conn.execute("PRAGMA foreign_keys = OFF")
        conn.execute("PRAGMA journal_mode = WAL")

        for stmt in _split_ddl(ddl):
            try:
                conn.execute(stmt)
            except sqlite3.Error:
                pass  # Ignore unsupported syntax / duplicate table errors

        rows_per_table: dict[str, int] = {}
        for table, table_rows in schema_content.items():
            # Coerce non-list row collections into a list if possible.
            if not isinstance(table_rows, list):
                try:
                    table_rows = list(table_rows)
                except TypeError:
                    # Not iterable at all — record zero rows and move on.
                    rows_per_table[table] = 0
                    continue
            # Normalize each row to a plain dict (rows may arrive as
            # mapping-like objects from JSON/arrow decoding).
            table_rows = [
                dict(r) if not isinstance(r, dict) else r
                for r in table_rows
            ]
            if len(table_rows) == 0:
                rows_per_table[table] = 0
                continue

            # Column order is taken from the first row; later rows are
            # read by these same keys in _coerce_row.
            cols = list(table_rows[0].keys())
            col_list = ", ".join(f'"{c}"' for c in cols)
            placeholders = ", ".join("?" * len(cols))
            insert_sql = (
                f'INSERT OR IGNORE INTO "{table}" ({col_list}) VALUES ({placeholders})'
            )

            inserted = 0
            for row_dict in table_rows:
                values = _coerce_row(row_dict, cols)
                try:
                    conn.execute(insert_sql, values)
                    inserted += 1
                except sqlite3.Error:
                    pass  # Skip rows the schema rejects

            rows_per_table[table] = inserted

        conn.commit()
    finally:
        conn.close()

    return rows_per_table
213
+
214
+
215
+ def _coerce_row(row_dict: dict, cols: list[str]) -> list:
216
+ """Clamp numeric values to SQLite-safe ranges and return an ordered list."""
217
+ values = []
218
+ for c in cols:
219
+ val = row_dict.get(c)
220
+ if isinstance(val, int):
221
+ val = max(-9223372036854775808, min(9223372036854775807, val))
222
+ elif isinstance(val, float):
223
+ val = max(-1.7976931348623157e+308, min(1.7976931348623157e+308, val))
224
+ values.append(val)
225
+ return values
226
+
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # CLI entry-point
230
+ # ---------------------------------------------------------------------------
231
+
232
+ def _parse_args() -> argparse.Namespace:
233
+ p = argparse.ArgumentParser(
234
+ description="Deserialize the SQaLe dataset into SQLite .db files."
235
+ )
236
+ p.add_argument(
237
+ "--input",
238
+ required=True,
239
+ help="Local parquet/arrow file, directory, or HuggingFace repo ID.",
240
+ )
241
+ p.add_argument(
242
+ "--output",
243
+ default="deserialized_dbs",
244
+ help="Output directory for .db files (default: deserialized_dbs).",
245
+ )
246
+ p.add_argument(
247
+ "--limit",
248
+ type=int,
249
+ default=None,
250
+ help="Maximum number of unique schemas to process.",
251
+ )
252
+ return p.parse_args()
253
+
254
+
255
def main() -> None:
    """CLI entry point: parse args, run the extraction, print a summary."""
    args = _parse_args()
    results = deserialize_sqale(
        file_path=args.input,
        output_dir=args.output,
        limit=args.limit,
    )
    failures = [r for r in results if r["error"]]
    successes = len(results) - len(failures)
    total_rows = sum(sum(r["rows_per_table"].values()) for r in results)
    print(
        f"Done: {successes}/{len(results)} succeeded, {total_rows:,} rows total."
    )
    for r in failures:
        print(f" FAIL {r['schema_id']}: {r['error']}")


if __name__ == "__main__":
    # Without this guard, `python -m sqale.deserialize` (the documented
    # fallback in tests/test_cli.py) is a silent no-op; only the
    # `sqale-extract` console script would invoke main().
    main()
@@ -0,0 +1,78 @@
1
+ """
2
+ Tests for the sqale-extract CLI entry point.
3
+
4
+ Run with: pytest tests/test_cli.py -v
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ from pathlib import Path
10
+
11
+
12
+ def run_cli(*args: str) -> subprocess.CompletedProcess:
13
+ """Run sqale-extract via the installed console script."""
14
+ return subprocess.run(
15
+ ["sqale-extract", *args],
16
+ capture_output=True,
17
+ text=True,
18
+ )
19
+
20
+
21
+ def run_cli_module(*args: str) -> subprocess.CompletedProcess:
22
+ """Fallback: run sqale.deserialize as a module (works without install)."""
23
+ return subprocess.run(
24
+ [sys.executable, "-m", "sqale.deserialize", *args],
25
+ capture_output=True,
26
+ text=True,
27
+ )
28
+
29
+
30
+ def test_cli_missing_input():
31
+ result = run_cli()
32
+ assert result.returncode != 0
33
+ assert "required" in result.stderr.lower() or "error" in result.stderr.lower()
34
+
35
+
36
+ def test_cli_basic(sample_parquet, output_dir):
37
+ result = run_cli(
38
+ "--input", str(sample_parquet),
39
+ "--output", str(output_dir),
40
+ )
41
+ assert result.returncode == 0, f"CLI failed:\n{result.stderr}"
42
+ assert "Done:" in result.stdout
43
+ assert "2/2" in result.stdout
44
+
45
+
46
+ def test_cli_limit(sample_parquet, output_dir):
47
+ result = run_cli(
48
+ "--input", str(sample_parquet),
49
+ "--output", str(output_dir),
50
+ "--limit", "1",
51
+ )
52
+ assert result.returncode == 0, f"CLI failed:\n{result.stderr}"
53
+ assert "1/1" in result.stdout
54
+
55
+
56
+ def test_cli_creates_db_files(sample_parquet, output_dir):
57
+ run_cli(
58
+ "--input", str(sample_parquet),
59
+ "--output", str(output_dir),
60
+ )
61
+ db_files = list(output_dir.glob("*.db"))
62
+ assert len(db_files) == 2, f"Expected 2 .db files, found: {db_files}"
63
+
64
+
65
+ def test_cli_invalid_input(output_dir):
66
+ result = run_cli(
67
+ "--input", "/nonexistent/path/data.parquet",
68
+ "--output", str(output_dir),
69
+ )
70
+ assert result.returncode != 0
71
+
72
+
73
+ def test_cli_help():
74
+ result = run_cli("--help")
75
+ assert result.returncode == 0
76
+ assert "--input" in result.stdout
77
+ assert "--output" in result.stdout
78
+ assert "--limit" in result.stdout
@@ -0,0 +1,95 @@
1
+ """
2
+ Tests for the sqale Python API (import usage).
3
+
4
+ Run with: pytest tests/test_import.py -v
5
+ """
6
+
7
+ import sqlite3
8
+ from pathlib import Path
9
+
10
+ import pytest
11
+
12
+ from sqale import deserialize_sqale
13
+
14
+
15
+ def test_basic_deserialization(sample_parquet, output_dir):
16
+ results = deserialize_sqale(
17
+ file_path=str(sample_parquet),
18
+ output_dir=str(output_dir),
19
+ )
20
+
21
+ assert len(results) == 2, "Should produce one result per unique schema"
22
+ for r in results:
23
+ assert r["error"] is None, f"Unexpected error for {r['schema_id']}: {r['error']}"
24
+ assert Path(r["db_path"]).exists(), f".db file not created: {r['db_path']}"
25
+
26
+
27
+ def test_rows_inserted(sample_parquet, output_dir):
28
+ results = deserialize_sqale(
29
+ file_path=str(sample_parquet),
30
+ output_dir=str(output_dir),
31
+ )
32
+
33
+ schema_001 = next(r for r in results if r["schema_id"] == "schema_001")
34
+ assert schema_001["rows_per_table"].get("users") == 2
35
+ assert schema_001["rows_per_table"].get("orders") == 2
36
+
37
+
38
+ def test_db_is_queryable(sample_parquet, output_dir):
39
+ results = deserialize_sqale(
40
+ file_path=str(sample_parquet),
41
+ output_dir=str(output_dir),
42
+ )
43
+
44
+ schema_001 = next(r for r in results if r["schema_id"] == "schema_001")
45
+ conn = sqlite3.connect(schema_001["db_path"])
46
+ rows = conn.execute("SELECT name FROM users ORDER BY id").fetchall()
47
+ conn.close()
48
+
49
+ assert rows == [("Alice",), ("Bob",)]
50
+
51
+
52
+ def test_limit_parameter(sample_parquet, output_dir):
53
+ results = deserialize_sqale(
54
+ file_path=str(sample_parquet),
55
+ output_dir=str(output_dir),
56
+ limit=1,
57
+ )
58
+
59
+ assert len(results) == 1, "limit=1 should produce only one result"
60
+
61
+
62
+ def test_output_dir_created(tmp_path, sample_parquet):
63
+ new_dir = tmp_path / "brand_new_dir"
64
+ assert not new_dir.exists()
65
+
66
+ deserialize_sqale(file_path=str(sample_parquet), output_dir=str(new_dir))
67
+
68
+ assert new_dir.exists()
69
+
70
+
71
+ def test_deduplication(tmp_path, output_dir):
72
+ """Rows with the same schema_id should be deduplicated."""
73
+ import json
74
+ import pandas as pd
75
+
76
+ df = pd.DataFrame(
77
+ [
78
+ {
79
+ "schema id": "dup_schema",
80
+ "Full schema": "CREATE TABLE t (id INTEGER PRIMARY KEY)",
81
+ "Schema content": json.dumps({"t": [{"id": 1}]}),
82
+ },
83
+ {
84
+ "schema id": "dup_schema", # duplicate
85
+ "Full schema": "CREATE TABLE t (id INTEGER PRIMARY KEY)",
86
+ "Schema content": json.dumps({"t": [{"id": 2}]}),
87
+ },
88
+ ]
89
+ )
90
+ pq = tmp_path / "dup.parquet"
91
+ df.to_parquet(str(pq), index=False)
92
+
93
+ results = deserialize_sqale(file_path=str(pq), output_dir=str(output_dir))
94
+
95
+ assert len(results) == 1, "Duplicate schema_ids should be deduplicated"