PyPI - scduck - Versions diffs - 0.1.0__tar.gz - Mend

scduck 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

scduck-0.1.0/.github/workflows/publish.yml +46 -0
scduck-0.1.0/.github/workflows/test.yml +29 -0
scduck-0.1.0/.gitignore +30 -0
scduck-0.1.0/.python-version +1 -0
scduck-0.1.0/PKG-INFO +115 -0
scduck-0.1.0/README.md +82 -0
scduck-0.1.0/SYNC_LOGIC.md +114 -0
scduck-0.1.0/pyproject.toml +46 -0
scduck-0.1.0/scduck/__init__.py +10 -0
scduck-0.1.0/scduck/table.py +419 -0
scduck-0.1.0/tests/__init__.py +1 -0
scduck-0.1.0/tests/conftest.py +56 -0
scduck-0.1.0/tests/test_basic.py +176 -0
scduck-0.1.0/tests/test_input_types.py +193 -0
scduck-0.1.0/tests/test_sync_cases.py +341 -0
scduck-0.1.0/uv.lock +562 -0

scduck-0.1.0/.github/workflows/publish.yml ADDED Viewed

@@ -0,0 +1,46 @@
+name: Publish to PyPI
+on:
+  push:
+    tags:
+      - "v*"
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+      - name: Set up Python
+        run: uv python install 3.12
+      - name: Install dependencies
+        run: uv sync --extra dev
+      - name: Run tests
+        run: uv run pytest -v
+  publish:
+    needs: test
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      id-token: write  # Required for trusted publishing
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+      - name: Set up Python
+        run: uv python install 3.12
+      - name: Build package
+        run: uv build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1

scduck-0.1.0/.github/workflows/test.yml ADDED Viewed

@@ -0,0 +1,29 @@
+name: Test
+on:
+  push:
+    branches: [master, main]
+  pull_request:
+    branches: [master, main]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: uv sync --extra dev
+      - name: Run tests
+        run: uv run pytest -v

scduck-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,30 @@
+# Python
+__pycache__/
+*.py[cod]
+*.so
+build/
+dist/
+*.egg-info/
+.eggs/
+# Virtual environments
+.venv/
+venv/
+.env
+# DuckDB
+*.duckdb
+*.duckdb.wal
+# IDE
+.idea/
+.vscode/
+*.swp
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+# OS
+.DS_Store

scduck-0.1.0/.python-version ADDED Viewed

@@ -0,0 +1 @@
+3.12

scduck-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,115 @@
+Metadata-Version: 2.4
+Name: scduck
+Version: 0.1.0
+Summary: SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data.
+Project-URL: Homepage, https://github.com/wolferesearch/scduck
+Project-URL: Repository, https://github.com/wolferesearch/scduck
+Author-email: Larry Chen <lchen@wolferesearch.com>
+License: MIT
+Keywords: data-warehouse,duckdb,history,scd,slowly-changing-dimension,temporal
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Database
+Requires-Python: >=3.10
+Requires-Dist: duckdb>=0.9.0
+Requires-Dist: pyarrow>=14.0.0
+Provides-Extra: all
+Requires-Dist: pandas>=2.0.0; extra == 'all'
+Requires-Dist: polars>=0.19.0; extra == 'all'
+Provides-Extra: dev
+Requires-Dist: pandas>=2.0.0; extra == 'dev'
+Requires-Dist: polars>=0.19.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
+Provides-Extra: pandas
+Requires-Dist: pandas>=2.0.0; extra == 'pandas'
+Provides-Extra: polars
+Requires-Dist: polars>=0.19.0; extra == 'polars'
+Description-Content-Type: text/markdown
+# scduck
+SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data without storing redundant snapshots.
+**13 days of data: 65 MB CSV -> 6.3 MB DuckDB (~10x compression)**
+## How it works
+Records are stored with `valid_from` / `valid_to` date ranges. When data doesn't change, no new rows are written. Only changes generate new records.
+```
+id   | name  | price | valid_from | valid_to
+P001 | Widget| 9.99  | 2025-01-01 | 2025-03-15  # original price
+P001 | Widget| 12.99 | 2025-03-15 | NULL        # price changed
+P002 | Gadget| 4.99  | 2025-01-01 | NULL        # unchanged
+```
+- `valid_from`: inclusive (>=)
+- `valid_to`: exclusive (<), NULL = current
+## Usage
+```python
+from scduck import SCDTable
+# Define your schema
+with SCDTable(
+    "products.duckdb",
+    table="products",
+    keys=["product_id"],
+    values=["name", "price", "category"]
+) as db:
+    # Sync daily snapshots (pandas, polars, or pyarrow)
+    result = db.sync("2025-01-01", df_jan1)  # returns SyncResult
+    db.sync("2025-01-02", df_jan2)
+    # Reconstruct any historical snapshot
+    snapshot = db.get_data("2025-01-01")  # returns pyarrow Table
+    # Check synced dates
+    db.get_synced_dates()  # ['2025-01-01', '2025-01-02']
+```
+### Out-of-order sync
+Dates can be synced in any order:
+```python
+db.sync("2025-01-15", df)  # sync Jan 15 first
+db.sync("2025-01-01", df)  # backfill Jan 1
+db.get_data("2025-01-01")  # returns correct snapshot
+```
+## Example: SecurityMaster
+```python
+import pandas as pd
+from scduck import SCDTable
+with SCDTable(
+    "security_master.duckdb",
+    table="securities",
+    keys=["security_id"],
+    values=["ticker", "mic", "isin", "description",
+            "sub_industry", "country", "currency", "country_risk"]
+) as db:
+    df = pd.read_csv("SecurityMaster_20251201.csv")
+    db.sync("2025-12-01", df)
+```
+## Installation
+```bash
+pip install scduck
+# With pandas/polars support (quoted so shells such as zsh do not glob the brackets)
+pip install "scduck[all]"
+```
+## Sync Logic
+See [SYNC_LOGIC.md](SYNC_LOGIC.md) for detailed operation cases.

scduck-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,82 @@
+# scduck
+SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data without storing redundant snapshots.
+**13 days of data: 65 MB CSV -> 6.3 MB DuckDB (~10x compression)**
+## How it works
+Records are stored with `valid_from` / `valid_to` date ranges. When data doesn't change, no new rows are written. Only changes generate new records.
+```
+id   | name  | price | valid_from | valid_to
+P001 | Widget| 9.99  | 2025-01-01 | 2025-03-15  # original price
+P001 | Widget| 12.99 | 2025-03-15 | NULL        # price changed
+P002 | Gadget| 4.99  | 2025-01-01 | NULL        # unchanged
+```
+- `valid_from`: inclusive (>=)
+- `valid_to`: exclusive (<), NULL = current
+## Usage
+```python
+from scduck import SCDTable
+# Define your schema
+with SCDTable(
+    "products.duckdb",
+    table="products",
+    keys=["product_id"],
+    values=["name", "price", "category"]
+) as db:
+    # Sync daily snapshots (pandas, polars, or pyarrow)
+    result = db.sync("2025-01-01", df_jan1)  # returns SyncResult
+    db.sync("2025-01-02", df_jan2)
+    # Reconstruct any historical snapshot
+    snapshot = db.get_data("2025-01-01")  # returns pyarrow Table
+    # Check synced dates
+    db.get_synced_dates()  # ['2025-01-01', '2025-01-02']
+```
+### Out-of-order sync
+Dates can be synced in any order:
+```python
+db.sync("2025-01-15", df)  # sync Jan 15 first
+db.sync("2025-01-01", df)  # backfill Jan 1
+db.get_data("2025-01-01")  # returns correct snapshot
+```
+## Example: SecurityMaster
+```python
+import pandas as pd
+from scduck import SCDTable
+with SCDTable(
+    "security_master.duckdb",
+    table="securities",
+    keys=["security_id"],
+    values=["ticker", "mic", "isin", "description",
+            "sub_industry", "country", "currency", "country_risk"]
+) as db:
+    df = pd.read_csv("SecurityMaster_20251201.csv")
+    db.sync("2025-12-01", df)
+```
+## Installation
+```bash
+pip install scduck
+# With pandas/polars support (quoted so shells such as zsh do not glob the brackets)
+pip install "scduck[all]"
+```
+## Sync Logic
+See [SYNC_LOGIC.md](SYNC_LOGIC.md) for detailed operation cases.

scduck-0.1.0/SYNC_LOGIC.md ADDED Viewed

@@ -0,0 +1,114 @@
+# Sync Logic
+This document describes how `SCDTable.sync()` handles each case when syncing a snapshot.
+## Model
+- `valid_from`: inclusive (>=), the date this record became effective
+- `valid_to`: exclusive (<), NULL means current/forever
+- Out-of-order sync supported
+- No separate presence tracking table needed
+## Sync Operations
+When `sync(date, df)` is called, each record falls into one of these cases:
+---
+### Record IS in incoming snapshot
+#### Case 1: Covered, same data
+**Condition**: Record exists where `valid_from <= date` AND (`valid_to > date` OR `valid_to IS NULL`), data identical
+**Operation**: No change
+---
+#### Case 2: Covered, different data
+**Condition**: Record covers date, data differs
+**Operation**:
+1. Close old record: `valid_to = date`
+2. Insert new record: `valid_from = date`, `valid_to = old_record.valid_to` (the old record's original `valid_to`, as it was before step 1 overwrote it)
+---
+#### Case 3: Not covered, has NEXT record (valid_from > date)
+**3a. Same data**:
+- Extend next record backwards: `valid_from = date`
+**3b. Different data**:
+- Insert new record: `valid_from = date`, `valid_to = next.valid_from`
+---
+#### Case 4: Reappearance (not covered, no next, has prev record)
+**Operation**: Insert new record with:
+- `valid_from = date`
+- `valid_to` = the earliest synced date after `date` on which this record has no coverage, or `NULL` if no such date exists
+---
+#### Case 5: New record (no existing SCD records)
+**Operation**: Insert record with:
+- `valid_from = date`
+- `valid_to` = the earliest synced date after `date` on which this record has no coverage, or `NULL` if no such date exists
+---
+### Record NOT in incoming snapshot
+#### Case 6: Has covering record
+**Operation**:
+1. Close record: `valid_to = date`
+2. If there are synced dates after this date that were covered by the original record, re-open from the earliest such date
+---
+#### Case 7: No covering record
+**Operation**: No change
+---
+## Key Insight: Inferring Presence
+We don't need a separate presence tracking table because:
+1. **sync_metadata** tells us which dates have been synced
+2. **SCD records** tell us coverage ranges
+When a record covers a synced date and wasn't closed at that date, the record must have been present when that date was synced.
+Example:
+- Record (Dec 1, NULL) exists
+- sync_metadata shows Dec 5 was synced
+- If Dec 5 was synced and record still covers it, record was present at Dec 5
+- If record was absent at Dec 5, it would have been closed at Dec 5
+This allows correct handling of deletions during out-of-order sync without storing per-record presence.
+---
+## Example: Out-of-order with deletion
+```
+Sync Dec 17: Record X present
+  → Case 5: INSERT (Dec 17, NULL)
+Sync Dec 1: Record X present, same data
+  → Case 3a: Extend backwards (Dec 1, NULL)
+Sync Dec 5: Record X present
+  → Case 1: Covered, unchanged
+Sync Dec 3: Record X NOT present
+  → Case 6: Close at Dec 3: (Dec 1, Dec 3)
+  → Re-open from next synced date (Dec 5): INSERT (Dec 5, NULL)
+Result: (Dec 1, Dec 3), (Dec 5, NULL) ✓
+```

scduck-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,46 @@
+[project]
+name = "scduck"
+version = "0.1.0"
+description = "SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data."
+readme = "README.md"
+requires-python = ">=3.10"
+license = {text = "MIT"}
+authors = [
+    {name = "Larry Chen", email = "lchen@wolferesearch.com"}
+]
+keywords = ["scd", "duckdb", "temporal", "history", "data-warehouse", "slowly-changing-dimension"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Database",
+]
+dependencies = [
+    "duckdb>=0.9.0",
+    "pyarrow>=14.0.0",
+]
+[project.optional-dependencies]
+pandas = ["pandas>=2.0.0"]
+polars = ["polars>=0.19.0"]
+all = ["pandas>=2.0.0", "polars>=0.19.0"]
+dev = ["pytest>=7.0.0", "pandas>=2.0.0", "polars>=0.19.0"]
+[project.urls]
+Homepage = "https://github.com/wolferesearch/scduck"
+Repository = "https://github.com/wolferesearch/scduck"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["scduck"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]

scduck-0.1.0/scduck/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+scduck - SCD Type 2 tables with DuckDB.
+Track historical changes to slowly-changing data without storing redundant snapshots.
+"""
+from .table import DataFrameLike, SCDTable, SyncResult
+__all__ = ["SCDTable", "SyncResult", "DataFrameLike"]
+__version__ = "0.1.0"