scduck 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v4
16
+
17
+ - name: Set up Python
18
+ run: uv python install 3.12
19
+
20
+ - name: Install dependencies
21
+ run: uv sync --extra dev
22
+
23
+ - name: Run tests
24
+ run: uv run pytest -v
25
+
26
+ publish:
27
+ needs: test
28
+ runs-on: ubuntu-latest
29
+ environment: pypi
30
+ permissions:
31
+ id-token: write # Required for trusted publishing
32
+
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+
36
+ - name: Install uv
37
+ uses: astral-sh/setup-uv@v4
38
+
39
+ - name: Set up Python
40
+ run: uv python install 3.12
41
+
42
+ - name: Build package
43
+ run: uv build
44
+
45
+ - name: Publish to PyPI
46
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,29 @@
1
+ name: Test
2
+
3
+ on:
4
+ push:
5
+ branches: [master, main]
6
+ pull_request:
7
+ branches: [master, main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v4
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ run: uv python install ${{ matrix.python-version }}
24
+
25
+ - name: Install dependencies
26
+ run: uv sync --extra dev
27
+
28
+ - name: Run tests
29
+ run: uv run pytest -v
@@ -0,0 +1,30 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+ build/
6
+ dist/
7
+ *.egg-info/
8
+ .eggs/
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+ .env
14
+
15
+ # DuckDB
16
+ *.duckdb
17
+ *.duckdb.wal
18
+
19
+ # IDE
20
+ .idea/
21
+ .vscode/
22
+ *.swp
23
+
24
+ # Testing
25
+ .pytest_cache/
26
+ .coverage
27
+ htmlcov/
28
+
29
+ # OS
30
+ .DS_Store
@@ -0,0 +1 @@
1
+ 3.12
scduck-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: scduck
3
+ Version: 0.1.0
4
+ Summary: SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data.
5
+ Project-URL: Homepage, https://github.com/wolferesearch/scduck
6
+ Project-URL: Repository, https://github.com/wolferesearch/scduck
7
+ Author-email: Larry Chen <lchen@wolferesearch.com>
8
+ License: MIT
9
+ Keywords: data-warehouse,duckdb,history,scd,slowly-changing-dimension,temporal
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Database
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: duckdb>=0.9.0
20
+ Requires-Dist: pyarrow>=14.0.0
21
+ Provides-Extra: all
22
+ Requires-Dist: pandas>=2.0.0; extra == 'all'
23
+ Requires-Dist: polars>=0.19.0; extra == 'all'
24
+ Provides-Extra: dev
25
+ Requires-Dist: pandas>=2.0.0; extra == 'dev'
26
+ Requires-Dist: polars>=0.19.0; extra == 'dev'
27
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
28
+ Provides-Extra: pandas
29
+ Requires-Dist: pandas>=2.0.0; extra == 'pandas'
30
+ Provides-Extra: polars
31
+ Requires-Dist: polars>=0.19.0; extra == 'polars'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # scduck
35
+
36
+ SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data without storing redundant snapshots.
37
+
38
+ **13 days of data: 65 MB CSV -> 6.3 MB DuckDB (~10x compression)**
39
+
40
+ ## How it works
41
+
42
+ Records are stored with `valid_from` / `valid_to` date ranges. When data doesn't change, no new rows are written. Only changes generate new records.
43
+
44
+ ```
45
+ id | name | price | valid_from | valid_to
46
+ P001 | Widget| 9.99 | 2025-01-01 | 2025-03-15 # original price
47
+ P001 | Widget| 12.99 | 2025-03-15 | NULL # price changed
48
+ P002 | Gadget| 4.99 | 2025-01-01 | NULL # unchanged
49
+ ```
50
+
51
+ - `valid_from`: inclusive (>=)
52
+ - `valid_to`: exclusive (<), NULL = current
53
+
54
+ ## Usage
55
+
56
+ ```python
57
+ from scduck import SCDTable
58
+
59
+ # Define your schema
60
+ with SCDTable(
61
+ "products.duckdb",
62
+ table="products",
63
+ keys=["product_id"],
64
+ values=["name", "price", "category"]
65
+ ) as db:
66
+ # Sync daily snapshots (pandas, polars, or pyarrow)
67
+ result = db.sync("2025-01-01", df_jan1) # returns SyncResult
68
+ db.sync("2025-01-02", df_jan2)
69
+
70
+ # Reconstruct any historical snapshot
71
+ snapshot = db.get_data("2025-01-01") # returns pyarrow Table
72
+
73
+ # Check synced dates
74
+ db.get_synced_dates() # ['2025-01-01', '2025-01-02']
75
+ ```
76
+
77
+ ### Out-of-order sync
78
+
79
+ Dates can be synced in any order:
80
+
81
+ ```python
82
+ db.sync("2025-01-15", df) # sync Jan 15 first
83
+ db.sync("2025-01-01", df) # backfill Jan 1
84
+ db.get_data("2025-01-01") # returns correct snapshot
85
+ ```
86
+
87
+ ## Example: SecurityMaster
88
+
89
+ ```python
90
+ import pandas as pd
91
+ from scduck import SCDTable
92
+
93
+ with SCDTable(
94
+ "security_master.duckdb",
95
+ table="securities",
96
+ keys=["security_id"],
97
+ values=["ticker", "mic", "isin", "description",
98
+ "sub_industry", "country", "currency", "country_risk"]
99
+ ) as db:
100
+ df = pd.read_csv("SecurityMaster_20251201.csv")
101
+ db.sync("2025-12-01", df)
102
+ ```
103
+
104
+ ## Installation
105
+
106
+ ```bash
107
+ pip install scduck
108
+
109
+ # With pandas/polars support
110
+ pip install "scduck[all]"
111
+ ```
112
+
113
+ ## Sync Logic
114
+
115
+ See [SYNC_LOGIC.md](https://github.com/wolferesearch/scduck/blob/HEAD/SYNC_LOGIC.md) for detailed operation cases.
scduck-0.1.0/README.md ADDED
@@ -0,0 +1,82 @@
1
+ # scduck
2
+
3
+ SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data without storing redundant snapshots.
4
+
5
+ **13 days of data: 65 MB CSV -> 6.3 MB DuckDB (~10x compression)**
6
+
7
+ ## How it works
8
+
9
+ Records are stored with `valid_from` / `valid_to` date ranges. When data doesn't change, no new rows are written. Only changes generate new records.
10
+
11
+ ```
12
+ id | name | price | valid_from | valid_to
13
+ P001 | Widget| 9.99 | 2025-01-01 | 2025-03-15 # original price
14
+ P001 | Widget| 12.99 | 2025-03-15 | NULL # price changed
15
+ P002 | Gadget| 4.99 | 2025-01-01 | NULL # unchanged
16
+ ```
17
+
18
+ - `valid_from`: inclusive (>=)
19
+ - `valid_to`: exclusive (<), NULL = current
20
+
21
+ ## Usage
22
+
23
+ ```python
24
+ from scduck import SCDTable
25
+
26
+ # Define your schema
27
+ with SCDTable(
28
+ "products.duckdb",
29
+ table="products",
30
+ keys=["product_id"],
31
+ values=["name", "price", "category"]
32
+ ) as db:
33
+ # Sync daily snapshots (pandas, polars, or pyarrow)
34
+ result = db.sync("2025-01-01", df_jan1) # returns SyncResult
35
+ db.sync("2025-01-02", df_jan2)
36
+
37
+ # Reconstruct any historical snapshot
38
+ snapshot = db.get_data("2025-01-01") # returns pyarrow Table
39
+
40
+ # Check synced dates
41
+ db.get_synced_dates() # ['2025-01-01', '2025-01-02']
42
+ ```
43
+
44
+ ### Out-of-order sync
45
+
46
+ Dates can be synced in any order:
47
+
48
+ ```python
49
+ db.sync("2025-01-15", df) # sync Jan 15 first
50
+ db.sync("2025-01-01", df) # backfill Jan 1
51
+ db.get_data("2025-01-01") # returns correct snapshot
52
+ ```
53
+
54
+ ## Example: SecurityMaster
55
+
56
+ ```python
57
+ import pandas as pd
58
+ from scduck import SCDTable
59
+
60
+ with SCDTable(
61
+ "security_master.duckdb",
62
+ table="securities",
63
+ keys=["security_id"],
64
+ values=["ticker", "mic", "isin", "description",
65
+ "sub_industry", "country", "currency", "country_risk"]
66
+ ) as db:
67
+ df = pd.read_csv("SecurityMaster_20251201.csv")
68
+ db.sync("2025-12-01", df)
69
+ ```
70
+
71
+ ## Installation
72
+
73
+ ```bash
74
+ pip install scduck
75
+
76
+ # With pandas/polars support
77
+ pip install "scduck[all]"
78
+ ```
79
+
80
+ ## Sync Logic
81
+
82
+ See [SYNC_LOGIC.md](https://github.com/wolferesearch/scduck/blob/HEAD/SYNC_LOGIC.md) for detailed operation cases.
@@ -0,0 +1,114 @@
1
+ # Sync Logic
2
+
3
+ This document describes how `SCDTable.sync()` handles each case when syncing a snapshot.
4
+
5
+ ## Model
6
+
7
+ - `valid_from`: inclusive (>=), the date this record became effective
8
+ - `valid_to`: exclusive (<), NULL means current/forever
9
+ - Out-of-order sync supported
10
+ - No separate presence tracking table needed
11
+
12
+ ## Sync Operations
13
+
14
+ When `sync(date, df)` is called, each record falls into one of these cases:
15
+
16
+ ---
17
+
18
+ ### Record IS in incoming snapshot
19
+
20
+ #### Case 1: Covered, same data
21
+ **Condition**: Record exists where `valid_from <= date` AND (`valid_to > date` OR `valid_to IS NULL`), data identical
22
+
23
+ **Operation**: No change
24
+
25
+ ---
26
+
27
+ #### Case 2: Covered, different data
28
+ **Condition**: Record covers date, data differs
29
+
30
+ **Operation**:
31
+ 1. Close old record: `valid_to = date`
32
+ 2. Insert new record: `valid_from = date`, `valid_to = old_record.valid_to` (the old record's `valid_to` as it was before step 1 closed it)
33
+
34
+ ---
35
+
36
+ #### Case 3: Not covered, has NEXT record (valid_from > date)
37
+
38
+ **3a. Same data**:
39
+ - Extend next record backwards: `valid_from = date`
40
+
41
+ **3b. Different data**:
42
+ - Insert new record: `valid_from = date`, `valid_to = next.valid_from`
43
+
44
+ ---
45
+
46
+ #### Case 4: Reappearance (not covered, no next, has prev record)
47
+
48
+ **Operation**: Insert new record with:
49
+ - `valid_from = date`
50
+ - `valid_to` = the earliest synced date after this date on which the record has no coverage, or `NULL` if there is no such date
51
+
52
+ ---
53
+
54
+ #### Case 5: New record (no existing SCD records)
55
+
56
+ **Operation**: Insert record with:
57
+ - `valid_from = date`
58
+ - `valid_to` = the earliest synced date after this date on which the record has no coverage, or `NULL` if there is no such date
59
+
60
+ ---
61
+
62
+ ### Record NOT in incoming snapshot
63
+
64
+ #### Case 6: Has covering record
65
+
66
+ **Operation**:
67
+ 1. Close record: `valid_to = date`
68
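+ 2. If there are synced dates after this date that were covered by the original record, re-open the record from the earliest such date by inserting a new record whose `valid_from` is that date (see the out-of-order example below)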
69
+
70
+ ---
71
+
72
+ #### Case 7: No covering record
73
+
74
+ **Operation**: No change
75
+
76
+ ---
77
+
78
+ ## Key Insight: Inferring Presence
79
+
80
+ We don't need a separate presence tracking table because:
81
+
82
+ 1. **sync_metadata** tells us which dates have been synced
83
+ 2. **SCD records** tell us coverage ranges
84
+
85
+ When a record covers a synced date and wasn't closed at that date, the record must have been present when that date was synced.
86
+
87
+ Example:
88
+ - Record (Dec 1, NULL) exists
89
+ - sync_metadata shows Dec 5 was synced
90
+ - If Dec 5 was synced and record still covers it, record was present at Dec 5
91
+ - If record was absent at Dec 5, it would have been closed at Dec 5
92
+
93
+ This allows correct handling of deletions during out-of-order sync without storing per-record presence.
94
+
95
+ ---
96
+
97
+ ## Example: Out-of-order with deletion
98
+
99
+ ```
100
+ Sync Dec 17: Record X present
101
+ → Case 5: INSERT (Dec 17, NULL)
102
+
103
+ Sync Dec 1: Record X present, same data
104
+ → Case 3a: Extend backwards (Dec 1, NULL)
105
+
106
+ Sync Dec 5: Record X present
107
+ → Case 1: Covered, unchanged
108
+
109
+ Sync Dec 3: Record X NOT present
110
+ → Case 6: Close at Dec 3: (Dec 1, Dec 3)
111
+ → Re-open from next synced date (Dec 5): INSERT (Dec 5, NULL)
112
+
113
+ Result: (Dec 1, Dec 3), (Dec 5, NULL) ✓
114
+ ```
@@ -0,0 +1,46 @@
1
+ [project]
2
+ name = "scduck"
3
+ version = "0.1.0"
4
+ description = "SCD Type 2 tables with DuckDB. Track historical changes to slowly-changing data."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "Larry Chen", email = "lchen@wolferesearch.com"}
10
+ ]
11
+ keywords = ["scd", "duckdb", "temporal", "history", "data-warehouse", "slowly-changing-dimension"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Database",
21
+ ]
22
+ dependencies = [
23
+ "duckdb>=0.9.0",
24
+ "pyarrow>=14.0.0",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ pandas = ["pandas>=2.0.0"]
29
+ polars = ["polars>=0.19.0"]
30
+ all = ["pandas>=2.0.0", "polars>=0.19.0"]
31
+ dev = ["pytest>=7.0.0", "pandas>=2.0.0", "polars>=0.19.0"]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/wolferesearch/scduck"
35
+ Repository = "https://github.com/wolferesearch/scduck"
36
+
37
+ [build-system]
38
+ requires = ["hatchling"]
39
+ build-backend = "hatchling.build"
40
+
41
+ [tool.hatch.build.targets.wheel]
42
+ packages = ["scduck"]
43
+
44
+ [tool.pytest.ini_options]
45
+ testpaths = ["tests"]
46
+ pythonpath = ["."]
@@ -0,0 +1,10 @@
1
+ """
2
+ scduck - SCD Type 2 tables with DuckDB.
3
+
4
+ Track historical changes to slowly-changing data without storing redundant snapshots.
5
+ """
6
+
7
+ from .table import DataFrameLike, SCDTable, SyncResult
8
+
9
+ __all__ = ["SCDTable", "SyncResult", "DataFrameLike"]
10
+ __version__ = "0.1.0"